22 years ago · 68963f34ab
--- a/Changes
+++ b/Changes
@@ -0,0 +1,27 @@
 
																+Change file for Text::WagnerFischer
															
 
																+Dree Mistrut <dree@friul.it>
															
 
																+
															
 
																+Version 0.0.4 : 2003/04/13
															
 
																+
															
 
																+   Fixed a typo in the doc
															
 
																+   Modified Makefile.PL
															
 
																+   (Thanks to Daniel Yacob for both)
															
 
																+
															
 
																+
															
 
																+Version 0.0.3 : 2003/04/12
															
 
																+
															
 
																+   Changed $REFC scope to permit default cost redefinition
															
 
																+   Added the "EXTENDING" section in the documentation
															
 
																+   (Thanks to Daniel Yacob for both)
															
 
																+   Modified test suite
															
 
																+
															
 
																+
															
 
																+Version 0.0.2 : 2002/08/16
															
 
																+
															
 
																+   Fixed matrix initialization (thanks to H. Zha to point me this)
															
 
																+
															
 
																+
															
 
																+Version 0.0.1 : 2002/07/07
															
 
																+
															
 
																+   No changes -- initial release
															
 
																+
															
--- a/MANIFEST
+++ b/MANIFEST
@@ -0,0 +1,6 @@
 
																+Changes
															
 
																+Makefile.PL
															
 
																+README
															
 
																+WagnerFischer.pm
															
 
																+test.pl
															
 
																+MANIFEST
															
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -0,0 +1,8 @@
 
																+use ExtUtils::MakeMaker;
															
 
																+# See lib/ExtUtils/MakeMaker.pm for details of how to influence
															
 
																+# the contents of the Makefile that is written.
															
 
																+WriteMakefile(
															
 
																+	      NAME		=> 'Text::WagnerFischer',
															
 
																+	      VERSION_FROM 	=> 'WagnerFischer.pm',
															
 
																+	      LINKTYPE		=> '$(INST_PM)',
															
 
																+	     );
															
--- a/README
+++ b/README
@@ -0,0 +1,48 @@
 
																+
															
 
																+Text::WagnerFischer is an implementation of the Wagner-Fischer edit distance in Perl.
															
 
																+
															
 
																+
															
 
																+PREREQUISITES
															
 
																+
															
 
																+This suite requires Perl 5; I tested it only under Perl 5.6.
															
 
																+
															
 
																+Text::WagnerFischer does not use any nonstandard modules.
															
 
																+
															
 
																+
															
 
																+INSTALLATION
															
 
																+
															
 
																+You install Text::WagnerFischer by running these commands in the *nix environment:
															
 
																+
															
 
																+   perl Makefile.PL
															
 
																+   make
															
 
																+   make test (optional)
															
 
																+   make install
															
 
																+
															
 
																+To install Text::WagnerFischer in the Win32 environment, use nmake instead of make.
															
 
																+nmake is available for free (in a self extracting executable):
															
 
																+<http://download.microsoft.com/download/vc15/Patch/1.52/W95/EN-US/Nmake15.exe>
															
 
																+After download and inflate, put nmake.exe and nmake.err somewhere in your path.
															
 
																+
															
 
																+
															
 
																+DOCUMENTATION
															
 
																+
															
 
																+POD format documentation is included in WagnerFischer.pm.
															
 
																+POD is readable with the command:
															
 
																+
															
 
																+  perldoc Text::WagnerFischer
															
 
																+
															
 
																+
															
 
																+AVAILABILITY
															
 
																+
															
 
																+The latest version of Text::WagnerFischer is available from the
															
 
																+CPAN <http://search.cpan.org/> 
															
 
																+
															
 
																+
															
 
																+COPYRIGHT
															
 
																+
															
 
																+Copyright 2002,2003 Dree Mistrut <dree@friul.it>
															
 
																+
															
 
																+This package is free software and is provided "as is" without express
															
 
																+or implied warranty.  You can redistribute it and/or modify it under 
															
 
																+the same terms as Perl itself.
															
 
																+
															
--- a/WagnerFischer.pm
+++ b/WagnerFischer.pm
@@ -0,0 +1,245 @@
 
																+package Text::WagnerFischer;
															
 
																+
															
 
																+use strict;
															
 
																+use Exporter;
															
 
																+use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $REFC);
															
 
																+
															
 
																+$VERSION     = '0.04';
															
 
																+@ISA         = qw(Exporter);
															
 
																+@EXPORT      = ();
															
 
																+@EXPORT_OK   = qw(&distance);
															
 
																+%EXPORT_TAGS = ();
															
 
																+
															
 
																+$REFC=[0,1,1];
															
 
																+
															
 
																+sub _min {
															
 
																+
															
 
																+	my ($first,$second,$third)=@_;
															
 
																+	my $result=$first;
															
 
																+
															
 
																+	$result=$second if ($second < $result);
															
 
																+	$result=$third if ($third < $result);
															
 
																+
															
 
																+	return $result
															
 
																+}
															
 
																+
															
 
																+sub _weight {
															
 
																+
															
 
																+	#the cost function
															
 
																+
															
 
																+	my ($x,$y,$refc)=@_;
															
 
																+
															
 
																+	if ($x eq $y) {
															
 
																+
															
 
																+		return $refc->[0] #cost for letter match
															
 
																+
															
 
																+	} elsif (($x eq '-') or ($y eq '-')) {
															
 
																+
															
 
																+		return $refc->[1] #cost for insertion/deletion operation
															
 
																+
															
 
																+	} else {
															
 
																+
															
 
																+		return $refc->[2] #cost for letter mismatch
															
 
																+	}
															
 
																+}
															
 
																+
															
 
																+sub distance {
															
 
																+
															
 
																+	my ($refc,$s,@t)=@_;
															
 
																+
															
 
																+	if (!@t) {
															
 
																+
															
 
																+		if (ref($refc) ne "ARRAY") {
															
 
																+
															
 
																+			if (ref($s) ne "ARRAY") {
															
 
																+
															
 
																+				#array cost missing: using default [0,1,1]
															
 
																+
															
 
																+				$t[0]=$s;
															
 
																+				$s=$refc;
															
 
																+				$refc=$REFC;
															
 
																+
															
 
																+			} else {
															
 
																+
															
 
																+	           		require Carp;
															
 
																+        	      		Carp::croak("Text::WagnerFischer: second string is needed");
															
 
																+			}
															
 
																+
															
 
																+		} else {
															
 
																+
															
 
																+           		require Carp;
															
 
																+       	      		Carp::croak("Text::WagnerFischer: second string is needed");
															
 
																+		}
															
 
																+
															
 
																+	} elsif (ref($refc) ne "ARRAY") {
															
 
																+
															
 
																+		#array cost missing: using default [0,1,1]
															
 
																+
															
 
																+		unshift @t,$s;
															
 
																+		$s=$refc;
															
 
																+		$refc=$REFC;
															
 
																+	}
															
 
																+
															
 
																+	my $n=length($s);
															
 
																+	my @result;
															
 
																+
															
 
																+	foreach my $t (@t) {
															
 
																+
															
 
																+		my @d;
															
 
																+
															
 
																+		my $m=length($t);
															
 
																+		if(!$n) {push @result,$m*$refc->[1];next}
															
 
																+		if(!$m) {push @result,$n*$refc->[1];next}
															
 
																+
															
 
																+		$d[0][0]=0;
															
 
																+
															
 
																+		# original algorithm should be:
															
 
																+		# foreach my $i (1 .. $n) {
															
 
																+		#
															
 
																+		#	my $dist_tmp=0;
															
 
																+		#	foreach my $k (1 .. $i) {$dist_tmp+=_weight(substr($s,$i,1),'-',$refc)}
															
 
																+		#	$d[$i][0]=$dist_tmp;
															
 
																+		# }
															
 
																+		#
															
 
																+		# foreach my $j (1 .. $m) {
															
 
																+		#
															
 
																+		#	my $dist_tmp=0;
															
 
																+		#	foreach my $k (1 .. $j) {$dist_tmp+=_weight('-',substr($t,$j,1),$refc)}
															
 
																+		#	$d[0][$j]=$dist_tmp;
															
 
																+		# }
															
 
																+		# that is:
															
 
																+
															
 
																+		foreach my $i (1 .. $n) {$d[$i][0]=$i*$refc->[1];}
															
 
																+		foreach my $j (1 .. $m) {$d[0][$j]=$j*$refc->[1];}
															
 
																+
															
 
																+		foreach my $i (1 .. $n) {
															
 
																+			my $s_i=substr($s,$i-1,1);
															
 
																+			foreach my $j (1 .. $m) {
															
 
																+
															
 
																+				my $t_i=substr($t,$j-1,1);
															
 
																+
															
 
																+				$d[$i][$j]=_min($d[$i-1][$j]+_weight($s_i,'-',$refc),
															
 
																+						 $d[$i][$j-1]+_weight('-',$t_i,$refc),
															
 
																+						 $d[$i-1][$j-1]+_weight($s_i,$t_i,$refc))
															
 
																+			}
															
 
																+		}
															
 
																+
															
 
																+		push @result,$d[$n][$m];
															
 
																+	}
															
 
																+
															
 
																+	if (wantarray) {return @result} else {return $result[0]}
															
 
																+}
															
 
																+	
															
 
																+1;
															
 
																+
															
 
																+__END__
															
 
																+
															
 
																+=head1 NAME
															
 
																+
															
 
																+Text::WagnerFischer - An implementation of the Wagner-Fischer edit distance
															
 
																+
															
 
																+=head1 SYNOPSIS
															
 
																+
															
 
																+
															
 
																+ use Text::WagnerFischer qw(distance);
															
 
																+
															
 
																+ print distance("foo","four");# prints "2"
															
 
																+
															
 
																+ print distance([0,1,2],"foo","four");# prints "3"
															
 
																+
															
 
																+
															
 
																+ my @words=("four","foo","bar");
															
 
																+
															
 
																+ my @distances=distance("foo",@words); 
															
 
																+ print "@distances"; # prints "2 0 3"
															
 
																+
															
 
																+ @distances=distance([0,2,1],"foo",@words); 
															
 
																+ print "@distances"; # prints "3 0 3"
															
 
																+
															
 
																+ 
															
 
																+
															
 
																+=head1 DESCRIPTION
															
 
																+
															
 
																+This module implements the Wagner-Fischer dynamic programming technique,
															
 
																+used here to calculate the edit distance of two strings.
															
 
																+The edit distance is a measure of the degree of proximity between two strings,
															
 
																+based on "edits": the operations of substitutions, deletions or insertions
															
 
																+needed to transform the string into the other one (and vice versa).
															
 
																+A cost (weight) is needed for every of the operation defined above:
															
 
																+
															
 
																+	    / a if x=y (cost for letter match)
															
 
																+ w(x,y) =  |  b if x=- or y=- (cost for insertion/deletion operation)
															
 
																+	    \ c if x!=y (cost for letter mismatch)
															
 
																+
															
 
																+These costs are given through an array reference as first argument of the 
															
 
																+distance subroutine: [a,b,c].
															
 
																+If the costs are not given, a default array cost is used: [0,1,1] that is the
															
 
																+case of the Levenshtein edit distance:
															
 
																+
															
 
																+	    / 0 if x=y (cost for letter match)
															
 
																+ w(x,y) =  |  1 if x=- or y=- (cost for insertion/deletion operation)
															
 
																+	    \ 1 if x!=y (cost for letter mismatch)
															
 
																+
															
 
																+This particular distance is the exact number of edit needed to transform 
															
 
																+the string into the other one (and vice versa).
															
 
																+When two strings have distance 0, they are the same.
															
 
																+Note that the distance is calculated to reach the _minimum_ cost, i.e.
															
 
																+choosing the most economic operation for each edit.
															
 
																+ 
															
 
																+
															
 
																+=head1 EXTENDING (by Daniel Yacob)
															
 
																+
															
 
																+New modules may build upon Text::WagnerFischer as a base class.
															
 
																+This is practical when you would like to apply the algorithm
															
 
																+to non-Roman character sets or would like to change some part
															
 
																+of the algorithm but not another.
															
 
																+
															
 
																+The following example demonstrates how to use the WagnerFisher
															
 
																+distance algorithm but apply your own weight function in a new
															
 
																+package:
															
 
																+
															
 
																+  package Text::WagnerFischer::MyModule;
															
 
																+  use base qw( Text::WagnerFischer );
															
 
																+
															
 
																+  #
															
 
																+  # Link to the WagnerFisher "distance" function so that the
															
 
																+  # new module may also export it:
															
 
																+  #
															
 
																+  use vars qw(@EXPORT_OK);
															
 
																+
															
 
																+  @EXPORT_OK = qw(&distance);
															
 
																+
															
 
																+  *distance = \&Text::WagnerFischer::distance;
															
 
																+
															
 
																+  #
															
 
																+  # "override" the _weight function with the a one:
															
 
																+  #
															
 
																+  *Text::WagnerFischer::_weight = \&_my_weight;
															
 
																+
															
 
																+  #
															
 
																+  # "override" the default WagnerFischer "costs" table:
															
 
																+  #
															
 
																+  $Text::WagnerFischer::REFC = [0,2,3,1,1];
															
 
																+
															
 
																+  sub _my_weight {
															
 
																+    :
															
 
																+    :
															
 
																+    :
															
 
																+  }
															
 
																+
															
 
																+=head1 AUTHOR
															
 
																+
															
 
																+Copyright 2002,2003 Dree Mistrut <F<dree@friul.it>>
															
 
																+
															
 
																+This package is free software and is provided "as is" without express
															
 
																+or implied warranty. You can redistribute it and/or modify it under 
															
 
																+the same terms as Perl itself.
															
 
																+
															
 
																+
															
 
																+=head1 SEE ALSO
															
 
																+
															
 
																+C<Text::Levenshtein>, C<Text::PhraseDistance>
															
 
																+
															
 
																+
															
 
																+=cut
															
 
																+
															
--- a/test.pl
+++ b/test.pl
@@ -0,0 +1,73 @@
 
																+use strict;
															
 
																+use Text::WagnerFischer qw(distance);
															
 
																+
															
 
																+my $ko=0;
															
 
																+my $test=1;
															
 
																+my $first_distance=distance("foo","four");
															
 
																+
															
 
																+if ($first_distance == 2) {
															
 
																+
															
 
																+	print $test.". ok\n"
															
 
																+
															
 
																+} else {
															
 
																+
															
 
																+	print $test.". NO <--\n";
															
 
																+	$ko=1;
															
 
																+}
															
 
																+
															
 
																+my $second_distance=distance("foo","foo");
															
 
																+$test++;
															
 
																+
															
 
																+if ($second_distance == 0) {
															
 
																+
															
 
																+	print $test.". ok\n"
															
 
																+
															
 
																+} else {
															
 
																+
															
 
																+	print $test.". NO <--\n";
															
 
																+	$ko=1;
															
 
																+}
															
 
																+
															
 
																+my $third_distance=distance([0,1,2],"foo","four");
															
 
																+$test++;
															
 
																+
															
 
																+if ($third_distance == 3) {
															
 
																+
															
 
																+	print $test.". ok\n"
															
 
																+
															
 
																+} else {
															
 
																+
															
 
																+	print $test.". NO <--\n";
															
 
																+	$ko=1;
															
 
																+}
															
 
																+
															
 
																+my @words=("four","foo","bar");
															
 
																+
															
 
																+my @distances=distance("foo",@words);
															
 
																+$test++;
															
 
																+
															
 
																+if (($distances[0] == 2) and ($distances[1] == 0) and ($distances[2] == 3)) {
															
 
																+
															
 
																+	print $test.". ok\n"
															
 
																+
															
 
																+} else {
															
 
																+
															
 
																+	print $test.". NO <--\n";
															
 
																+	$ko=1;
															
 
																+}
															
 
																+
															
 
																+@distances=distance([0,5,3],"foo",@words);
															
 
																+$test++;
															
 
																+
															
 
																+if (($distances[0] == 8) and ($distances[1] == 0) and ($distances[2] == 9)) {
															
 
																+
															
 
																+	print $test.". ok\n"
															
 
																+
															
 
																+} else {
															
 
																+
															
 
																+	print $test.". NO <--\n";
															
 
																+	$ko=1;
															
 
																+}
															
 
																+
															
 
																+
															
 
																+if ($ko) {print "\nTest suite failed\n"} else {print "\nTest suite ok\n"}