Browse Source

Import upstream version 0.04

Dree Mistrut 21 years ago
commit
68963f34ab
6 changed files with 407 additions and 0 deletions
  1. 27 0
      Changes
  2. 6 0
      MANIFEST
  3. 8 0
      Makefile.PL
  4. 48 0
      README
  5. 245 0
      WagnerFischer.pm
  6. 73 0
      test.pl

+ 27 - 0
Changes

@@ -0,0 +1,27 @@
+Change file for Text::WagnerFischer
+Dree Mistrut <dree@friul.it>
+
+Version 0.0.4 : 2003/04/13
+
+   Fixed a typo in the doc
+   Modified Makefile.PL
+   (Thanks to Daniel Yacob for both)
+
+
+Version 0.0.3 : 2003/04/12
+
+   Changed $REFC scope to permit default cost redefinition
+   Added the "EXTENDING" section in the documentation
+   (Thanks to Daniel Yacob for both)
+   Modified test suite
+
+
+Version 0.0.2 : 2002/08/16
+
+   Fixed matrix initialization (thanks to H. Zha to point me this)
+
+
+Version 0.0.1 : 2002/07/07
+
+   No changes -- initial release
+

+ 6 - 0
MANIFEST

@@ -0,0 +1,6 @@
+Changes
+Makefile.PL
+README
+WagnerFischer.pm
+test.pl
+MANIFEST

+ 8 - 0
Makefile.PL

@@ -0,0 +1,8 @@
+use ExtUtils::MakeMaker;
+# See lib/ExtUtils/MakeMaker.pm for details of how to influence
+# the contents of the Makefile that is written.
+WriteMakefile(
+	      NAME		=> 'Text::WagnerFischer',
+	      VERSION_FROM 	=> 'WagnerFischer.pm',
+	      LINKTYPE		=> '$(INST_PM)',
+	     );

+ 48 - 0
README

@@ -0,0 +1,48 @@
+
+Text::WagnerFischer is an implementation of the Wagner-Fischer edit distance in Perl.
+
+
+PREREQUISITES
+
+This suite requires Perl 5; I tested it only under Perl 5.6.
+
+Text::WagnerFischer does not use any nonstandard modules.
+
+
+INSTALLATION
+
+You install Text::WagnerFischer by running these commands in the *nix environment:
+
+   perl Makefile.PL
+   make
+   make test (optional)
+   make install
+
+To install Text::WagnerFischer in the Win32 environment, use nmake instead of make.
+nmake is available for free (in a self extracting executable):
+<http://download.microsoft.com/download/vc15/Patch/1.52/W95/EN-US/Nmake15.exe>
+After download and inflate, put nmake.exe and nmake.err somewhere in your path.
+
+
+DOCUMENTATION
+
+POD format documentation is included in WagnerFischer.pm.
+POD is readable with the command:
+
+  perldoc Text::WagnerFischer
+
+
+AVAILABILITY
+
+The latest version of Text::WagnerFischer is available from the
+CPAN <http://search.cpan.org/> 
+
+
+COPYRIGHT
+
+Copyright 2002,2003 Dree Mistrut <dree@friul.it>
+
+This package is free software and is provided "as is" without express
+or implied warranty.  You can redistribute it and/or modify it under 
+the same terms as Perl itself.
+

+ 245 - 0
WagnerFischer.pm

@@ -0,0 +1,245 @@
+package Text::WagnerFischer;
+
+use strict;
+use Exporter;
+use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $REFC);
+
+$VERSION     = '0.04';
+@ISA         = qw(Exporter);
+@EXPORT      = ();
+@EXPORT_OK   = qw(&distance);
+%EXPORT_TAGS = ();
+
+$REFC=[0,1,1];
+
+sub _min {
+
+	my ($first,$second,$third)=@_;
+	my $result=$first;
+
+	$result=$second if ($second < $result);
+	$result=$third if ($third < $result);
+
+	return $result
+}
+
+sub _weight {
+
+	#the cost function
+
+	my ($x,$y,$refc)=@_;
+
+	if ($x eq $y) {
+
+		return $refc->[0] #cost for letter match
+
+	} elsif (($x eq '-') or ($y eq '-')) {
+
+		return $refc->[1] #cost for insertion/deletion operation
+
+	} else {
+
+		return $refc->[2] #cost for letter mismatch
+	}
+}
+
+sub distance {
+
+	my ($refc,$s,@t)=@_;
+
+	if (!@t) {
+
+		if (ref($refc) ne "ARRAY") {
+
+			if (ref($s) ne "ARRAY") {
+
+				#array cost missing: using default [0,1,1]
+
+				$t[0]=$s;
+				$s=$refc;
+				$refc=$REFC;
+
+			} else {
+
+	           		require Carp;
+        	      		Carp::croak("Text::WagnerFischer: second string is needed");
+			}
+
+		} else {
+
+           		require Carp;
+       	      		Carp::croak("Text::WagnerFischer: second string is needed");
+		}
+
+	} elsif (ref($refc) ne "ARRAY") {
+
+		#array cost missing: using default [0,1,1]
+
+		unshift @t,$s;
+		$s=$refc;
+		$refc=$REFC;
+	}
+
+	my $n=length($s);
+	my @result;
+
+	foreach my $t (@t) {
+
+		my @d;
+
+		my $m=length($t);
+		if(!$n) {push @result,$m*$refc->[1];next}
+		if(!$m) {push @result,$n*$refc->[1];next}
+
+		$d[0][0]=0;
+
+		# original algorithm should be:
+		# foreach my $i (1 .. $n) {
+		#
+		#	my $dist_tmp=0;
+		#	foreach my $k (1 .. $i) {$dist_tmp+=_weight(substr($s,$i,1),'-',$refc)}
+		#	$d[$i][0]=$dist_tmp;
+		# }
+		#
+		# foreach my $j (1 .. $m) {
+		#
+		#	my $dist_tmp=0;
+		#	foreach my $k (1 .. $j) {$dist_tmp+=_weight('-',substr($t,$j,1),$refc)}
+		#	$d[0][$j]=$dist_tmp;
+		# }
+		# that is:
+
+		foreach my $i (1 .. $n) {$d[$i][0]=$i*$refc->[1];}
+		foreach my $j (1 .. $m) {$d[0][$j]=$j*$refc->[1];}
+
+		foreach my $i (1 .. $n) {
+			my $s_i=substr($s,$i-1,1);
+			foreach my $j (1 .. $m) {
+
+				my $t_i=substr($t,$j-1,1);
+
+				$d[$i][$j]=_min($d[$i-1][$j]+_weight($s_i,'-',$refc),
+						 $d[$i][$j-1]+_weight('-',$t_i,$refc),
+						 $d[$i-1][$j-1]+_weight($s_i,$t_i,$refc))
+			}
+		}
+
+		push @result,$d[$n][$m];
+	}
+
+	if (wantarray) {return @result} else {return $result[0]}
+}
+	
+1;
+
+__END__
+
+=head1 NAME
+
+Text::WagnerFischer - An implementation of the Wagner-Fischer edit distance
+
+=head1 SYNOPSIS
+
+
+ use Text::WagnerFischer qw(distance);
+
+ print distance("foo","four");# prints "2"
+
+ print distance([0,1,2],"foo","four");# prints "3"
+
+
+ my @words=("four","foo","bar");
+
+ my @distances=distance("foo",@words); 
+ print "@distances"; # prints "2 0 3"
+
+ @distances=distance([0,2,1],"foo",@words); 
+ print "@distances"; # prints "3 0 3"
+
+ 
+
+=head1 DESCRIPTION
+
+This module implements the Wagner-Fischer dynamic programming technique,
+used here to calculate the edit distance of two strings.
+The edit distance is a measure of the degree of proximity between two strings,
+based on "edits": the operations of substitutions, deletions or insertions
+needed to transform the string into the other one (and vice versa).
+A cost (weight) is needed for every of the operation defined above:
+
+	    / a if x=y (cost for letter match)
+ w(x,y) =  |  b if x=- or y=- (cost for insertion/deletion operation)
+	    \ c if x!=y (cost for letter mismatch)
+
+These costs are given through an array reference as first argument of the 
+distance subroutine: [a,b,c].
+If the costs are not given, a default array cost is used: [0,1,1] that is the
+case of the Levenshtein edit distance:
+
+	    / 0 if x=y (cost for letter match)
+ w(x,y) =  |  1 if x=- or y=- (cost for insertion/deletion operation)
+	    \ 1 if x!=y (cost for letter mismatch)
+
+This particular distance is the exact number of edit needed to transform 
+the string into the other one (and vice versa).
+When two strings have distance 0, they are the same.
+Note that the distance is calculated to reach the _minimum_ cost, i.e.
+choosing the most economic operation for each edit.
+ 
+
+=head1 EXTENDING (by Daniel Yacob)
+
+New modules may build upon Text::WagnerFischer as a base class.
+This is practical when you would like to apply the algorithm
+to non-Roman character sets or would like to change some part
+of the algorithm but not another.
+
+The following example demonstrates how to use the WagnerFisher
+distance algorithm but apply your own weight function in a new
+package:
+
+  package Text::WagnerFischer::MyModule;
+  use base qw( Text::WagnerFischer );
+
+  #
+  # Link to the WagnerFisher "distance" function so that the
+  # new module may also export it:
+  #
+  use vars qw(@EXPORT_OK);
+
+  @EXPORT_OK = qw(&distance);
+
+  *distance = \&Text::WagnerFischer::distance;
+
+  #
+  # "override" the _weight function with the a one:
+  #
+  *Text::WagnerFischer::_weight = \&_my_weight;
+
+  #
+  # "override" the default WagnerFischer "costs" table:
+  #
+  $Text::WagnerFischer::REFC = [0,2,3,1,1];
+
+  sub _my_weight {
+    :
+    :
+    :
+  }
+
+=head1 AUTHOR
+
+Copyright 2002,2003 Dree Mistrut <F<dree@friul.it>>
+
+This package is free software and is provided "as is" without express
+or implied warranty. You can redistribute it and/or modify it under 
+the same terms as Perl itself.
+
+
+=head1 SEE ALSO
+
+C<Text::Levenshtein>, C<Text::PhraseDistance>
+
+
+=cut
+

+ 73 - 0
test.pl

@@ -0,0 +1,73 @@
+use strict;
+use Text::WagnerFischer qw(distance);
+
+my $ko=0;
+my $test=1;
+my $first_distance=distance("foo","four");
+
+if ($first_distance == 2) {
+
+	print $test.". ok\n"
+
+} else {
+
+	print $test.". NO <--\n";
+	$ko=1;
+}
+
+my $second_distance=distance("foo","foo");
+$test++;
+
+if ($second_distance == 0) {
+
+	print $test.". ok\n"
+
+} else {
+
+	print $test.". NO <--\n";
+	$ko=1;
+}
+
+my $third_distance=distance([0,1,2],"foo","four");
+$test++;
+
+if ($third_distance == 3) {
+
+	print $test.". ok\n"
+
+} else {
+
+	print $test.". NO <--\n";
+	$ko=1;
+}
+
+my @words=("four","foo","bar");
+
+my @distances=distance("foo",@words);
+$test++;
+
+if (($distances[0] == 2) and ($distances[1] == 0) and ($distances[2] == 3)) {
+
+	print $test.". ok\n"
+
+} else {
+
+	print $test.". NO <--\n";
+	$ko=1;
+}
+
+@distances=distance([0,5,3],"foo",@words);
+$test++;
+
+if (($distances[0] == 8) and ($distances[1] == 0) and ($distances[2] == 9)) {
+
+	print $test.". ok\n"
+
+} else {
+
+	print $test.". NO <--\n";
+	$ko=1;
+}
+
+
+if ($ko) {print "\nTest suite failed\n"} else {print "\nTest suite ok\n"}