| 
					
				 | 
			
			
				@@ -0,0 +1,245 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+package Text::WagnerFischer; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+use strict; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+use Exporter; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $REFC); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+$VERSION     = '0.04'; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+@ISA         = qw(Exporter); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+@EXPORT      = (); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+@EXPORT_OK   = qw(&distance); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+%EXPORT_TAGS = (); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+$REFC=[0,1,1]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+sub _min { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	my ($first,$second,$third)=@_; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	my $result=$first; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	$result=$second if ($second < $result); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	$result=$third if ($third < $result); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	return $result 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+sub _weight { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	#the cost function 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	my ($x,$y,$refc)=@_; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	if ($x eq $y) { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		return $refc->[0] #cost for letter match 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	} elsif (($x eq '-') or ($y eq '-')) { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		return $refc->[1] #cost for insertion/deletion operation 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	} else { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		return $refc->[2] #cost for letter mismatch 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+sub distance { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	my ($refc,$s,@t)=@_; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	if (!@t) { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		if (ref($refc) ne "ARRAY") { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			if (ref($s) ne "ARRAY") { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+				#array cost missing: using default [0,1,1] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+				$t[0]=$s; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+				$s=$refc; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+				$refc=$REFC; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			} else { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	           		require Carp; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        	      		Carp::croak("Text::WagnerFischer: second string is needed"); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		} else { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+           		require Carp; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+       	      		Carp::croak("Text::WagnerFischer: second string is needed"); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	} elsif (ref($refc) ne "ARRAY") { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		#array cost missing: using default [0,1,1] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		unshift @t,$s; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		$s=$refc; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		$refc=$REFC; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	my $n=length($s); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	my @result; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	foreach my $t (@t) { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		my @d; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		my $m=length($t); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		if(!$n) {push @result,$m*$refc->[1];next} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		if(!$m) {push @result,$n*$refc->[1];next} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		$d[0][0]=0; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		# original algorithm should be: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		# foreach my $i (1 .. $n) { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		# 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		#	my $dist_tmp=0; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		#	foreach my $k (1 .. $i) {$dist_tmp+=_weight(substr($s,$i,1),'-',$refc)} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		#	$d[$i][0]=$dist_tmp; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		# } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		# 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		# foreach my $j (1 .. $m) { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		# 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		#	my $dist_tmp=0; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		#	foreach my $k (1 .. $j) {$dist_tmp+=_weight('-',substr($t,$j,1),$refc)} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		#	$d[0][$j]=$dist_tmp; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		# } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		# that is: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		foreach my $i (1 .. $n) {$d[$i][0]=$i*$refc->[1];} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		foreach my $j (1 .. $m) {$d[0][$j]=$j*$refc->[1];} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		foreach my $i (1 .. $n) { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			my $s_i=substr($s,$i-1,1); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			foreach my $j (1 .. $m) { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+				my $t_i=substr($t,$j-1,1); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+				$d[$i][$j]=_min($d[$i-1][$j]+_weight($s_i,'-',$refc), 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+						 $d[$i][$j-1]+_weight('-',$t_i,$refc), 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+						 $d[$i-1][$j-1]+_weight($s_i,$t_i,$refc)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		push @result,$d[$n][$m]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	if (wantarray) {return @result} else {return $result[0]} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+1; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+__END__ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+=head1 NAME 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+Text::WagnerFischer - An implementation of the Wagner-Fischer edit distance 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+=head1 SYNOPSIS 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ use Text::WagnerFischer qw(distance); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ print distance("foo","four");# prints "2" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ print distance([0,1,2],"foo","four");# prints "3" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ my @words=("four","foo","bar"); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ my @distances=distance("foo",@words);  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ print "@distances"; # prints "2 0 3" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ @distances=distance([0,2,1],"foo",@words);  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ print "@distances"; # prints "3 0 3" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+=head1 DESCRIPTION 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+This module implements the Wagner-Fischer dynamic programming technique, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+used here to calculate the edit distance of two strings. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+The edit distance is a measure of the degree of proximity between two strings, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+based on "edits": the operations of substitutions, deletions or insertions 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+needed to transform the string into the other one (and vice versa). 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+A cost (weight) is needed for every of the operation defined above: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	    / a if x=y (cost for letter match) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ w(x,y) =  |  b if x=- or y=- (cost for insertion/deletion operation) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	    \ c if x!=y (cost for letter mismatch) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+These costs are given through an array reference as first argument of the  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+distance subroutine: [a,b,c]. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+If the costs are not given, a default array cost is used: [0,1,1] that is the 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+case of the Levenshtein edit distance: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	    / 0 if x=y (cost for letter match) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ w(x,y) =  |  1 if x=- or y=- (cost for insertion/deletion operation) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	    \ 1 if x!=y (cost for letter mismatch) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+This particular distance is the exact number of edit needed to transform  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+the string into the other one (and vice versa). 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+When two strings have distance 0, they are the same. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+Note that the distance is calculated to reach the _minimum_ cost, i.e. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+choosing the most economic operation for each edit. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+=head1 EXTENDING (by Daniel Yacob) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+New modules may build upon Text::WagnerFischer as a base class. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+This is practical when you would like to apply the algorithm 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+to non-Roman character sets or would like to change some part 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+of the algorithm but not another. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+The following example demonstrates how to use the WagnerFisher 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+distance algorithm but apply your own weight function in a new 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+package: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  package Text::WagnerFischer::MyModule; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  use base qw( Text::WagnerFischer ); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  # 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  # Link to the WagnerFisher "distance" function so that the 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  # new module may also export it: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  # 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  use vars qw(@EXPORT_OK); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  @EXPORT_OK = qw(&distance); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  *distance = \&Text::WagnerFischer::distance; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  # 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  # "override" the _weight function with the a one: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  # 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  *Text::WagnerFischer::_weight = \&_my_weight; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  # 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  # "override" the default WagnerFischer "costs" table: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  # 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  $Text::WagnerFischer::REFC = [0,2,3,1,1]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  sub _my_weight { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    : 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    : 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    : 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+=head1 AUTHOR 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+Copyright 2002,2003 Dree Mistrut <F<dree@friul.it>> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+This package is free software and is provided "as is" without express 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+or implied warranty. You can redistribute it and/or modify it under  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+the same terms as Perl itself. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+=head1 SEE ALSO 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+C<Text::Levenshtein>, C<Text::PhraseDistance> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+=cut 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 |