Browse Source

Import upstream version 0.9.3

Reinoso Guzman 10 years ago
commit
a488512ef9
12 changed files with 1040 additions and 0 deletions
  1. 18 0
      Changes
  2. 12 0
      MANIFEST
  3. 19 0
      MANIFEST.SKIP
  4. 43 0
      META.json
  5. 24 0
      META.yml
  6. 18 0
      Makefile.PL
  7. 29 0
      README
  8. 729 0
      lib/Digest/ssdeep.pm
  9. 15 0
      t/00.load.t
  10. 87 0
      t/01.hash.t
  11. 40 0
      t/02.compare.t
  12. 6 0
      t/pod.t

+ 18 - 0
Changes

@@ -0,0 +1,18 @@
+Revision history for Digest-ssdeep
+
+0.9.3  Fri Nov  8 00:15:25 2013
+       Solved bug in ssdeep_compare's minimum common length.
+	   Removed distribution tests like perlcritic and pod_coverage.
+
+
+0.9.2  Fri Nov  7 23:56:47 2013
+       Skip t/perlcritic if no Test::Perl::Critic installed.
+
+
+0.9.1  Thu Nov  7 23:51:24 2013
+       Problems to compile CPAN distribution (missing Makefile.PL).
+
+
+0.9.0  Sat Oct  5 01:54:35 2013
+       Initial working release.
+

+ 12 - 0
MANIFEST

@@ -0,0 +1,12 @@
+Changes
+lib/Digest/ssdeep.pm
+Makefile.PL
+MANIFEST			This list of files
+MANIFEST.SKIP
+README
+t/00.load.t
+t/01.hash.t
+t/02.compare.t
+t/pod.t
+META.yml                                 Module YAML meta-data (added by MakeMaker)
+META.json                                Module JSON meta-data (added by MakeMaker)

+ 19 - 0
MANIFEST.SKIP

@@ -0,0 +1,19 @@
+Build.PL
+blib.*
+Makefile
+Makefile.old
+Build
+Build.bat
+_build*
+pm_to_blib*
+.*tar.gz
+.lwpcookies
+cover_db
+pod2htm.*.tmp
+Digest-ssdeep-*
+\.svn
+.*\.swp
+^MYMETA\.yml$
+^MYMETA\.json$
+pod-coverage.t
+perlcritic.t

+ 43 - 0
META.json

@@ -0,0 +1,43 @@
+{
+   "abstract" : "Pure Perl ssdeep (CTPH) fuzzy hashing",
+   "author" : [
+      "Reinoso Guzman <reinoso.guzman@gmail.com>"
+   ],
+   "dynamic_config" : 1,
+   "generated_by" : "ExtUtils::MakeMaker version 6.66, CPAN::Meta::Converter version 2.120921",
+   "license" : [
+      "unknown"
+   ],
+   "meta-spec" : {
+      "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",
+      "version" : "2"
+   },
+   "name" : "Digest-ssdeep",
+   "no_index" : {
+      "directory" : [
+         "t",
+         "inc"
+      ]
+   },
+   "prereqs" : {
+      "build" : {
+         "requires" : {
+            "ExtUtils::MakeMaker" : "0"
+         }
+      },
+      "configure" : {
+         "requires" : {
+            "ExtUtils::MakeMaker" : "0"
+         }
+      },
+      "runtime" : {
+         "requires" : {
+            "Test::More" : "0",
+            "Text::WagnerFischer" : "0",
+            "version" : "0"
+         }
+      }
+   },
+   "release_status" : "stable",
+   "version" : "v0.9.3"
+}

+ 24 - 0
META.yml

@@ -0,0 +1,24 @@
+---
+abstract: 'Pure Perl ssdeep (CTPH) fuzzy hashing'
+author:
+  - 'Reinoso Guzman <reinoso.guzman@gmail.com>'
+build_requires:
+  ExtUtils::MakeMaker: 0
+configure_requires:
+  ExtUtils::MakeMaker: 0
+dynamic_config: 1
+generated_by: 'ExtUtils::MakeMaker version 6.66, CPAN::Meta::Converter version 2.120921'
+license: unknown
+meta-spec:
+  url: http://module-build.sourceforge.net/META-spec-v1.4.html
+  version: 1.4
+name: Digest-ssdeep
+no_index:
+  directory:
+    - t
+    - inc
+requires:
+  Test::More: 0
+  Text::WagnerFischer: 0
+  version: 0
+version: v0.9.3

+ 18 - 0
Makefile.PL

@@ -0,0 +1,18 @@
+use strict;
+use warnings;
+use ExtUtils::MakeMaker;
+
+WriteMakefile(
+    NAME                => 'Digest::ssdeep',
+    AUTHOR              => 'Reinoso Guzman <reinoso.guzman@gmail.com>',
+    VERSION_FROM        => 'lib/Digest/ssdeep.pm',
+    ABSTRACT_FROM       => 'lib/Digest/ssdeep.pm',
+    PL_FILES            => {},
+    PREREQ_PM => {
+        'Text::WagnerFischer' => 0,
+        'Test::More' => 0,
+        'version'    => 0,
+    },
+    dist                => { COMPRESS => 'gzip -9f', SUFFIX => 'gz', },
+    clean               => { FILES => 'Digest-ssdeep-*' },
+);

+ 29 - 0
README

@@ -0,0 +1,29 @@
+Digest-ssdeep version 0.9.0
+
+This module provides simple implementation of ssdeep fuzzy hashing also known
+as Context Triggered Piecewise Hashing (CTPH).
+
+Please, refer to Jesse Kornblum's paper for a detailed discussion (L<SEE ALSO>).
+
+
+INSTALLATION
+
+To install this module, run the following commands:
+
+	perl Makefile.PL
+	make
+	make test
+	make install
+
+
+DEPENDENCIES
+
+Text::WagnerFischer - An implementation of the Wagner-Fischer edit distance.
+
+
+COPYRIGHT AND LICENCE
+
+Copyright (C) 2013, Reinoso Guzman
+
+This library is free software; you can redistribute it and/or modify
+it under the same terms as Perl itself.

+ 729 - 0
lib/Digest/ssdeep.pm

@@ -0,0 +1,729 @@
+package Digest::ssdeep;
+
+use warnings;
+use strict;
+use Carp;
+use Text::WagnerFischer qw/distance/;
+use List::Util qw/max/;
+
+use version; 
+our $VERSION = qv('0.9.3');
+
+BEGIN {
+    require Exporter;
+    use vars qw(@ISA @EXPORT_OK);
+    @ISA       = qw(Exporter);
+    @EXPORT_OK = qw(
+      ssdeep_hash
+      ssdeep_hash_file
+      ssdeep_compare
+      ssdeep_dump_last
+    );
+}
+
+use constant FNV_PRIME  => 0x01000193;
+use constant FNV_INIT   => 0x28021967;
+use constant MAX_LENGTH => 64;
+
+# Weights:
+#  same                = 0
+#  insertion/deletion  = 1
+#  mismatch            = 2
+#  swap                = N/A (should be 5)
+$Text::WagnerFischer::REFC = [ 0, 1, 2 ];
+
+my @b64 = split '',
+  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+my @DEBUG_LAST;
+
+my @last7chars;    # will use character 7 places before
+{                  # begin rolling hash internals
+
+    my $roll_h1;    # rolling hash internal
+    my $roll_h2;    # rolling hash internal
+    my $roll_h3;    # rolling hash internal
+
+    # Resets the roll hash internal status
+    sub _reset_rollhash {
+        @last7chars =
+          ( 0, 0, 0, 0, 0, 0, 0 );    # will use character 7 places before
+        $roll_h1 = 0;
+        $roll_h2 = 0;
+        $roll_h3 = 0;
+    }
+
+    # Updates rolling_hash's internal state and return the rolling_hash value.
+    # Parameters: the next character.
+    # Returns: the actual rolling hash value
+    sub _roll_hash {
+        my $char    = shift;
+        my $char7bf = shift @last7chars;
+
+        push @last7chars, $char;
+
+        $roll_h2 += 7 * $char - $roll_h1;
+        $roll_h1 += $char - $char7bf;
+
+        $roll_h3 <<= 5;    # 5*7 = 35 (so it vanish after 7 iterations)
+        $roll_h3 &= 0xffffffff;
+        $roll_h3 ^= $char;
+
+        #printf("c=%d  cAnt=%d    H1=%u  H2=%u  H3=%u\n",
+        #	$char, $char7bf,
+        #	$roll_h1, $roll_h2, $roll_h3);
+
+        return $roll_h1 + $roll_h2 + $roll_h3;
+    }
+
+}    # end rolling hash internals
+
+# In-place updates the FNV hash using the new character
+# _update_fnv($fnvhash, $newchar);
+sub _update_fnv {
+    use integer;    # we need integer overflow in multiplication
+    $_[0] *= FNV_PRIME;
+    $_[0] &= 0xffffffff;
+    $_[0] ^= $_[1];
+    no integer;
+}
+
+# Calculates initial blocksize
+# Parameter: the length of the whole data
+sub _calc_initbs {
+    my $length = shift;
+
+    # MAX_LENGTH * bs < length
+    # MAX_LENGTH * 3 * 2 * 2 * 2 * ... < length
+    #my $n = int(log($length / (MAX_LENGTH * 3)) / log(2));
+    #my $bs = 3 * 2**$n;
+    my $bs = 3;
+    $bs *= 2 while ( $bs * MAX_LENGTH < $length );
+
+    return $bs > 3 ? $bs : 3;
+}
+
+# Calculates the ssdeep fuzzy hash of a string
+# Parameters: the string
+# Returns: the fuzzy hash in string or array
+sub ssdeep_hash {
+    my $string = shift;
+
+    return unless defined $string;
+
+    my $bs = _calc_initbs( length $string );
+    @DEBUG_LAST = ();
+
+    my $hash1;
+    my $hash2;
+
+    while (1) {
+        _reset_rollhash();
+        my $fnv1 = FNV_INIT;    # traditional hash blocksize
+        my $fnv2 = FNV_INIT;    # traditional hash 2*blocksize
+
+        $hash1 = '';
+        $hash2 = '';
+
+        for my $i ( 0 .. length($string) - 1 ) {
+            my $c = ord( substr( $string, $i, 1 ) );
+
+            #printf("c: %u, H1=%x\tH2=%x\n", $c, $fnv1, $fnv2);
+
+            my $h = _roll_hash($c);
+            _update_fnv( $fnv1, $c );    # blocksize FNV hash
+            _update_fnv( $fnv2, $c );    # 2* blocksize FNV hash
+
+            if ( $h % $bs == ( $bs - 1 ) and length $hash1 < MAX_LENGTH - 1 ) {
+
+                #printf "Hash $h Trigger 1 at $i\n";
+                my $b64char = $b64[ $fnv1 & 63 ];
+                $hash1 .= $b64char;
+
+                push @DEBUG_LAST,
+                  [ 1, $i + 1, join( '|', @last7chars ), $fnv1, $b64char ];
+
+                $fnv1 = FNV_INIT;
+
+            }
+
+            if ( $h % ( 2 * $bs ) == ( 2 * $bs - 1 )
+                and length $hash2 < MAX_LENGTH / 2 - 1 )
+            {
+
+                #printf "Hash $h Trigger 2 at $i\n";
+                my $b64char = $b64[ $fnv2 & 63 ];
+                $hash2 .= $b64char;
+
+                push @DEBUG_LAST,
+                  [ 2, $i + 1, join( '|', @last7chars ), $fnv2, $b64char ];
+
+                $fnv2 = FNV_INIT;
+            }
+
+        }
+
+        $hash1 .= $b64[ $fnv1 & 63 ];
+        $hash2 .= $b64[ $fnv2 & 63 ];
+
+        push @DEBUG_LAST,
+          [
+            1, length($string),
+            join( '|', @last7chars ), $fnv1,
+            $b64[ $fnv1 & 63 ]
+          ];
+
+        push @DEBUG_LAST,
+          [
+            2, length($string),
+            join( '|', @last7chars ), $fnv2,
+            $b64[ $fnv2 & 63 ]
+          ];
+
+        last if $bs <= 3 or length $hash1 >= MAX_LENGTH / 2;
+
+        $bs = int( $bs / 2 ); # repeat with half blocksize if no enough triggers
+        $bs > 3 or $bs = 3;
+    }
+
+    my @outarray = ( $bs, $hash1, $hash2 );
+    return wantarray ? @outarray : join ':', @outarray;
+}
+
+# Convenient function. Slurps file. You should not use it for long files.
+# You should not use pure perl implementation for long files anyway.
+# Parameter: filename
+# Returns: ssdeep hash in string or array format
+sub ssdeep_hash_file {
+    my $file = shift;
+
+    # Slurp the file (we can also use File::Slurp
+    local ($/);
+    open( my $fh, '<', $file ) or return;
+    my $string = <$fh>;
+    close $fh;
+
+    return ssdeep_hash($string);
+}
+
+# Determines the longest common substring
+sub _lcss {
+    my $strings = join "\0", @_;
+    my $lcs = '';
+
+    for my $n ( 1 .. length $strings ) {
+        my $re = "(.{$n})" . '.*\0.*\1' x ( @_ - 1 );
+        last unless $strings =~ $re;
+        $lcs = $1;
+    }
+
+    return $lcs;
+}
+
+# Calculates how similar two strings are using the Wagner-Fischer package.
+# Parameters: min_lcs, string A, string B
+# Returns: the likeliness being 0 totally dissimilar and 100 same string
+# Returns 0 also if the longest common substring is shorter than min_lcs
+sub _likeliness {
+    my ( $min_lcs, $a, $b ) = @_;
+
+    return 0 unless length( _lcss( $a, $b ) ) >= $min_lcs;
+
+    my $dist = distance( $a, $b );
+
+    #$DB::single = 2;
+
+    # Must follow ssdeep original's code for compatibility
+    #  $dist = 100 * $dist / (length($a) + length($b));
+    $dist = int( $dist * MAX_LENGTH / ( length($a) + length($b) ) );
+    $dist = int( 100 * $dist / 64 );
+
+    $dist > 100 and $dist = 100;
+    return 100 - $dist;
+}
+
+# We accept hash in both array and scalar format
+# Parameters: $hashA, $hashB, [$min_lcs]
+# Parameters: \@hashA, \@hashB, [$min_lcs]
+# Returns: file matching in %
+sub ssdeep_compare {
+    my @hashA;    # hash = bs:hash1:hash2
+    my @hashB;    # hash = bs:hash1:hash2
+    @hashA = ref( $_[0] ) eq 'ARRAY' ? @{ $_[0] } : split ':', $_[0];
+    @hashB = ref( $_[1] ) eq 'ARRAY' ? @{ $_[1] } : split ':', $_[1];
+    my $min_lcs = $_[2] || 7;
+
+    if ( @hashA != 3 or $hashA[0] !~ /\d+/ ) {
+        carp "Argument 1 is not a ssdeep hash.";
+        return;
+    }
+
+    if ( @hashB != 3 or $hashB[0] !~ /\d+/ ) {
+        carp "Argument 2 is not a ssdeep hash.";
+        return;
+    }
+
+    # Remove sequences of more than three repeated character
+    s/(.)\1{3,}/$1/gi for @hashA;
+    s/(.)\1{3,}/$1/gi for @hashB;
+
+    # Remove trailing newlines
+    s/\s+$//gi for @hashA;
+    s/\s+$//gi for @hashB;
+
+    #$DB::single = 2;
+
+    my $like;
+
+    # Blocksize comparison
+    # bsA:hash_bsA:hash_2*bsA
+    # bsB:hash_bsB:hash_2*bsB
+    if ( $hashA[0] == $hashB[0] ) {
+
+        # Compare both hashes
+        my $like1 = _likeliness( $min_lcs, $hashA[1], $hashB[1] );
+        my $like2 = _likeliness( $min_lcs, $hashA[2], $hashB[2] );
+        $like = max( $like1, $like2 );
+    }
+    elsif ( $hashA[0] == 2 * $hashB[0] ) {
+
+        # Compare hash_bsA with hash_2*bsB
+        $like = _likeliness( $min_lcs, $hashA[1], $hashB[2] );
+    }
+    elsif ( 2 * $hashA[0] == $hashB[0] ) {
+
+        # Compare hash_2*bsA with hash_bsB
+        $like = _likeliness( $min_lcs, $hashA[2], $hashB[1] );
+    }
+    else {
+
+        # Nothing suitable to compare, sorry
+        return 0;
+    }
+
+    return $like;
+}
+
+# Dump internals information. See help.
+sub ssdeep_dump_last {
+    my @result;
+    for (@DEBUG_LAST) {
+        push @result, join ",", @{$_};
+    }
+    return @result;
+}
+
+1;    # Magic true value required at end of module
+__END__
+
+=head1 NAME
+
+Digest::ssdeep - Pure Perl ssdeep (CTPH) fuzzy hashing
+
+=head1 VERSION
+
+This document describes Digest::ssdeep version 0.9.0
+
+
+=head1 SYNOPSIS
+
+    use Digest::ssdeep qw/ssdeep_hash ssdeep_hash_file/;
+
+    $hash = ssdeep_hash( $string );
+    # or in array context:
+    @hash = ssdeep_hash( $string );
+
+    $hash = ssdeep_hash_file( "data.txt" );
+
+    @details = ssdeep_dump_last();
+    
+    
+    use Digest::ssdeep qw/ssdeep_compare/;
+
+    $match = ssdeep_compare( $hashA, $hashB );
+    $match = ssdeep_compare( \@hashA, \@hashB );
+	
+
+
+=head1 DESCRIPTION
+
+This module provides simple implementation of ssdeep fuzzy hashing also known as Context Triggered Piecewise
+Hashing (CTPH).
+
+=head2 Fuzzy hashing algorithm
+
+Please, refer to Jesse Kornblum's paper for a detailed discussion (L<SEE ALSO>).
+
+To calculate the CTPH we should choose a maximum signature length. Then divide
+the file in as many chunks as this length. Calculate a hash or checksum for
+each chunk and map it to a character. The fuzzy hashing is the concatenation of
+all the characters.
+
+We cannot use fixed length blocks to separate the file. Because if we add or
+remove a character all of the following blocks are also changed. So we must
+divide the file using the "context" i.e. a block starts and ends in one of the
+predefined sequence of characters. So the problem is 'Which contexts
+-sequences- we define to separate the file in N parts?.'
+
+This is the 'roll' of the I<rolling hash>. It is a function of the N last
+inputs, in this case the 7 last characters. The result of the rolling hash
+function is uniformly spread between all valid output values.  This makes the
+rolling hash some kind of I<pseudo-random> function whose output depends only
+on the last N characters. Since the output is supposed to be uniform, we can
+modulus BS and the expected values are 0 to BS-1 with the same probability.
+
+Let the blocksize (BS) be the length of file divided by the maximum signature
+length (i.e. 64). If we split the file each time the rolling hash mod BS gives
+BS-1 we get 64 blocks.  This is not a good approach because if the length
+changes, blocksize changes also. So we cannot compare files with dissimilar
+sizes. One good approach is to take some 'predefined' blocksizes and choose the
+one that fits based on the file size. The blocksizes in ssdeep are C<3, 6, 12,
+..., 3 * 2^i>.
+
+So this is the algorithm:
+
+=over
+
+=item *
+
+Given the file size we calculate an initial blocksize (BS).
+
+=item *
+
+For each character we calculate the rolling hash R. Its output value depends
+only on the 7 last characters sequence.
+
+=item *
+
+Each time C<R mod BS = BS-1> (we meet one of the trigger 7 characters
+sequences) we write down the I<traditional hash> of the current block and start
+another block.
+
+=back
+
+The pitfall is Rolling Hash is statistically uniform, but it does not mean it
+will give us exactly 64 blocks.
+
+=over
+
+=item *
+
+Sometimes it will gives us more than 64 blocks. In that case we will
+concatenate the trailing blocks.
+
+=item *
+
+Sometimes it will gives us less than 64 blocks. No problem, 64 is the maximum
+length, it can be less.
+
+=item *
+
+Sometimes it will gives us less than 32 blocks. In that case, we should try a
+half-size blocksize to get more blocks.
+
+=back
+
+The I<traditional hash> is an usual hash or checksum function. We use 32 bit
+FNV-1a hash (L<SEE ALSO>). But its output is 32 bits, so we need to map it to a
+base-64 character alphabet. That is, we only use the 6 least significant bits
+of FNV-1a hash.
+
+
+=head2 Output
+
+The ssdeep hash has this shape: C<BS:hash1:hash2>
+
+=over
+
+=item B<BS>
+
+It is the blocksize. We can only compare hashes from the same blocksize.
+
+=item B<hash1>
+
+This is the concatenation of FNV-1a results (mapped to 64 characters) for each block in the file.
+
+=item B<hash2>
+
+This is the same that hash1 but using double the blocksize. We write this result
+because a small change can halve or double the blocksize. If this happens,
+we can compare at least one part of the two signatures.
+
+=back
+
+=head2 Comparison
+
+There are several algorithms to compare two strings. I have used the same that
+ssdeep uses for compatibility reasons. Only in certain cases, the result from
+this module is not the same as ssdeep compiled version.  Please see
+L<DIFFERENCES> below for details.
+
+These are the steps for matching calculation:
+
+=over
+
+=item *
+
+The first step is to compare the block sizes. We only can compare hashes calculated
+for the same block size. In one ssdeep string we have both blocksize and double
+blocksize hashes. So we try to match at least of the hashes. If they have no
+common block sizes, the comparison returns 0.
+
+=item *
+
+Remove sequences of more than three equal characters. These same character
+sequences have little information about the file and bias the matching score.
+
+=item *
+
+Test for a coincidence of, at least 7 characters. This is the default, but this
+value can be changed. If the longest common substring is not a least this
+length, the function returns 0. We expect a lot of collisions since we are
+mapping 32 bit FNV values into 64 character output. This is a way to remove
+false positives.
+
+=item *
+
+We use the Wagner-Fischer algorithm to compute the Levenshtein distance using
+these weights:
+
+=over
+
+=item *
+
+Same character: 0
+
+=item *
+
+Adition or deletion: 1
+
+=item *
+
+Sustitution: 2
+
+=back
+
+=item *
+
+Following the original ssdeep algorithm we scale the value so the output be between 0
+and 100.
+
+=back
+
+
+
+
+=head1 INTERFACE 
+
+This section describes the recommended interface for generating and comparing
+ssdeep fuzzy hashes.
+
+=over
+
+=item B<ssdeep_hash>
+
+Calculates the ssdeep hash of the input string. 
+
+Usage:
+
+    $hash = ssdeep_hash( $string );
+
+or in array context
+
+    @hash = ssdeep_hash( $string );
+
+In scalar context it returns a
+hash with the format C<bs:hash1:hash2>. Being C<bs> the blocksize, C<hash1>
+the fuzzy hash for this blocksize and C<hash2> the hash for double blocksize.
+The maximum length of each hash is 64 characters.
+
+In array context it returns the same components above but in a 3 elements array.
+
+=item B<ssdeep_hash_file>
+
+Calculates the hash of a file.
+
+Usage:
+
+    $hash = ssdeep_hash_file( "/tmp/malware1.exe" );
+
+This is a convenient function. Returns the same of ssdeep_file in scalar or
+array context.
+
+Since this function slurps the whole file into memory, you should not use it in
+big files. You should not use this module for big files, use libfuzzy wrapper
+instead (L<BUGS AND LIMITATIONS>).
+
+Returns B<undef> on errors.
+
+=item B<ssdeep_compare>
+
+Calculates the matching between two hashes.
+
+Usage. To compare two scalar hashes:
+
+    $match = ssdeep_compare( $hashA, $hashB );
+
+To compare two hashes in array format:
+
+    $match = ssdeep_compare( \@hashA, \@hashB );
+
+The default is to discard hashes with less than 7 characters common substring.
+To override this default and set this limit to any number you can use:
+
+    $match = ssdeep_compare( $hashA, $hashB, 4 );
+
+The result is a matching score between 0 and 100. See L<Comparison> for
+algorithm details.
+
+
+=item B<ssdeep_dump_last>
+
+Returns an array with information of the last hash calculation. Useful for
+debugging or extended details.
+
+Usage after a calculation:
+
+    $hash    = ssdeep_hash_file( "/tmp/malware1.exe" );
+    @details = ssdeep_dump_last();
+
+The output is an array of CSV values.
+
+    ...
+    2,125870,187|245|110|27|190|66|97,1393131242,q
+    1,210575,13|216|13|115|29|52|208,4009217630,e
+    2,210575,13|216|13|115|29|52|208,4009217630,e
+    1,210730,61|231|220|179|40|89|210,1069791891,T
+    1,237707,45|66|251|98|56|138|91,4014305026,C
+    ....
+
+Meaning of the output array:
+
+=over
+
+=item B<Field 1>
+
+Part of the hash which is affected. 1 for the fist part, 2 for the second part.
+
+=item B<Field 2>
+
+Offset of the file where the chunk ends. 
+
+=item B<Field 3>
+
+Sequence of 7 characters that triggered the rolling hash.
+
+=item B<Field 4>
+
+Value of the rolling hash at this moment.
+
+=item B<Field 5>
+
+Character output to the fuzzy hash due to this rolling hash trigger.
+
+=back
+
+So we can read it this way:
+
+At byte 125870 of the input file, there is a sequence of these 7 characters:
+C<187 245 110 27 190 66 97>. That sequence triggered the second part of the
+hash. The FNV hash value of the current chunk is 1393131242 that maps to
+character C<q>.
+
+Or this way:
+
+From the 4th row I know the letter C<T> in the first hash comes from the
+chunk that started at 210575+1 (the one-starting row before) and ends at
+210730. The whole FNV hash of this block was 1069791891.
+
+=back
+
+
+=head1 BUGS AND LIMITATIONS
+
+=over
+
+=item B<Small blocksize comparison>
+
+Original ssdeep limit the matching of small blocksize hashes. So when comparing
+them the matching is limited by its size and is never 100%. This algorithm do
+not behaviours that way. Small block sizes hashes are compared as big block
+sizes ones.
+
+=item B<Performance>
+
+This is a Pure Perl implementation. The performance is far from optimal. To
+calculate hashes more efficiently, please use compiled software like libfuzzy
+bindings (L<SEE ALSO>).
+
+=item B<Test 64 bits systems>
+
+This module has not been tested in 64 bit systems yet.
+
+=back
+
+
+Please report any bugs or feature requests to
+C<bug-digest-ssdeep@rt.cpan.org>, or through the web interface at
+L<http://rt.cpan.org>.
+
+=head1 SEE ALSO
+
+=over
+
+=item Ssdeep's home page
+
+L<http://ssdeep.sourceforge.net/>
+
+=item Jesse Kornblum's original paper I<Identifying almost identical files using context triggered piecewise hashing>
+
+L<http://dfrws.org/2006/proceedings/12-Kornblum.pdf>
+
+=item I<Data::FuzzyHash> Perl binding of binary libfuzzy libraries
+
+L<https://github.com/hideo55/Data-FuzzyHash>
+
+=item Text::WagnerFischer - An implementation of the Wagner-Fischer edit distance.
+
+L<http://search.cpan.org/perldoc?Text%3A%3AWagnerFischer>
+
+=item FNV hash's description
+
+L<http://www.isthe.com/chongo/tech/comp/fnv/>
+
+=back
+
+=head1 AUTHOR
+
+Reinoso Guzman  C<< <reinoso.guzman@gmail.com> >>
+
+
+=head1 LICENCE AND COPYRIGHT
+
+Copyright (c) 2013, Reinoso Guzman C<< <reinoso.guzman@gmail.com> >>. All rights reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself. See L<perlartistic>.
+
+
+=head1 DISCLAIMER OF WARRANTY
+
+BECAUSE THIS SOFTWARE IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE SOFTWARE, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE SOFTWARE "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER
+EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE
+ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE SOFTWARE IS WITH
+YOU. SHOULD THE SOFTWARE PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL
+NECESSARY SERVICING, REPAIR, OR CORRECTION.
+
+IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE SOFTWARE AS PERMITTED BY THE ABOVE LICENCE, BE
+LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL,
+OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE
+THE SOFTWARE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE SOFTWARE TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.

+ 15 - 0
t/00.load.t

@@ -0,0 +1,15 @@
+use Test::More tests => 2;
+
+BEGIN {
+use_ok( 'Digest::ssdeep' );
+}
+
+
+can_ok('Digest::ssdeep', qw(ssdeep_compare ssdeep_hash ssdeep_hash_file));
+
+#my $obj = new_ok( $class );
+
+
+
+
+#diag( "Testing Digest::ssdeep $Digest::ssdeep::VERSION" );

+ 87 - 0
t/01.hash.t

@@ -0,0 +1,87 @@
+use Test::More tests => 32;
+use File::Temp qw/tempfile/;
+
+use Digest::ssdeep qw/ssdeep_hash ssdeep_hash_file/;
+
+while (my $line = <DATA>) {
+	next unless $line =~ /^\d+,/;
+
+	my ($i, $hash, $file) = split /,/, $line;
+
+	ok ( ssdeep_hash(rand_str($i)) eq $hash, "Tested $i characters long" );
+}
+
+# Test array output as well
+my @expected = qw{24 ivk6g/3pXNpRobD3qebULKLTTSOcHwbKLmp/j/FdkAs5n4QKnn6mU5kOHQpBVUQY Ek6g/3pX/ebD3qebcKLTTcmKm/Tkx5nP};
+my @got = ssdeep_hash(rand_str(1477));
+is_deeply ( \@got, \@expected, 'Array output' );
+
+
+
+# Test hash_file
+my ($fh, $filename) = tempfile();
+print $fh rand_str(11222);
+close $fh;
+
+@expected = qw{192 vuwl3pBbm9YPL2zSUnZBXGmy3977H63khOKmU4LoDdX7tbIi5h/GiLjNIpap7c6Q vuM3vmyp2ZBXGPPj8LapbIi//LLjKpYa};
+@got = ssdeep_hash_file($filename);
+is_deeply ( \@got, \@expected, 'Hash file (array output)' );
+
+my $expected = '192:vuwl3pBbm9YPL2zSUnZBXGmy3977H63khOKmU4LoDdX7tbIi5h/GiLjNIpap7c6Q:vuM3vmyp2ZBXGPPj8LapbIi//LLjKpYa';
+my $got = ssdeep_hash_file($filename);
+ok ( $got eq $expected, 'Hash file (string output)' );
+
+unlink $filename;
+
+
+
+
+sub rand_str {
+	my $i = shift;
+	srand($i);
+	return join "", map chr(rand(256)), 1..$i;
+}
+
+__DATA__
+# Cannot use same-character string to test CTPH.
+# The random string is the result of:
+#
+# for $i = ...
+#	srand($i);
+#	print join "", map chr(rand(256)), 1..$i;'
+#
+# Bash:
+# for j in {1..35}; do i=$(echo "1.5^$j/1" | bc); echo -n $i,; i=$i perl -e '$i = $ENV{i}; srand($i); print join "", map chr(rand(256)), 1..$i;' | (ssdeep|grep -v filename) ; done > /tmp/gg
+#
+#
+# bytes,blocksize:hash1:hash2,"filename"
+1,3:v:v,"stdin"
+2,3:Mn:Mn,"stdin"
+3,3:xn:x,"stdin"
+5,3:L:L,"stdin"
+7,3:dzfn:Nn,"stdin"
+11,3:uc1gC:uc1gC,"stdin"
+17,3:uxFtK263n:uxw,"stdin"
+25,3:YtuzKZXO:YtuzKpO,"stdin"
+38,3:sFbOyjIJhgGUnEn:sFiybE,"stdin"
+57,3:PMomopd2UfXYppfj:PMx0QLffj,"stdin"
+86,3:opgonrBMOZB12pieqyrxBRNTDidtHMi:h6rqO/1ITrxBWdT,"stdin"
+129,3:IxbXw09OOwvKtc76KwgSYvTt3lnNNqyph:I9XwMptc76rYL/J,"stdin"
+194,3:Pfo9vJGQBNx9W3wMR+9rKTALvZq48HSR9IELUfjnLRUfmDDfbqjT:Pfov7GgB8MLB71IELmjnNU+DDDqX,"stdin"
+291,6:AfzKNnARBm8mPTRaWGuvJV7n5Z0XB6hL7mSgQHf:oglTRaWjV75eMF/,"stdin"
+437,6:Oujj0YPmJZ+7WduliNE4A2fT+iuYS0PV66XlyQMXm/pIZb9MSm6jxEMUGSFoK/YH:M3kliNE4A27uqPc6MQMY7l69xilhDP+,"stdin"
+774,12:SCUqMuapucCeqQX1EXdjv8/Q/KkdAj92uEpz5SrBa8fHc+dn3MRdYhNiwk:jMu6uvHNjvwQ/KOM0pzoYu3MmNib,"stdin"
+781,24:t2KPOJaY8jGIvIStgvaB/Jb06mP+g0cnXMSNsn:20amIrF/rMwsn,"stdin"
+790,12:EFFzuHf0r7rBnz0yE1IdHpciXyMVzgh+sShm9vZUXK7IGtNZ:E7hv1pciCM0SmBv7tX,"stdin"
+985,24:RuzbjIE/Z+dONABU13ZDj5xzHdBoX975kc8eFFhX:RyXI5dVBexzHvCpXFDX,"stdin"
+1477,24:ivk6g/3pXNpRobD3qebULKLTTSOcHwbKLmp/j/FdkAs5n4QKnn6mU5kOHQpBVUQY:Ek6g/3pX/ebD3qebcKLTTcmKm/Tkx5nP,"stdin"
+2216,48:xYD4x+0DrgLQ337bompHHYsJ8cfF0iXWrnOwFK4B76axlaztA6R5WeHc998AvNbo:xa440D4Q3fompYsjt7XaOoXPsz+6R5Wg,"stdin"
+3325,96:evgnpMXzamDB84B2lbdCvaNpPyJCfA1k9vTR1x1:7md72NIC2JatFTT,"stdin"
+4987,96:gjSMDET+aSRpDP6dEtJnHVq+3FyDa5Skna3vJmTxiV0qDOcQOXjU:UHgbmDSAHVq+34e51nYxqkOqTU,"stdin"
+7481,192:5rn1Ka+ZNuwPf004r/LI/za5ISVh00wEs8NWmOI:5rnkaMPf0lLsST+38Z,"stdin"
+11222,192:vuwl3pBbm9YPL2zSUnZBXGmy3977H63khOKmU4LoDdX7tbIi5h/GiLjNIpap7c6Q:vuM3vmyp2ZBXGPPj8LapbIi//LLjKpYa,"stdin"
+25251,384:HgEQu3dxfXKHYINP/4iKjxhEXuydSP16Vac9TrN+y5sPRa73le/ejEFJjXgNIWo:HgIPK4CojxhEXuGVac9TrNMm+/8NTo,"stdin"
+37876,768:rCuHZxxXi6BXUfX6IFX6UTfrCt/aizyg2JJM4+GDcDXm9qkDftzCF:/HFLBXCcUkaimgKaQUXcXVzY,"stdin"
+85222,1536:wiP27adNLE8kqAomXaQxWOQLqbEqW6OxqNNx+wVmxjlYvvqwnSea:Wa7LvkrvXatOTFOMNxDVmxuvJ4,"stdin"
+287626,6144:vAj/s/W02dAPOeb8I29VgXa3t5MezyGSLh+5Tw/U91tl3/MCL6:IAd6AWetbK3t5j9SLE5R91thB2,"stdin"
+

+ 40 - 0
t/02.compare.t

@@ -0,0 +1,40 @@
+use Test::More tests => 19;
+
+use Digest::ssdeep qw/ssdeep_compare/;
+
+while (my $line = <DATA>) {
+	next unless $line =~ /^\d+,/;
+
+	my ($expected, $hashA, $hashB) = split /,/, $line;
+
+	my $result = ssdeep_compare($hashA, $hashB);
+	ok ( $result == $expected, "Expected $expected got $result" );
+}
+
+# Test array inputs
+my @a = qw{12 0HRluo1TQSDWvSIQLx6MI4kl0TZQ1I8g1Lwzg9/ZtLFlJlVtD5HaEmzm2 0H910SiKIU6MI4klIQ1Ymzg9BlVz6k2};
+my @b = qw{12 qw4HGEhP87m4TAnveuu5LL6n0X56kluo1TQSDWvSIQLx6MI4kl0TZG qwcGEhgmLve9Sn0pJ10SiKIU6MI4klIG};
+my $result = ssdeep_compare(\@a, \@b);
+ok ( $result == 49, "Array inputs" );
+
+
+
+__DATA__
+0,96:5oCWg1vDT2ioEb9wa6XLN95Mz7TAu+HktzDO3s4OqoNPFGTnNnJJ:1vDT2mbj6Z9Eyi/nvKp3,3:69PRML+LFJz+JRjA+TwSXiwKX3FuglXSF8XRKFVEXEtk9J2MZALNfy+vFzuFJnML:tLy5SRjAnSXuQ+DReVE3oeupvFuFJMLn
+0,12:sOI4kl0TZQ1I8g1Lwzg9/ZtLFlJlVtD5HaEmzmQSD6ecw/kP1Nbgi2:sOI4klIQ1Ymzg9BlVz6kQXwu4,6:QkluDCwgF9nebATXgFSHpXriQKStdWuKJfKHoZuESI0Lx6LkLILtWk0bXeENpXzA:Qkluo1TQSDWvSIQLx6MI4kl0Ts
+36,48:EtzDRs3ns4VcOFMPNyy5CsGLve9SnqJS9xJ5lIsFUzLzGXww:EtzDO3s4OqoNPFGTnNnJ4XzGXH,96:HtgFlk/vOAm4hGOjaoCWg1vDT2ioEb9wa6XLN95Mz7TAu+HktzDO3s4OqoNPFGTi:OryvOshGfvDT2mbj6Z9Eyi/nvKpk
+40,48:UYq+C3O2xFLyW9I8MzCIBTASrh+HnktzDRs3ns4VcOFMPNyy5Cw:J6XLN95Mz7TAu+HktzDO3s4OqoNPf,24:wYfnsz38augbOFMPNyyoobuwcGEhgmLve9Sn0pJ10F:rns4VcOFMPNyy5CsGLve9SnqJSF
+49,12:0HRluo1TQSDWvSIQLx6MI4kl0TZQ1I8g1Lwzg9/ZtLFlJlVtD5HaEmzm2:0H910SiKIU6MI4klIQ1Ymzg9BlVz6k2,12:qw4HGEhP87m4TAnveuu5LL6n0X56kluo1TQSDWvSIQLx6MI4kl0TZG:qwcGEhgmLve9Sn0pJ10SiKIU6MI4klIG
+52,48:t1iswmvOfLGMmyGmiQqhm8sxbSmGJXTyVV1QuNWg1UAbBc2T2ixuerlPD99ygGny:t/vOAm4hGOjaoCWg1vDT2ioEb9w+,96:jGOjaoCWg1vDT2ioEb9wa6XLN95Mz7TAu+HktzDO3s4OqoNPFq:jGfvDT2mbj6Z9Eyi/nvq
+57,96:jv1+OCIZXltgFlk/vOAm4hGOjaoCWg1vP:jt+AZXEryvOshGfvP,96:FL/i2TzD8PIfD4PGFJE0h40u7fpdv1+OCIZXltgFlk/vOAm4hGU:FL6lP44P+JEH77fHt+AZXEryvOshGU
+58,192:v77fHt+AZXEryvOshGfvDT2mbj6Z9Eyi/nvKpq:vnfHt+2XSyPhGfrTH/6Z9jQnCpq,96:PL/i2TzD8PIfD4PGFJE0h40u7fpdv1+OCIZXltgFlk/vOAm4hGOjaoCWg1vDT2iP:PL6lP44P+JEH77fHt+AZXEryvOshGfvb
+61,192:F4P+JEH77fHt+AZXEryvOshGfvDT2mbj6Z9Eyi/nvKpQ:FQ++HnfHt+2XSyPhGfrTH/6Z9jQnCpQ,96:sv1+OCIZXltgFlk/vOAm4hGOjaoCWg1vDT2ioEb9wt:st+AZXEryvOshGfvDT2mbO
+72,192:G7fHt+AZXEryvOshGfvDT2mbj6Z9Eyi5:ifHt+2XSyPhGfrTH/6Z9jg,96:51gFlk/vOAm4hGOjaoCWg1vDT2ioEb9wa6XLN95E:5mryvOshGfvDT2mbj6Z9i
+75,96:Av1+OCIZXltgFlk/vOAm4hGOjaoCWg1vDT2ioEb9wa6XLN95MV:At+AZXEryvOshGfvDT2mbj6Z9A,192:t44P+JEH77fHt+AZXEryvOshGfvDT2mbj6Z9Eyx:t4Q++HnfHt+2XSyPhGfrTH/6Z9jx
+86,96:oL/i2TzD8PIfD4PGFJE0h40u7fpdv1+OCIZXltgFlk/vOAm4hGOjaoCWg1vV:oL6lP44P+JEH77fHt+AZXEryvOshGfvV,192:XL6lP44P+JEH77fHt+AZXEryvOshGfvDT2mbj6E:XOlP4Q++HnfHt+2XSyPhGfrTH/6E
+93,192:jL6lP44P+JEH77fHt+AZXEryvOshGfvDT2mbj6Z9Eyi/nvKpSzh:jOlP4Q++HnfHt+2XSyPhGfrTH/6Z9jQV,192:sLpL6lP44P+JEH77fHt+AZXEryvOshGfvDT2mbj6Z9Eyi/nvKpF:sVOlP4Q++HnfHt+2XSyPhGfrTH/6Z9j9
+29,96:7v1+OCIZXltgFlk/vOAm4hGOjaoCWg1vDT2ioEb9wa6s:7t+AZXEryvOshGfvDT2mbj6s,96:aT2ioEb9wa6XLN95Mz7TAu+HktzDO3s4OqoNPFGTnNnJ4:aT2mbj6Z9Eyi/nvKpC
+27,48:T9ygGnpLYq+C3O2xFLyW9I8MzCIBTASrh+HnktzDRs3ns4V8:Twa6XLN95Mz7TAu+HktzDO3s42,96:clk/vOAm4hGOjaoCWg1vDT2ioEb9wa6XLN9N:ayvOshGfvDT2mbj6Z9N
+24,192:EHLpL6lP44P+JEH77fHt+AZXEryvOshGfvDT2mbjR:uVOlP4Q++HnfHt+2XSyPhGfrTH/R,96:PaoCWg1vDT2ioEb9wa6XLN95Mz7TAu+HktzDO3s4OqoNPFGTnNnJ4XzGX:CvDT2mbj6Z9Eyi/nvKpSza
+24,48:IOfLGMmyGmiQqhm8sxbSmGJXTyVV1QuNWg1UAN:IOAm4hGOjaoCWg1vN,48:XV1QuNWg1UAbBc2T2ixuerlPD99ygGnpLYq+C3O2xFLyW9I8MzCIBTASi:XoCWg1vDT2ioEb9wa6XLN95Mz7TAz
+100,192:jL6lP44P+JEH77fHt+AZXEryvOshGfvDT2mbj6Z9Eyi/nvKpSzh:jOlP4Q++HnfHt+2XSyPhGfrTH/6Z9jQV,192:jL6lP44P+JEH77fHt+AZXEryvOshGfvDT2mbj6Z9Eyi/nvKpSzh:jOlP4Q++HnfHt+2XSyPhGfrTH/6Z9jQV

+ 6 - 0
t/pod.t

@@ -0,0 +1,6 @@
+#!perl -T
+
+use Test::More;
+eval "use Test::Pod 1.14";
+plan skip_all => "Test::Pod 1.14 required for testing POD" if $@;
+all_pod_files_ok();