123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729 |
- package Digest::ssdeep;
- use warnings;
- use strict;
- use Carp;
- use Text::WagnerFischer qw/distance/;
- use List::Util qw/max/;
- use version;
- our $VERSION = qv('0.9.3');
- BEGIN {
- require Exporter;
- use vars qw(@ISA @EXPORT_OK);
- @ISA = qw(Exporter);
- @EXPORT_OK = qw(
- ssdeep_hash
- ssdeep_hash_file
- ssdeep_compare
- ssdeep_dump_last
- );
- }
- use constant FNV_PRIME => 0x01000193;
- use constant FNV_INIT => 0x28021967;
- use constant MAX_LENGTH => 64;
- # Weights:
- # same = 0
- # insertion/deletion = 1
- # mismatch = 2
- # swap = N/A (should be 5)
- $Text::WagnerFischer::REFC = [ 0, 1, 2 ];
- my @b64 = split '',
- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
- my @DEBUG_LAST;
- my @last7chars; # will use character 7 places before
- { # begin rolling hash internals
- my $roll_h1; # rolling hash internal
- my $roll_h2; # rolling hash internal
- my $roll_h3; # rolling hash internal
- # Resets the roll hash internal status
- sub _reset_rollhash {
- @last7chars =
- ( 0, 0, 0, 0, 0, 0, 0 ); # will use character 7 places before
- $roll_h1 = 0;
- $roll_h2 = 0;
- $roll_h3 = 0;
- }
- # Updates rolling_hash's internal state and return the rolling_hash value.
- # Parameters: the next character.
- # Returns: the actual rolling hash value
- sub _roll_hash {
- my $char = shift;
- my $char7bf = shift @last7chars;
- push @last7chars, $char;
- $roll_h2 += 7 * $char - $roll_h1;
- $roll_h1 += $char - $char7bf;
- $roll_h3 <<= 5; # 5*7 = 35 (so it vanish after 7 iterations)
- $roll_h3 &= 0xffffffff;
- $roll_h3 ^= $char;
- #printf("c=%d cAnt=%d H1=%u H2=%u H3=%u\n",
- # $char, $char7bf,
- # $roll_h1, $roll_h2, $roll_h3);
- return $roll_h1 + $roll_h2 + $roll_h3;
- }
- } # end rolling hash internals
- # In-place updates the FNV hash using the new character
- # _update_fnv($fnvhash, $newchar);
- sub _update_fnv {
- use integer; # we need integer overflow in multiplication
- $_[0] *= FNV_PRIME;
- $_[0] &= 0xffffffff;
- $_[0] ^= $_[1];
- no integer;
- }
- # Calculates initial blocksize
- # Parameter: the length of the whole data
- sub _calc_initbs {
- my $length = shift;
- # MAX_LENGTH * bs < length
- # MAX_LENGTH * 3 * 2 * 2 * 2 * ... < length
- #my $n = int(log($length / (MAX_LENGTH * 3)) / log(2));
- #my $bs = 3 * 2**$n;
- my $bs = 3;
- $bs *= 2 while ( $bs * MAX_LENGTH < $length );
- return $bs > 3 ? $bs : 3;
- }
- # Calculates the ssdeep fuzzy hash of a string
- # Parameters: the string
- # Returns: the fuzzy hash in string or array
- sub ssdeep_hash {
- my $string = shift;
- return unless defined $string;
- my $bs = _calc_initbs( length $string );
- @DEBUG_LAST = ();
- my $hash1;
- my $hash2;
- while (1) {
- _reset_rollhash();
- my $fnv1 = FNV_INIT; # traditional hash blocksize
- my $fnv2 = FNV_INIT; # traditional hash 2*blocksize
- $hash1 = '';
- $hash2 = '';
- for my $i ( 0 .. length($string) - 1 ) {
- my $c = ord( substr( $string, $i, 1 ) );
- #printf("c: %u, H1=%x\tH2=%x\n", $c, $fnv1, $fnv2);
- my $h = _roll_hash($c);
- _update_fnv( $fnv1, $c ); # blocksize FNV hash
- _update_fnv( $fnv2, $c ); # 2* blocksize FNV hash
- if ( $h % $bs == ( $bs - 1 ) and length $hash1 < MAX_LENGTH - 1 ) {
- #printf "Hash $h Trigger 1 at $i\n";
- my $b64char = $b64[ $fnv1 & 63 ];
- $hash1 .= $b64char;
- push @DEBUG_LAST,
- [ 1, $i + 1, join( '|', @last7chars ), $fnv1, $b64char ];
- $fnv1 = FNV_INIT;
- }
- if ( $h % ( 2 * $bs ) == ( 2 * $bs - 1 )
- and length $hash2 < MAX_LENGTH / 2 - 1 )
- {
- #printf "Hash $h Trigger 2 at $i\n";
- my $b64char = $b64[ $fnv2 & 63 ];
- $hash2 .= $b64char;
- push @DEBUG_LAST,
- [ 2, $i + 1, join( '|', @last7chars ), $fnv2, $b64char ];
- $fnv2 = FNV_INIT;
- }
- }
- $hash1 .= $b64[ $fnv1 & 63 ];
- $hash2 .= $b64[ $fnv2 & 63 ];
- push @DEBUG_LAST,
- [
- 1, length($string),
- join( '|', @last7chars ), $fnv1,
- $b64[ $fnv1 & 63 ]
- ];
- push @DEBUG_LAST,
- [
- 2, length($string),
- join( '|', @last7chars ), $fnv2,
- $b64[ $fnv2 & 63 ]
- ];
- last if $bs <= 3 or length $hash1 >= MAX_LENGTH / 2;
- $bs = int( $bs / 2 ); # repeat with half blocksize if no enough triggers
- $bs > 3 or $bs = 3;
- }
- my @outarray = ( $bs, $hash1, $hash2 );
- return wantarray ? @outarray : join ':', @outarray;
- }
- # Convenient function. Slurps file. You should not use it for long files.
- # You should not use pure perl implementation for long files anyway.
- # Parameter: filename
- # Returns: ssdeep hash in string or array format
- sub ssdeep_hash_file {
- my $file = shift;
- # Slurp the file (we can also use File::Slurp
- local ($/);
- open( my $fh, '<', $file ) or return;
- my $string = <$fh>;
- close $fh;
- return ssdeep_hash($string);
- }
- # Determines the longest common substring
- sub _lcss {
- my $strings = join "\0", @_;
- my $lcs = '';
- for my $n ( 1 .. length $strings ) {
- my $re = "(.{$n})" . '.*\0.*\1' x ( @_ - 1 );
- last unless $strings =~ $re;
- $lcs = $1;
- }
- return $lcs;
- }
- # Calculates how similar two strings are using the Wagner-Fischer package.
- # Parameters: min_lcs, string A, string B
- # Returns: the likeliness being 0 totally dissimilar and 100 same string
- # Returns 0 also if the longest common substring is shorter than min_lcs
- sub _likeliness {
- my ( $min_lcs, $a, $b ) = @_;
- return 0 unless length( _lcss( $a, $b ) ) >= $min_lcs;
- my $dist = distance( $a, $b );
- #$DB::single = 2;
- # Must follow ssdeep original's code for compatibility
- # $dist = 100 * $dist / (length($a) + length($b));
- $dist = int( $dist * MAX_LENGTH / ( length($a) + length($b) ) );
- $dist = int( 100 * $dist / 64 );
- $dist > 100 and $dist = 100;
- return 100 - $dist;
- }
- # We accept hash in both array and scalar format
- # Parameters: $hashA, $hashB, [$min_lcs]
- # Parameters: \@hashA, \@hashB, [$min_lcs]
- # Returns: file matching in %
- sub ssdeep_compare {
- my @hashA; # hash = bs:hash1:hash2
- my @hashB; # hash = bs:hash1:hash2
- @hashA = ref( $_[0] ) eq 'ARRAY' ? @{ $_[0] } : split ':', $_[0];
- @hashB = ref( $_[1] ) eq 'ARRAY' ? @{ $_[1] } : split ':', $_[1];
- my $min_lcs = $_[2] || 7;
- if ( @hashA != 3 or $hashA[0] !~ /\d+/ ) {
- carp "Argument 1 is not a ssdeep hash.";
- return;
- }
- if ( @hashB != 3 or $hashB[0] !~ /\d+/ ) {
- carp "Argument 2 is not a ssdeep hash.";
- return;
- }
- # Remove sequences of more than three repeated character
- s/(.)\1{3,}/$1/gi for @hashA;
- s/(.)\1{3,}/$1/gi for @hashB;
- # Remove trailing newlines
- s/\s+$//gi for @hashA;
- s/\s+$//gi for @hashB;
- #$DB::single = 2;
- my $like;
- # Blocksize comparison
- # bsA:hash_bsA:hash_2*bsA
- # bsB:hash_bsB:hash_2*bsB
- if ( $hashA[0] == $hashB[0] ) {
- # Compare both hashes
- my $like1 = _likeliness( $min_lcs, $hashA[1], $hashB[1] );
- my $like2 = _likeliness( $min_lcs, $hashA[2], $hashB[2] );
- $like = max( $like1, $like2 );
- }
- elsif ( $hashA[0] == 2 * $hashB[0] ) {
- # Compare hash_bsA with hash_2*bsB
- $like = _likeliness( $min_lcs, $hashA[1], $hashB[2] );
- }
- elsif ( 2 * $hashA[0] == $hashB[0] ) {
- # Compare hash_2*bsA with hash_bsB
- $like = _likeliness( $min_lcs, $hashA[2], $hashB[1] );
- }
- else {
- # Nothing suitable to compare, sorry
- return 0;
- }
- return $like;
- }
- # Dump internals information. See help.
- sub ssdeep_dump_last {
- my @result;
- for (@DEBUG_LAST) {
- push @result, join ",", @{$_};
- }
- return @result;
- }
- 1; # Magic true value required at end of module
- __END__
- =head1 NAME
- Digest::ssdeep - Pure Perl ssdeep (CTPH) fuzzy hashing
- =head1 VERSION
- This document describes Digest::ssdeep version 0.9.0
- =head1 SYNOPSIS
- use Digest::ssdeep qw/ssdeep_hash ssdeep_hash_file/;
- $hash = ssdeep_hash( $string );
- # or in array context:
- @hash = ssdeep_hash( $string );
- $hash = ssdeep_hash_file( "data.txt" );
- @details = ssdeep_dump_last();
-
-
- use Digest::ssdeep qw/ssdeep_compare/;
- $match = ssdeep_compare( $hashA, $hashB );
- $match = ssdeep_compare( \@hashA, \@hashB );
-
- =head1 DESCRIPTION
- This module provides simple implementation of ssdeep fuzzy hashing also known as Context Triggered Piecewise
- Hashing (CTPH).
- =head2 Fuzzy hashing algorithm
- Please, refer to Jesse Kornblum's paper for a detailed discussion (L<SEE ALSO>).
- To calculate the CTPH we should choose a maximum signature length. Then divide
- the file in as many chunks as this length. Calculate a hash or checksum for
- each chunk and map it to a character. The fuzzy hashing is the concatenation of
- all the characters.
- We cannot use fixed length blocks to separate the file. Because if we add or
- remove a character all of the following blocks are also changed. So we must
- divide the file using the "context" i.e. a block starts and ends in one of the
- predefined sequence of characters. So the problem is 'Which contexts
- -sequences- we define to separate the file in N parts?.'
- This is the 'roll' of the I<rolling hash>. It is a function of the N last
- inputs, in this case the 7 last characters. The result of the rolling hash
- function is uniformly spread between all valid output values. This makes the
- rolling hash some kind of I<pseudo-random> function whose output depends only
- on the last N characters. Since the output is supposed to be uniform, we can
- modulus BS and the expected values are 0 to BS-1 with the same probability.
- Let the blocksize (BS) be the length of file divided by the maximum signature
- length (i.e. 64). If we split the file each time the rolling hash mod BS gives
- BS-1 we get 64 blocks. This is not a good approach because if the length
- changes, blocksize changes also. So we cannot compare files with dissimilar
- sizes. One good approach is to take some 'predefined' blocksizes and choose the
- one that fits based on the file size. The blocksizes in ssdeep are C<3, 6, 12,
- ..., 3 * 2^i>.
- So this is the algorithm:
- =over
- =item *
- Given the file size we calculate an initial blocksize (BS).
- =item *
- For each character we calculate the rolling hash R. Its output value depends
- only on the 7 last characters sequence.
- =item *
- Each time C<R mod BS = BS-1> (we meet one of the trigger 7 characters
- sequences) we write down the I<traditional hash> of the current block and start
- another block.
- =back
- The pitfall is Rolling Hash is statistically uniform, but it does not mean it
- will give us exactly 64 blocks.
- =over
- =item *
- Sometimes it will gives us more than 64 blocks. In that case we will
- concatenate the trailing blocks.
- =item *
- Sometimes it will gives us less than 64 blocks. No problem, 64 is the maximum
- length, it can be less.
- =item *
- Sometimes it will gives us less than 32 blocks. In that case, we should try a
- half-size blocksize to get more blocks.
- =back
- The I<traditional hash> is an usual hash or checksum function. We use 32 bit
- FNV-1a hash (L<SEE ALSO>). But its output is 32 bits, so we need to map it to a
- base-64 character alphabet. That is, we only use the 6 least significant bits
- of FNV-1a hash.
- =head2 Output
- The ssdeep hash has this shape: C<BS:hash1:hash2>
- =over
- =item B<BS>
- It is the blocksize. We can only compare hashes from the same blocksize.
- =item B<hash1>
- This is the concatenation of FNV-1a results (mapped to 64 characters) for each block in the file.
- =item B<hash2>
- This is the same that hash1 but using double the blocksize. We write this result
- because a small change can halve or double the blocksize. If this happens,
- we can compare at least one part of the two signatures.
- =back
- =head2 Comparison
- There are several algorithms to compare two strings. I have used the same that
- ssdeep uses for compatibility reasons. Only in certain cases, the result from
- this module is not the same as ssdeep compiled version. Please see
- L<DIFFERENCES> below for details.
- These are the steps for matching calculation:
- =over
- =item *
- The first step is to compare the block sizes. We only can compare hashes calculated
- for the same block size. In one ssdeep string we have both blocksize and double
- blocksize hashes. So we try to match at least of the hashes. If they have no
- common block sizes, the comparison returns 0.
- =item *
- Remove sequences of more than three equal characters. These same character
- sequences have little information about the file and bias the matching score.
- =item *
- Test for a coincidence of, at least 7 characters. This is the default, but this
- value can be changed. If the longest common substring is not a least this
- length, the function returns 0. We expect a lot of collisions since we are
- mapping 32 bit FNV values into 64 character output. This is a way to remove
- false positives.
- =item *
- We use the Wagner-Fischer algorithm to compute the Levenshtein distance using
- these weights:
- =over
- =item *
- Same character: 0
- =item *
- Adition or deletion: 1
- =item *
- Sustitution: 2
- =back
- =item *
- Following the original ssdeep algorithm we scale the value so the output be between 0
- and 100.
- =back
- =head1 INTERFACE
- This section describes the recommended interface for generating and comparing
- ssdeep fuzzy hashes.
- =over
- =item B<ssdeep_hash>
- Calculates the ssdeep hash of the input string.
- Usage:
- $hash = ssdeep_hash( $string );
- or in array context
- @hash = ssdeep_hash( $string );
- In scalar context it returns a
- hash with the format C<bs:hash1:hash2>. Being C<bs> the blocksize, C<hash1>
- the fuzzy hash for this blocksize and C<hash2> the hash for double blocksize.
- The maximum length of each hash is 64 characters.
- In array context it returns the same components above but in a 3 elements array.
- =item B<ssdeep_hash_file>
- Calculates the hash of a file.
- Usage:
- $hash = ssdeep_hash_file( "/tmp/malware1.exe" );
- This is a convenient function. Returns the same of ssdeep_file in scalar or
- array context.
- Since this function slurps the whole file into memory, you should not use it in
- big files. You should not use this module for big files, use libfuzzy wrapper
- instead (L<BUGS AND LIMITATIONS>).
- Returns B<undef> on errors.
- =item B<ssdeep_compare>
- Calculates the matching between two hashes.
- Usage. To compare two scalar hashes:
- $match = ssdeep_compare( $hashA, $hashB );
- To compare two hashes in array format:
- $match = ssdeep_compare( \@hashA, \@hashB );
- The default is to discard hashes with less than 7 characters common substring.
- To override this default and set this limit to any number you can use:
- $match = ssdeep_compare( $hashA, $hashB, 4 );
- The result is a matching score between 0 and 100. See L<Comparison> for
- algorithm details.
- =item B<ssdeep_dump_last>
- Returns an array with information of the last hash calculation. Useful for
- debugging or extended details.
- Usage after a calculation:
- $hash = ssdeep_hash_file( "/tmp/malware1.exe" );
- @details = ssdeep_dump_last();
- The output is an array of CSV values.
- ...
- 2,125870,187|245|110|27|190|66|97,1393131242,q
- 1,210575,13|216|13|115|29|52|208,4009217630,e
- 2,210575,13|216|13|115|29|52|208,4009217630,e
- 1,210730,61|231|220|179|40|89|210,1069791891,T
- 1,237707,45|66|251|98|56|138|91,4014305026,C
- ....
- Meaning of the output array:
- =over
- =item B<Field 1>
- Part of the hash which is affected. 1 for the fist part, 2 for the second part.
- =item B<Field 2>
- Offset of the file where the chunk ends.
- =item B<Field 3>
- Sequence of 7 characters that triggered the rolling hash.
- =item B<Field 4>
- Value of the rolling hash at this moment.
- =item B<Field 5>
- Character output to the fuzzy hash due to this rolling hash trigger.
- =back
- So we can read it this way:
- At byte 125870 of the input file, there is a sequence of these 7 characters:
- C<187 245 110 27 190 66 97>. That sequence triggered the second part of the
- hash. The FNV hash value of the current chunk is 1393131242 that maps to
- character C<q>.
- Or this way:
- From the 4th row I know the letter C<T> in the first hash comes from the
- chunk that started at 210575+1 (the one-starting row before) and ends at
- 210730. The whole FNV hash of this block was 1069791891.
- =back
- =head1 BUGS AND LIMITATIONS
- =over
- =item B<Small blocksize comparison>
- Original ssdeep limit the matching of small blocksize hashes. So when comparing
- them the matching is limited by its size and is never 100%. This algorithm do
- not behaviours that way. Small block sizes hashes are compared as big block
- sizes ones.
- =item B<Performance>
- This is a Pure Perl implementation. The performance is far from optimal. To
- calculate hashes more efficiently, please use compiled software like libfuzzy
- bindings (L<SEE ALSO>).
- =item B<Test 64 bits systems>
- This module has not been tested in 64 bit systems yet.
- =back
- Please report any bugs or feature requests to
- C<bug-digest-ssdeep@rt.cpan.org>, or through the web interface at
- L<http://rt.cpan.org>.
- =head1 SEE ALSO
- =over
- =item Ssdeep's home page
- L<http://ssdeep.sourceforge.net/>
- =item Jesse Kornblum's original paper I<Identifying almost identical files using context triggered piecewise hashing>
- L<http://dfrws.org/2006/proceedings/12-Kornblum.pdf>
- =item I<Data::FuzzyHash> Perl binding of binary libfuzzy libraries
- L<https://github.com/hideo55/Data-FuzzyHash>
- =item Text::WagnerFischer - An implementation of the Wagner-Fischer edit distance.
- L<http://search.cpan.org/perldoc?Text%3A%3AWagnerFischer>
- =item FNV hash's description
- L<http://www.isthe.com/chongo/tech/comp/fnv/>
- =back
- =head1 AUTHOR
- Reinoso Guzman C<< <reinoso.guzman@gmail.com> >>
- =head1 LICENCE AND COPYRIGHT
- Copyright (c) 2013, Reinoso Guzman C<< <reinoso.guzman@gmail.com> >>. All rights reserved.
- This module is free software; you can redistribute it and/or
- modify it under the same terms as Perl itself. See L<perlartistic>.
- =head1 DISCLAIMER OF WARRANTY
- BECAUSE THIS SOFTWARE IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
- FOR THE SOFTWARE, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
- OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
- PROVIDE THE SOFTWARE "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER
- EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE
- ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE SOFTWARE IS WITH
- YOU. SHOULD THE SOFTWARE PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL
- NECESSARY SERVICING, REPAIR, OR CORRECTION.
- IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
- WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
- REDISTRIBUTE THE SOFTWARE AS PERMITTED BY THE ABOVE LICENCE, BE
- LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL,
- OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE
- THE SOFTWARE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
- RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
- FAILURE OF THE SOFTWARE TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
- SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
- SUCH DAMAGES.
|