#!/usr/local/bin/perl use strict; use warnings; use feature qw(say); use Test::More; my ($calls,$misses,%cache,%cache_x) = (0,0); my %examples = ( 'fred,bob' => 4, 'fred,' => 4, ',fred' => 4, ',' => 0, 'rating,ratting' => 1, 'kitten,sitting' => 3, 'sunday,monday' => 2, 'boat riding,bat ridding' => 2, ); foreach my $words (sort keys %examples ) { my($s1,$s2) = split m{,}, $words; $s1||=''; $s2||=''; print "\n"; print render_alignment($s1,$s2); is( edit_distance($s1,$s2), $examples{$words} ); is( edit_distance_simple($s1,$s2), $examples{$words} ); } done_testing(); printf "Calls: %d, hits: %d (%0.2f%%), misses: %d (%0.2f%%)\n", $calls, $calls-$misses, ($calls-$misses)/$calls*100, $misses, $misses/$calls*100; ## Notes: ## -------------------------------------------------------------------- ## ## I'm just going to add "Another day job challenge!" ## ## To be able to make "nicer" output - rather than just keeping track ## of the edit distance of substrings - we will actually keep the ## alignment of the two words as a string of "operations" whether they ## be Indels or SNPs. ## ## {One of my background is working with genomic data and this can be ## thought of as a simple alignment algorithm - and so I think of the ## three operations as Indels {inserts/deletes - remembering an insert ## into one sequence is just the same as a delete from the other} and ## SNPs - or single nucleotide polymorphisms. ## ## The simple alignment string representation we will use consists of: ## '|' - the letters are the same; ## 'v' - insert ## '^' - delete ## ' ' - SNP/modify ## ## We can convert this to an edit distance by counting all the non-"|"s ## In perl we do this with tr/^v /^v / which returns the number of ## matches in scalar form. See {_edit_dist - function} ## ## Finally we include a nice way to render the alignment {edits} ## By showing the two words with appropriate inserts in each word ## and indicate where the letters match in each word via a the ## alignment string in the middle ## ## Additional note - we "memoise" the alignment function - as it will ## be called with the same subseq of letters following different paths ## through the two sequences. This increases performance... ## ## From a "genomic" point of view this is known as the basis of the ## Smith-Waterman local alignment algorithm. Although Smith-Waterman ## has other features - including variable "penalties" for each type ## of edit {inserts, deletes, modifications}. Even having different ## penalties for certain changes {this is also similar to how typing ## correction software works - with assuming adjacent key typos are ## more likely. ## ## See: ## * https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm sub edit_distance { return _edit_dist( alignment_string( @_ ) ); } sub alignment_string { my( $s, $t ) = @_; $calls++; my $key = "$s\t$t"; return $cache{$key} if exists $cache{$key}; $misses++; ## Both strings are empty so reached end! return $cache{$key}||='' if $t eq q() && $s eq q(); ## Exhausted t so all edits are now deletes... return $cache{$key}||='^' x length $s if $t eq q(); ## Exhausted s so all edits are now inserts... return $cache{$key}||='v' x length $t if $s eq q(); ## First letters are the same so we just prepend the ## match symbol (|) and continue... return $cache{$key}||='|'.alignment_string(substr($s,1),substr($t,1)) if ord $s == ord $t; ## We now have three choices - "insert", "delete" or "SNP" my($d,$i,$m) = ( '^'.alignment_string( substr($s,1), $t ), 'v'.alignment_string( $s, substr($t,1) ), ' '.alignment_string( substr($s,1), substr($t,1) ), ); return $cache{$key}||= _edit_dist( $d ) < _edit_dist( $i ) ? ( _edit_dist( $d ) < _edit_dist( $m ) ? $d : $m ) : ( _edit_dist( $i ) < _edit_dist( $m ) ? $i : $m ); } sub _edit_dist { ## Count inserts(v), deletes(^) & mis-matches( ) return $_[0] =~ tr/^v /^v /; } sub render_alignment { my( $s, $t ) = @_; my $a = alignment_string( $s, $t ); my( $top, $bot ) = ( '','' ); foreach ( split m{}, $a ) { $top .= $_ eq 'v' ? '-' : substr $s, 0, 1, ''; $bot .= $_ eq '^' ? '-' : substr $t, 0, 1, ''; } return sprintf "%s\n%s (%d)\n%s\n", $top, $a, _edit_dist($a), $bot; } #### Optimal code - without the alignment string.... # Reduction to a single line # We can use sort here as it is relatively fast anyay # o/w could load in List::Util - but I think overall this will # be quicker! sub edit_distance_simple { my( $s, $t ) = @_; return $cache_x{"$s\t$t"}||= $t eq q() ? length $s : $s eq q() ? length $t : (ord $s == ord $t) ? edit_distance(substr($s,1),substr($t,1)) : 1+(sort { $a <=> $b } edit_distance(substr($s,1),$t), edit_distance($s,substr$t,1), edit_distance(substr($s,1),substr $t,1) )[0] ; }