challenge-096/james-smith/perl/ch-2.pl


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153

#!/usr/local/bin/perl

use strict;

use warnings;
use feature qw(say);
use Test::More;

my ($calls,$misses,%cache,%cache_x) = (0,0);

my %examples = (
 'fred,bob' => 4,
 'fred,' => 4,
 ',fred' => 4,
 ',' => 0,
 'rating,ratting' => 1,
 'kitten,sitting' => 3,
 'sunday,monday' => 2,
 'boat riding,bat ridding' => 2,
);

foreach my $words (sort keys %examples ) {
  my($s1,$s2) = split m{,}, $words;
  $s1||='';
  $s2||='';
  print "\n";
  print render_alignment($s1,$s2);
  is( edit_distance($s1,$s2), $examples{$words} );
  is( edit_distance_simple($s1,$s2), $examples{$words} );
}

done_testing();

printf "Calls: %d, hits: %d (%0.2f%%), misses: %d (%0.2f%%)\n",
  $calls,
  $calls-$misses, ($calls-$misses)/$calls*100,
  $misses,        $misses/$calls*100;

## Notes:
## --------------------------------------------------------------------
##
## I'm just going to add "Another day job challenge!"
##
## To be able to make "nicer" output - rather than just keeping track
## of the edit distance of substrings - we will actually keep the
## alignment of the two words as a string of "operations" whether they
## be Indels or SNPs.
##
## {One of my background is working with genomic data and this can be
## thought of as a simple alignment algorithm - and so I think of the
## three operations as Indels {inserts/deletes - remembering an insert
## into one sequence is just the same as a delete from the other} and
## SNPs - or single nucleotide polymorphisms.
##
## The simple alignment string representation we will use consists of:
##   '|' - the letters are the same;
##   'v' - insert
##   '^' - delete
##   ' ' - SNP/modify
##
## We can convert this to an edit distance by counting all the non-"|"s
## In perl we do this with tr/^v /^v / which returns the number of
## matches in scalar form. See {_edit_dist - function}
##
## Finally we include a nice way to render the alignment {edits}
## By showing the two words with appropriate inserts in each word
## and indicate where the letters match in each word via a the
## alignment string in the middle
##
## Additional note - we "memoise" the alignment function - as it will
## be called with the same subseq of letters following different paths
## through the two sequences. This increases performance...
##
## From a "genomic" point of view this is known as the basis of the
## Smith-Waterman local alignment algorithm. Although Smith-Waterman
## has other features - including variable "penalties" for each type
## of edit {inserts, deletes, modifications}. Even having different
## penalties for certain changes {this is also similar to how typing
## correction software works - with assuming adjacent key typos are
## more likely.
##
## See:
##  * https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm

sub edit_distance {
  return _edit_dist( alignment_string( @_ ) );
}

sub alignment_string {
  my( $s, $t ) = @_;
  $calls++;
  my $key = "$s\t$t";
  return $cache{$key} if exists $cache{$key};
  $misses++;
  ## Both strings are empty so reached end!
  return $cache{$key}||=''              if $t eq q() && $s eq q();
  ## Exhausted t so all edits are now deletes...
  return $cache{$key}||='^' x length $s if $t eq q();
  ## Exhausted s so all edits are now inserts...
  return $cache{$key}||='v' x length $t if $s eq q();
  ## First letters are the same so we just prepend the
  ## match symbol (|) and continue...
  return $cache{$key}||='|'.alignment_string(substr($s,1),substr($t,1))
                                        if ord $s == ord $t;

  ## We now have three choices - "insert", "delete" or "SNP"
  my($d,$i,$m) = (
    '^'.alignment_string( substr($s,1), $t           ),
    'v'.alignment_string( $s,           substr($t,1) ),
    ' '.alignment_string( substr($s,1), substr($t,1) ),
  );
  return  $cache{$key}||=
        _edit_dist( $d ) < _edit_dist( $i )
    ? ( _edit_dist( $d ) < _edit_dist( $m ) ? $d : $m )
    : ( _edit_dist( $i ) < _edit_dist( $m ) ? $i : $m );
}

sub _edit_dist { ## Count inserts(v), deletes(^) & mis-matches( )
  return $_[0] =~ tr/^v /^v /;
}

sub render_alignment {
  my( $s, $t ) = @_;
  my $a = alignment_string( $s, $t );
  my( $top, $bot ) = ( '','' );
  foreach ( split m{}, $a ) {
    $top .= $_ eq 'v' ? '-' : substr $s, 0, 1, '';
    $bot .= $_ eq '^' ? '-' : substr $t, 0, 1, '';
  }
  return sprintf "%s\n%s (%d)\n%s\n",
    $top, $a, _edit_dist($a), $bot;
}

#### Optimal code - without the alignment string....
# Reduction to a single line
# We can use sort here as it is relatively fast anyay
# o/w could load in List::Util - but I think overall this will
# be quicker!

sub edit_distance_simple {
  my( $s, $t ) = @_;
  return $cache_x{"$s\t$t"}||=
     $t eq q()          ? length $s
   : $s eq q()          ? length $t
   : (ord $s == ord $t) ? edit_distance(substr($s,1),substr($t,1))
   :                      1+(sort { $a <=> $b }
                            edit_distance(substr($s,1),$t),
                            edit_distance($s,substr$t,1),
                            edit_distance(substr($s,1),substr $t,1)
                          )[0]
   ;
}