diff options
| author | Jörg Sommrey <28217714+jo-37@users.noreply.github.com> | 2021-01-18 21:41:41 +0100 |
|---|---|---|
| committer | Jörg Sommrey <28217714+jo-37@users.noreply.github.com> | 2021-01-22 18:15:31 +0100 |
| commit | 50e899ade1708c952f5c2d1ba21a7ca129503923 (patch) | |
| tree | 0b6d8f9e2f4a4de630ed06a26819915dd26cd978 | |
| parent | 7b1522c52a9f0be331dba4fcf0c2916b08b374f7 (diff) | |
| download | perlweeklychallenge-club-50e899ade1708c952f5c2d1ba21a7ca129503923.tar.gz perlweeklychallenge-club-50e899ade1708c952f5c2d1ba21a7ca129503923.tar.bz2 perlweeklychallenge-club-50e899ade1708c952f5c2d1ba21a7ca129503923.zip | |
Solution to task 2
| -rwxr-xr-x | challenge-096/jo-37/perl/ch-2.pl | 188 |
1 files changed, 188 insertions, 0 deletions
diff --git a/challenge-096/jo-37/perl/ch-2.pl b/challenge-096/jo-37/perl/ch-2.pl new file mode 100755 index 0000000000..72f1a6afb9 --- /dev/null +++ b/challenge-096/jo-37/perl/ch-2.pl @@ -0,0 +1,188 @@ +#!/usr/bin/perl -s + +use v5.16; +use Test2::V0 '!float'; +use PDL; +use experimental 'signatures'; + +our ($tests, $examples, $verbose); + +run_tests() if $tests || $examples; # does not return + +say(<<EOS), exit unless @ARGV == 2; +usage: $0 [-tests] [-examples] [-verbose] [--] [source target] + +-tests + run some tests + +-examples + run the examples from the challenge + +-verbose + display edit operations and internal distance matrix + +source target + calculate levenshtein distance between source and target words + +EOS + + +### Input and Output + +say levenshtein_distance($ARGV[0], $ARGV[1]); + + +### Implementation + + +sub cost; +sub min_ind; + +# Calculate the Levenshtein distance between two words, i.e. the minimum +# number of insert, delete or replace actions to transform the first +# word into the second. PDL implementation of the Wagner-Fischer +# algorithm, see https://en.wikipedia.org/wiki/Wagner-Fischer_algorithm +sub levenshtein_distance ($source, $target) { + + # Split words into arrays and (un)shift by one to match matrix + # dimensions. + my @source = (undef, split //, $source); + my @target = (undef, split //, $target); + + # Prepare distance matrix holding BAD values only. + my $dist = zeros(long, @source + 0, @target + 0)->setvaltobad(0); + + # Fill first row and column with ascending sequences as the costs of + # removing or inserting a prefix of that length. + $dist->slice(',(0)') .= sequence $dist->dim(0); + $dist->slice('(0),') .= sequence $dist->dim(1); + + # To fill the remaining fields, loop over the indices of all BAD + # values. With the index order provided by "whichND" the required + # "flood filling" is achieved. + for my $idx ($dist->isbad->whichND->dog) { + + # Character pair for index position. + my $cs = $source[$idx->at(0)]; + my $ct = $target[$idx->at(1)]; + + # 2x2 neighborhood matrix preceeding index. + my $nb = $dist->range($idx - 1, 2); + + # 2x2 transition cost matrix. + my $trans = cost $cs ne $ct; + + # The cost to reach the new field is the minimum of the sums of + # the previous cost and the transition cost. + $dist->indexND($idx) .= min $nb + $trans; + } + + explain_edit(\@source, \@target, $dist) if $verbose; + + # The requested Levenshtein distance is the bottom right matrix + # element. + $dist->indexND($dist->shape - 1); +} + +# Backtrack the steps that led to the lower right corner of the distance +# matrix and describe the corresponding edit operations. +sub explain_edit ($source, $target, $dist) { + + # Collect edit operations. + my @edit; + + # Start at bottom right corner. + my $idx = $dist->shape - 1; + + # Stop at upper left corner. + while (any $idx) { + + # Character pair for index position. + my $cs = $source->[$idx->at(0)]; + my $ct = $target->[$idx->at(1)]; + + # 2x2 neighborhood matrix preceeding index, truncated to BAD + # values across the matrix' borders. + my $nb = $dist->range($idx - 1, 2, 'truncate'); + + # The field preceeding the current field holds the minimum + # within the neighborhood. Find the minimum and the + # corresponding index. On a non-unique minimum, this will + # prefer the diagonal "replace" move. + my ($min, $pred) = min_ind $nb; + + # Record the edit operation. + unshift @edit, + any($pred) ? + $pred->at(0) ? + "ins($ct)" : + "del($cs)" : + $dist->indexND($idx) == $min ? + "keep($cs)" : + "repl($cs,$ct)"; + + # Move to predecessor field. + $idx = $idx - 1 + $pred; + } + + { + no warnings 'uninitialized'; + say @$source, ' -> ', @$target, ':'; + } + say "@edit"; + say "Levenshtein distance = ", $dist->indexND($dist->shape - 1); + say "Wagner-Fischer distance matrix:", $dist; +} + +# Cost matrix for a single insert, delete and keep/replace. +sub cost ($repl) { + long([0, 1], [1, 0])->setvaltobad(0)->set(0, 0, $repl); +} + +# Find the minimum in an N-dim piddle together with the corresponding +# index. Though this looks complicated, it is very efficient. +sub min_ind ($min) { + my (@index, $index); + + # Reduce dimension by projection via minimum down to zero (getting + # to the global minimum as a scalar) and record the index piddle for + # each dimension. + ($min, undef, $index[@index]) = $min->minmaximum while $min->dims; + + # Expand the target index bottom-up from the record by successively + # applying the so far built index to the next higher dimension index. + $index = $_->indexND($index)->glue(0, $index) for reverse @index; + + ($min, $index); +} + + +### Examples and tests + +sub run_tests { + SKIP: { + skip "examples" unless $examples; + is levenshtein_distance('kitten', 'sitting'), 3, 'example 1'; + is levenshtein_distance('sunday', 'monday'), 2, 'example 2'; + is levenshtein_distance('Saturday', 'Sunday'), 3, + 'example from wikipedia'; + is levenshtein_distance('parachute', 'headache'), 5, + 'another example'; + } + + SKIP: { + skip "tests" unless $tests; + is levenshtein_distance('abcdef', 'uvwxyz'), 6, 'replace all'; + is levenshtein_distance('def', 'abcdef'), 3, 'insert prefix'; + is levenshtein_distance('abcdef', 'def'), 3, 'delete prefix'; + is levenshtein_distance('abcdef', 'uvwdef'), 3, 'replace prefix'; + is levenshtein_distance('abc', 'abcdef'), 3, 'insert suffix'; + is levenshtein_distance('abcdef', 'abc'), 3, 'delete suffix'; + is levenshtein_distance('abcdef', 'abcxyz'), 3, 'replace suffix'; + is levenshtein_distance('abef', 'abcdef'), 2, 'insert infix'; + is levenshtein_distance('abcdef', 'abef'), 2, 'delete infix'; + is levenshtein_distance('abcdef', 'abwxef'), 2, 'replace infix'; + } + done_testing; + exit; +} |
