1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
|
#!/usr/bin/perl
# https://perlweeklychallenge.org/blog/perl-weekly-challenge-010/
# Challenge #2
# Write a script to find Jaro-Winkler distance between two strings.
# For more information check wikipedia page.
# https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
use strict;
use warnings;
die "Usage: $0 <string> <string>" if (scalar @ARGV != 2);
#Using lower case for checking similarity
my $dw = 1 - simw(map {lc} @ARGV);;
print 'Jaro-Winkler distance: '.sprintf("%.3f",$dw).$/;
exit 0;
#Jaro-Winkler similarity
sub simw {
my ($s1,$s2,$p) = @_;
#Scaling factor
$p = 0.1 unless $p;
my $simj = simj($s1,$s2);
my $prefix = _common_prefix($s1,$s2);
my $simw = $simj + $prefix * $p * (1- $simj);
return $simw;
}
#Common prefix
sub _common_prefix {
my ($s1,$s2) = @_;
my @chr1 = split //, $s1;
my @chr2 = split //, $s2;
my $prefix = 0;
do {
last unless ( defined $chr2[$_] && $chr1[$_] eq $chr2[$_] );
$prefix++;
} for 0..3; #prefix up to 4
return $prefix;
}
#Jaro similarity
sub simj {
my ($s1,$s2) = @_;
my $l1 = length($s1);
my $l2 = length($s2);
#Matching distance
my $max_l = $l1 > $l2 ? $l1 : $l2;
my $match_dist = int($max_l / 2 - 1);
my @chr1 = split //, $s1;
my @chr2 = split //, $s2;
my @matches;
my %matches_position;
for (my $i = 0; $i < $l1; $i++) {
my $init = $i - $match_dist;
$init = 0 if $init < 0;
my $end = $i + $match_dist + 1;
$end = $l2 if $end > $l2;
for (my $j = $init; $j < $end; $j++) {
if ($chr1[$i] eq $chr2[$j]){
push @matches, $chr1[$i];
$matches_position{$j} = $i; #required to detect transpositions
$chr2[$j]='-'; # avoid matching with the same character
last;
}
}
}
my $m = @matches;
#Jaro similarity is 0 if there are no matches
return 0 if ($m == 0);
my $transpositions = 0;
for (my ($i,$j) = (0,0); $j < $m; $i++) {
if (exists $matches_position{$i}){
$transpositions++ if( substr( $s2, $i, 1) ne $matches[$j]);
$j++;
}
}
my $t = $transpositions / 2;
my $simj = 1/3 * ($m/$l1 + $m/$l2 + ($m - $t)/$m);
return $simj;
}
|