blob: d0466443ced838fb0ff71b9b043de6d83d90a705 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
|
#!/usr/bin/env python
# -*- coding: utf-8 -*-
def jaro_winkler_distance(s1, s2, p=0.1):
# Calculate the Jaro distance
m = 0
s1_len = len(s1)
s2_len = len(s2)
match_distance = max(s1_len, s2_len) // 2 - 1
s1_matches = [False] * s1_len
s2_matches = [False] * s2_len
for i in range(s1_len):
start = max(0, i - match_distance)
end = min(i + match_distance + 1, s2_len)
for j in range(start, end):
if s2_matches[j]:
continue
if s1[i] != s2[j]:
continue
s1_matches[i] = True
s2_matches[j] = True
m += 1
break
if m == 0:
return 0.0
transpositions = 0
j = 0
for i in range(s1_len):
if not s1_matches[i]:
continue
while not s2_matches[j]:
j += 1
if s1[i] != s2[j]:
transpositions += 1
j += 1
jaro_distance = (m / s1_len + m / s2_len + (m - transpositions / 2) / m) / 3
# Calculate the Jaro-Winkler distance
l = 0
for i in range(min(s1_len, s2_len)):
if s1[i] != s2[i]:
break
l += 1
jaro_winkler_distance = jaro_distance + l * p * (1 - jaro_distance)
return jaro_winkler_distance
# Test the function
s1 = "MARTHA"
s2 = "MARHTA"
distance = jaro_winkler_distance(s1, s2)
print(f"Jaro-Winkler distance between '{s1}' and '{s2}' is {distance}")
|