aboutsummaryrefslogtreecommitdiff
path: root/challenge-010/paulo-custodio/cpp/ch-2.cpp
blob: 6df9e65d81723233cf2aac3a713665e3ddfbb50d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
/*
Challenge 010

Challenge #2
Write a script to find Jaro-Winkler distance between two strings. For more
information check wikipedia page.
*/

#include <iostream>
#include <string>
#include <cmath>
using namespace std;

float jaro_similarity(const string& s1, const string& s2) {
    if (s1 == s2) return 1.0;    // equal strings

    int len1 = static_cast<int>(s1.size());
    int len2 = static_cast<int>(s2.size());

    // max distance between matching characters
    int max_dist = (max(len1, len2) / 2) - 1;

    // count number of unique matches - same letters less than max_dist appart
    int match = 0;
    bool found_s1[len1] = {0};
    bool found_s2[len2] = {0};

    for (int i = 0; i < len1; i++) {
        int min_j = max(0, i - max_dist);
        int max_j = min(len2-1, i + max_dist);
        for (int j = min_j; j <= max_j; j++) {
            if (s1[i] == s2[j] && !found_s2[j]) {   // same and not already matched
                found_s1[i] = true;                 // mark as found
                found_s2[j] = true;
                match++;
                break;
            }
        }
    }

    if (match == 0) return 0.0;                     // no matches

    // count transpositions : The first assigned character on one string is
    // compared to the first assigned character on the other string.If the
    // characters are not the same, half of a transposition has occurred.Then
    // the second assigned character on one string is compared to the second
    // assigned character on the other string, etc.The number of mismatched
    // characters is divided by two to yield the number of transpositions.
    float transp = 0.0;
    int pos_s2 = 0;
    for (int i = 0; i < len1; i++) {
        if (found_s1[i]) {      // there is a match in s1, find first match in s2
            while (!found_s2[pos_s2] && pos_s2 < len2)
                pos_s2++;
            if (s1[i] != s2[pos_s2++])
                transp++;
        }
    }
    transp /= 2;

    float js = ((float)match / (float)len1
        + (float)match / (float)len2
        + ((float)match - (float)transp) / (float)match) / 3.0f;
    return js;
}

float jaro_winkler_similarity(const string& s1, const string& s2) {
    int len1 = static_cast<int>(s1.size());
    int len2 = static_cast<int>(s2.size());

    // find longest common prefix l
    int l = 0;
    while (l < min(4, min(len1, len2)) && s1[l] == s2[l])
        l++;

    // constant p
    float p = 0.1f;

    float js = jaro_similarity(s1, s2);
    float jws = js + l * p*(1 - js);
    return jws;
}

float jaro_winkler_distance(const string& s1, const string& s2) {
    return 1.0f - jaro_winkler_similarity(s1, s2);
}


int main(int argc, char* argv[]) {
    if (argc != 3) return EXIT_FAILURE;
    float jwd = jaro_winkler_distance(argv[1], argv[2]);
    jwd = round(jwd*100.0f)/100.0f;     // show 2 decimals
    cout << jwd << endl;
}