From fa2ae9a1c4984aba9c6c070ff3a1459f175d72d3 Mon Sep 17 00:00:00 2001 From: dcw Date: Sun, 2 Jun 2019 20:32:02 +0100 Subject: committed my belated challenge 010 solutions --- challenge-010/duncan-c-white/README | 98 +++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 challenge-010/duncan-c-white/README (limited to 'challenge-010/duncan-c-white/README') diff --git a/challenge-010/duncan-c-white/README b/challenge-010/duncan-c-white/README new file mode 100644 index 0000000000..e277d3b17b --- /dev/null +++ b/challenge-010/duncan-c-white/README @@ -0,0 +1,98 @@ +Challenge 1: "Write a script to encode/decode Roman numerals. For example, +given Roman numeral CCXLVI, it should return 246. Similarly, for decimal +number 39, it should return XXXIX." + +My notes: That's a nice problem, let's have a go. + + +Challenge 2: "Write a to find Jaro-Winkler distance between two strings." + +My notes: + +WTF is Jaro-Winkler, read wikipedia page, well Jaro-Winkler is a simple prefix +adjustment to the Jaro distance, but the wikipedia page explaining Jaro distances +is not very clear - it describes it in terms of matched characters and transposed +characters, but it's description of matching within a range, and of how to count +transposed characters is almost completely unclear. I couldn't write code based +on such a poor description! + +But googling further, Rosetta Stone had various implementations in various +languages (including C and Perl), which clarifies the terribly unclear wikipedia +entry. + +It appears that (confirmed by the code people have actually written) +matching should happen as follows, shown by the Rosetta Stone C implementation: + + // length of the strings + int str1_len = strlen(str1); + int str2_len = strlen(str2); + + // if both strings are empty return 1 + // if only one of the strings is empty return 0 + if (str1_len == 0) return str2_len == 0 ? 1.0 : 0.0; + + // max distance between two chars to be considered matching + int match_distance = (int) max(str1_len, str2_len)/2 - 1; + + // arrays of bools that signify if that char in the matching string has a match + int *str1_matches = calloc(str1_len, sizeof(int)); + int *str2_matches = calloc(str2_len, sizeof(int)); + + // number of matches and transpositions + double matches = 0.0; + + // find the matches + for (int i = 0; i < str1_len; i++) { + // start and end take into account the match distance + int start = max(0, i - match_distance); + int end = min(i + match_distance + 1, str2_len); + + for (int k = start; k < end; k++) { + // if str2 already has a match continue + if (str2_matches[k]) continue; + // if str1 and str2 are not + if (str1[i] != str2[k]) continue; + // otherwise assume there is a match + str1_matches[i] = TRUE; + str2_matches[k] = TRUE; + matches++; + break; + } + } + + // if there are no matches return 0 + if (matches == 0) { + free(str1_matches); + free(str2_matches); + return 0.0; + } + + // count transpositions + double transpositions = 0.0; + int k = 0; + for (int i = 0; i < str1_len; i++) { + // if there are no matches in str1 continue + if (!str1_matches[i]) continue; + // while there is no match in str2 increment k + while (!str2_matches[k]) k++; + // increment transpositions + if (str1[i] != str2[k]) transpositions++; + k++; + } + + // divide the number of transpositions by two as per the algorithm specs + transpositions /= 2.0; + + // free the allocated memory + free(str1_matches); + free(str2_matches); + + // return the Jaro distance + return ((matches / str1_len) + + (matches / str2_len) + + ((matches - transpositions) / matches)) / 3.0; + +Ok, I basically understand it now. Transposed characters are matching characters +that are diferent, eg TH vs HT + +So I'll have a go anyway. -- cgit From 908a9f65859d27eae0083521132dc5465c487b6c Mon Sep 17 00:00:00 2001 From: dcw Date: Sun, 9 Jun 2019 10:39:17 +0100 Subject: reformated README long-line paragraphs, and added a final paragraph saying I got stuck with git merge so didn't submit --- challenge-010/duncan-c-white/README | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'challenge-010/duncan-c-white/README') diff --git a/challenge-010/duncan-c-white/README b/challenge-010/duncan-c-white/README index 2654c03c6b..2384fb220f 100644 --- a/challenge-010/duncan-c-white/README +++ b/challenge-010/duncan-c-white/README @@ -9,16 +9,19 @@ Challenge 2: "Write a to find Jaro-Winkler distance between two strings." My notes: -WTF is Jaro-Winkler, read wikipedia page, well Jaro-Winkler is a simple prefix -adjustment to the Jaro distance, but the wikipedia page explaining Jaro distances -is not very clear - it describes it in terms of matched characters and transposed -characters, but it's description of matching within a range, and of how to count -transposed characters is almost completely unclear. I couldn't write code based -on such a poor description! +WTF is Jaro-Winkler, read wikipedia page, well Jaro-Winkler is a +simple prefix adjustment to the Jaro distance, but the wikipedia page +explaining Jaro distances is not very clear - it describes it in terms +of matched characters and transposed characters, but it's description +of matching within a range, and of how to count transposed characters +is almost completely unclear. I couldn't write code based on such a +poor description! -But googling further, Rosetta Stone had various implementations in various -languages (including C and Perl), which clarifies the terribly unclear wikipedia -entry. Ok, I basically understand it now. Transposed characters are matching -characters that are diferent, eg TH vs HT +But googling further, Rosetta Stone had various implementations in +various languages (including C and Perl), which clarifies the terribly +unclear wikipedia entry. Ok, I basically understand it now. Transposed +characters are matching characters that are diferent, eg TH vs HT -So I'll have a go anyway. +So I'll have a go anyway, although I ran out of time to do it in the +weekly challenge (and I screwed up the git side of things, so couldn't +submit it late either). -- cgit