1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
|
#!/usr/bin/perl
use v5.16;
use utf8;
use charnames qw(:full :short);
use Unicode::Normalize;
use Digest;
use Test2::V0;
#use constant DIGEST => 'SHA-384';
use constant DIGEST => 'SHA-1';
# The task appears to be very easy, especially because the specification
# comes along with a detailed description of an optimal implementation.
# This invites to go the extra mile.
#
# - A unique "isomorphic canonicalization" is created. By taking a
# message digest of this canonicalization, isomorphic strings can be
# identified by identical digests.
#
# - Extended grapheme clusters (see
# https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries)
# are used to identify single "characters". By normalization (using
# canonical decomposition) different representations of the same
# symbol are transformed into identical grapheme clusters. See the
# last test case for an example.
sub uni_iso_digest {
# Normalize input (Normalization Form D - canonical decomposition)
local $_ = NFD shift;
my $digest = Digest->new(DIGEST);
my $n = 0;
my %chars;
my @canon;
# Assign an ascending number to every new character appearing in the
# string and append the current number's bits to the message.
# Use \X to grab for a grapheme cluster.
while (s/(\X)//) {
$chars{$1} = $n++ unless exists $chars{$1};
$digest->add_bits(pack('L', $chars{$1}), 32);
push @canon, $chars{$1};
}
# Return the canonicalization and the message digest in list
# context. Returns solely the digest in scalar context.
(\@canon, $digest->hexdigest);
}
is uni_iso_digest('abc'), uni_iso_digest('xyz'), 'Example 1';
is uni_iso_digest('abb'), uni_iso_digest('xyy'), 'Example 2';
isnt uni_iso_digest('sum'), uni_iso_digest('add'), 'Example 3';
is uni_iso_digest('αββγ'), uni_iso_digest(9001), 'multi-byte vs. numeric';
# Nine code points form four separate characters with two
# different characters having two differing representations.
my $str =
"\N{LATIN CAPITAL LETTER A WITH DIAERESIS}" .
"\N{LATIN CAPITAL LETTER A}" .
"\N{COMBINING DIAERESIS}" .
"\N{LATIN CAPITAL LETTER A}" .
"\N{COMBINING DIAERESIS}" .
"\N{COMBINING LONG STROKE OVERLAY}" .
"\N{LATIN CAPITAL LETTER A}" .
"\N{COMBINING LONG STROKE OVERLAY}" .
"\N{COMBINING DIAERESIS}";
is uni_iso_digest($str), uni_iso_digest('xxyy'),
'Take normalized grapheme clusters as the individual characters';
# Retrieve the canonicalization only:
my ($canon) = uni_iso_digest($str);
say "canonical: (@$canon)";
done_testing;
|