challenge-092/jo-37/perl/ch-1.pl


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79

#!/usr/bin/perl

use v5.16;
use utf8;
use charnames qw(:full :short);
use Unicode::Normalize;
use Digest;
use Test2::V0;

#use constant DIGEST => 'SHA-384';
use constant DIGEST => 'SHA-1';

# The task appears to be very easy, especially because the specification
# comes along with a detailed description of an optimal implementation.
# This invites to go the extra mile.
# 
# - A unique "isomorphic canonicalization" is created.  By taking a
#   message digest of this canonicalization, isomorphic strings can be
#   identified by identical digests.
#
# - Extended grapheme clusters (see
#   https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries)
#   are used to identify single "characters".  By normalization (using
#   canonical decomposition) different representations of the same
#   symbol are transformed into identical grapheme clusters.  See the
#   last test case for an example.
sub uni_iso_digest {
	# Normalize input (Normalization Form D - canonical decomposition)
	local $_ = NFD shift;

	my $digest = Digest->new(DIGEST);
	my $n = 0;
	my %chars;
	my @canon;

	# Assign an ascending number to every new character appearing in the
	# string and append the current number's bits to the message.
	# Use \X to grab for a grapheme cluster.
	while (s/(\X)//) {
		$chars{$1} = $n++ unless exists $chars{$1};
		$digest->add_bits(pack('L', $chars{$1}), 32);
		push @canon, $chars{$1};
	}

	# Return the canonicalization and the message digest in list
	# context.  Returns solely the digest in scalar context.
	(\@canon, $digest->hexdigest);
}

is uni_iso_digest('abc'), uni_iso_digest('xyz'), 'Example 1';
is uni_iso_digest('abb'), uni_iso_digest('xyy'), 'Example 2';
isnt uni_iso_digest('sum'), uni_iso_digest('add'), 'Example 3';

is uni_iso_digest('αββγ'), uni_iso_digest(9001), 'multi-byte vs. numeric';

# Nine code points form four separate characters with two
# different characters having two differing representations.
my $str = 
	"\N{LATIN CAPITAL LETTER A WITH DIAERESIS}" .

	"\N{LATIN CAPITAL LETTER A}" .
	"\N{COMBINING DIAERESIS}" . 

	"\N{LATIN CAPITAL LETTER A}" .
	"\N{COMBINING DIAERESIS}" . 
	"\N{COMBINING LONG STROKE OVERLAY}" .

	"\N{LATIN CAPITAL LETTER A}" .
	"\N{COMBINING LONG STROKE OVERLAY}" .
	"\N{COMBINING DIAERESIS}";

is uni_iso_digest($str), uni_iso_digest('xxyy'),
	'Take normalized grapheme clusters as the individual characters';

# Retrieve the canonicalization only:
my ($canon) = uni_iso_digest($str);
say "canonical: (@$canon)";

done_testing;