aboutsummaryrefslogtreecommitdiff
path: root/challenge-035/ryan-thompson
diff options
context:
space:
mode:
authorRyan Thompson <i@ry.ca>2019-11-21 12:09:39 -0600
committerRyan Thompson <i@ry.ca>2019-11-21 12:09:39 -0600
commit716bdb5b516b533720233d010c705a7d19baccfa (patch)
tree4078e148a7ac790d286603dfffb018f650517e61 /challenge-035/ryan-thompson
parent0fc4db817a6b98a2b81637aad971c3394b046ad3 (diff)
downloadperlweeklychallenge-club-716bdb5b516b533720233d010c705a7d19baccfa.tar.gz
perlweeklychallenge-club-716bdb5b516b533720233d010c705a7d19baccfa.tar.bz2
perlweeklychallenge-club-716bdb5b516b533720233d010c705a7d19baccfa.zip
Perl5 solutions and morse code Wikipedia scraper
Diffstat (limited to 'challenge-035/ryan-thompson')
-rw-r--r--challenge-035/ryan-thompson/extras/.gitignore2
-rwxr-xr-xchallenge-035/ryan-thompson/extras/fetch-morse.pl172
-rwxr-xr-xchallenge-035/ryan-thompson/perl5/ch-1.pl40
-rwxr-xr-xchallenge-035/ryan-thompson/perl5/ch-2.pl50
4 files changed, 264 insertions, 0 deletions
diff --git a/challenge-035/ryan-thompson/extras/.gitignore b/challenge-035/ryan-thompson/extras/.gitignore
new file mode 100644
index 0000000000..3b68c7e1fc
--- /dev/null
+++ b/challenge-035/ryan-thompson/extras/.gitignore
@@ -0,0 +1,2 @@
+morse_code.pl
+morse_code.html
diff --git a/challenge-035/ryan-thompson/extras/fetch-morse.pl b/challenge-035/ryan-thompson/extras/fetch-morse.pl
new file mode 100755
index 0000000000..04463d2a88
--- /dev/null
+++ b/challenge-035/ryan-thompson/extras/fetch-morse.pl
@@ -0,0 +1,172 @@
+#!/usr/bin/env perl
+
+# fetch-morse.pl - Fetch morse code table from Wikipedia and parse it.
+# Programming > data entry.
+#
+# Ryan Thompson <rjt@cpan.org>
+
+use 5.016; # fc()
+use warnings;
+use autodie;
+use strict;
+use utf8;
+use LWP::Simple;
+use Getopt::Long;
+use File::Slurper qw< read_text write_text >;
+use Carp;
+
+no warnings 'uninitialized';
+binmode STDOUT, ':utf8';
+
+my %o = get_options();
+
+my $html = get_html();
+my $table = get_table($html);
+my %morse = get_hash($table);
+
+write_text( $o{output}, perl_source(%morse), 'utf-8' );
+
+run_tests() if $o{test};
+
+exit;
+
+#
+# Helpers -- Most of the logic is broken out into the subs below
+#
+
+# Defaults + GetOptions to set up our configuration
+sub get_options {
+ my %o = (
+ url => 'https://en.wikipedia.org/wiki/Morse_code',
+ file => 'morse_code.html',
+ output => 'morse_code.pl',
+ );
+ GetOptions(\%o, qw< url=s file=s output=s force local force test >);
+ croak "Specifying --local and --force together is ambiguous"
+ if $o{local} and $o{force};
+
+ %o;
+}
+
+# This is the main parsing loop. Given the raw HTML text for the table
+# we want, grab each Morse code character and its code, and stuff it into
+# a hash for return.
+sub get_hash {
+
+ # Here's the regex we'll use to grab each result. We'll be interested in
+ # the named captures "ch" and "morse" when we build our hash of morse
+ # codes
+ my $re = qr{
+ <td><b><a\shref="[^"]+"\s.*?
+ title=" (?<title> [^"]+ ) ">\s*
+ (?<ch> .+?)
+ \s*</a>
+ .+?
+ (?:\QShared by \E(?<alt> .+?)</td>)? # Unused
+ .+?
+ \Q<span style="font-size:x-large;"><b>\E
+ (?<morse> .+?)
+ \s*</b>
+ }sx;
+
+ # Iterate over each match and add it to the hash
+ my %ch;
+ while ( $table =~ /$re/g ) {
+ my @ch = chars_of($+{ch});
+ my $morse = morse_ASCII($+{morse});
+ @ch{@ch} = ($morse) x @ch;
+ }
+
+ %ch;
+}
+
+# Get the <table> contents for the table we want
+sub get_table {
+ $_[0] =~ m{
+ \Q<span class="mw-headline" id="Letters,_numbers,_punctuation\E
+ .+?
+ \Q<table class="wikitable sortable">\E
+ (?<table>.+?)
+ </table>\s*</td></tr></tbody></table>
+ }sx or croak "Can't find span/table";
+
+ $+{table};
+}
+
+# Return Perl source for the given hash
+sub perl_source {
+ my %ch = @_;
+
+ my $perl = <<END_PERL
+# THIS FILE IS AUTO-GENERATED BY extras/fetch-morse.pl
+
+use utf8;
+
+sub morse_hash {
+ (
+END_PERL
+;
+ for ( sort { fc($a) cmp fc($b) || $a cmp $b } keys %ch ) {
+ my $key = $_; $key =~ s/'/\\'/;
+ $perl .= sprintf " %5s => '%s',\n", "'$key'", $ch{$_};
+ }
+ $perl .= " );\n}\n\n1; # END OF $o{output}\n";
+
+ $perl
+}
+
+# Run tests to make sure everything worked
+sub run_tests {
+ use Test::More;
+ use lib qw< . >;
+
+ eval { require $o{output} };
+ croak "require failed: $@" if $@;
+
+ my %morse = eval { morse_hash(); };
+ croak "morse_hash() failed: $@" if $@;
+
+ is $morse{"'"}, '100001', 'Apostrophe';
+ is $morse{A}, '10', 'A';
+ is $morse{A}, $morse{a}, 'A = a';
+ is $morse{0}, '00000', 'Zero';
+
+ is $morse{ç}, '01011', 'C cedilla';
+ is $morse{ç}, $morse{Ć}, 'Cedilla = Acute';
+}
+
+# Convert the utf-8 morse code to ASCII hyphen and period, and remove spaces
+sub morse_ASCII { $_[0] =~ tr/−·  /-./dr }
+
+# Parse the <a> text containing the character(s) a particular morse code
+# applies to and return the character(s) in a list. The Wikipedia page
+# uses the following formats, so we handle them all:
+# 1. A -> ('A')
+# 2. A, a -> ('A', 'a')
+# 3. Comma [,] -> (',')
+# 4. Ampersand [&amp;] -> ('&') # Special case
+sub chars_of {
+ local $_ = shift;
+ return '-' if /Hyphen/;
+ return '/' if /Slash/;
+ return $_ if /^\S+$/;
+ return split /\s*,\s*/ if /^\S+\s*,\s*\S+$/;
+ return '&' if /\[&amp;\]$/;
+ return $1 if /\[([^]])\]$/;
+ croak "Unknown character format `$_'";
+}
+
+# Maybe fetch, and then read the HTML from the Wikipedia page and return it
+sub get_html {
+ unlink $o{file} if $o{force};
+
+ croak "--local specified, but $o{file} does not exist" unless -f $o{file};
+
+ unless ( $o{local} ) {
+ my $URL = 'https://en.wikipedia.org/wiki/Morse_code';
+ my $code = mirror( $o{url}, $o{file} );
+ croak "Could not fetch $URL" if is_error( $code );
+ }
+
+ read_text( $o{file} );
+}
diff --git a/challenge-035/ryan-thompson/perl5/ch-1.pl b/challenge-035/ryan-thompson/perl5/ch-1.pl
new file mode 100755
index 0000000000..b8a9101c2a
--- /dev/null
+++ b/challenge-035/ryan-thompson/perl5/ch-1.pl
@@ -0,0 +1,40 @@
+#!/usr/bin/env perl
+
+# ch-1.pl - Encode text into binary morse code
+#
+# Ryan Thompson <rjt@cpan.org>
+
+use 5.010;
+use warnings;
+use autodie;
+use strict;
+use utf8;
+no warnings 'uninitialized';
+use Text::Trim;
+
+# Include the morse code table generated by ../extras/fetch-morse.pl
+use lib qw< ../extras >;
+eval { require 'morse_code.pl' };
+die "Run fetch-morse.pl in ../extras first" if $@;
+my %morse = morse_hash();
+
+# Text from stdin or filenames specified on the command line
+while ( $_ = <<>> ) {
+ trim;
+ s/\s+/ /; # Consolidate internal whitespace (and tabs) to single space
+ say join '', map { morse_bin($_) } split '';
+}
+
+# Return morse binary for a character
+# See challenge description for the specification of this, but basically
+# dot: 1, dash: 111, 0 in between dots/dashes, and 000 between each char
+# Undefined characters return an empty string
+sub morse_bin {
+ my $ch = shift;
+ my %map = ( '.' => '1', '-' => '111' );
+ return '0000' if $ch eq ' '; # Word separator, 7 0s but 3 from char sep
+ return '' if not exists $morse{$ch};
+ my $morse = join '0', map { $map{$_} } split '', $morse{$ch};
+
+ $morse . '000'; # Char separator
+}
diff --git a/challenge-035/ryan-thompson/perl5/ch-2.pl b/challenge-035/ryan-thompson/perl5/ch-2.pl
new file mode 100755
index 0000000000..e271a5aca2
--- /dev/null
+++ b/challenge-035/ryan-thompson/perl5/ch-2.pl
@@ -0,0 +1,50 @@
+#!/usr/bin/env perl
+
+# ch-2.pl - Read binary morse code
+#
+# Ryan Thompson <rjt@cpan.org>
+
+use 5.016; # fc()
+use warnings;
+use autodie;
+use strict;
+use utf8;
+no warnings 'uninitialized';
+
+binmode STDOUT, ':utf8';
+
+# Include the morse code table generated by ../extras/fetch-morse.pl
+use lib qw< ../extras >;
+eval { require 'morse_code.pl' };
+die "Run fetch-morse.pl in ../extras first" if $@;
+my %rmorse = reverse_morse( morse_hash() );
+
+
+chomp, say morse_bin_to_utf8($_) while $_ = <<>>;
+
+
+# Take morse binary and turn it (back) into utf8 text
+sub morse_bin_to_utf8 {
+ my $r = '';
+ for (split /0{7}/, shift) {
+ $r .= join '', map { $rmorse{$_} } split /000/;
+ $r .= ' ';
+ }
+
+ $r;
+}
+
+# Reverse the morse hash. We could just use reverse(), but that would give
+# inconsistent results for duplicate values. Plus we may as well convert
+# the . and - to 1 and 0 at this stage.
+sub reverse_morse {
+ my %morse = @_;
+ map {
+ my $morse = $morse{$_};
+ $morse =~ s/\./10/g;
+ $morse =~ s/\-/1110/g;
+ $morse =~ s/0$//; # Remove trailing intra-char gap
+
+ $morse => fc $_
+ } keys %morse;
+}