diff options
| author | Ryan Thompson <i@ry.ca> | 2019-11-21 12:09:39 -0600 |
|---|---|---|
| committer | Ryan Thompson <i@ry.ca> | 2019-11-21 12:09:39 -0600 |
| commit | 716bdb5b516b533720233d010c705a7d19baccfa (patch) | |
| tree | 4078e148a7ac790d286603dfffb018f650517e61 /challenge-035/ryan-thompson | |
| parent | 0fc4db817a6b98a2b81637aad971c3394b046ad3 (diff) | |
| download | perlweeklychallenge-club-716bdb5b516b533720233d010c705a7d19baccfa.tar.gz perlweeklychallenge-club-716bdb5b516b533720233d010c705a7d19baccfa.tar.bz2 perlweeklychallenge-club-716bdb5b516b533720233d010c705a7d19baccfa.zip | |
Perl5 solutions and morse code Wikipedia scraper
Diffstat (limited to 'challenge-035/ryan-thompson')
| -rw-r--r-- | challenge-035/ryan-thompson/extras/.gitignore | 2 | ||||
| -rwxr-xr-x | challenge-035/ryan-thompson/extras/fetch-morse.pl | 172 | ||||
| -rwxr-xr-x | challenge-035/ryan-thompson/perl5/ch-1.pl | 40 | ||||
| -rwxr-xr-x | challenge-035/ryan-thompson/perl5/ch-2.pl | 50 |
4 files changed, 264 insertions, 0 deletions
diff --git a/challenge-035/ryan-thompson/extras/.gitignore b/challenge-035/ryan-thompson/extras/.gitignore new file mode 100644 index 0000000000..3b68c7e1fc --- /dev/null +++ b/challenge-035/ryan-thompson/extras/.gitignore @@ -0,0 +1,2 @@ +morse_code.pl +morse_code.html diff --git a/challenge-035/ryan-thompson/extras/fetch-morse.pl b/challenge-035/ryan-thompson/extras/fetch-morse.pl new file mode 100755 index 0000000000..04463d2a88 --- /dev/null +++ b/challenge-035/ryan-thompson/extras/fetch-morse.pl @@ -0,0 +1,172 @@ +#!/usr/bin/env perl + +# fetch-morse.pl - Fetch morse code table from Wikipedia and parse it. +# Programming > data entry. +# +# Ryan Thompson <rjt@cpan.org> + +use 5.016; # fc() +use warnings; +use autodie; +use strict; +use utf8; +use LWP::Simple; +use Getopt::Long; +use File::Slurper qw< read_text write_text >; +use Carp; + +no warnings 'uninitialized'; +binmode STDOUT, ':utf8'; + +my %o = get_options(); + +my $html = get_html(); +my $table = get_table($html); +my %morse = get_hash($table); + +write_text( $o{output}, perl_source(%morse), 'utf-8' ); + +run_tests() if $o{test}; + +exit; + +# +# Helpers -- Most of the logic is broken out into the subs below +# + +# Defaults + GetOptions to set up our configuration +sub get_options { + my %o = ( + url => 'https://en.wikipedia.org/wiki/Morse_code', + file => 'morse_code.html', + output => 'morse_code.pl', + ); + GetOptions(\%o, qw< url=s file=s output=s force local force test >); + croak "Specifying --local and --force together is ambiguous" + if $o{local} and $o{force}; + + %o; +} + +# This is the main parsing loop. Given the raw HTML text for the table +# we want, grab each Morse code character and its code, and stuff it into +# a hash for return. +sub get_hash { + + # Here's the regex we'll use to grab each result. We'll be interested in + # the named captures "ch" and "morse" when we build our hash of morse + # codes + my $re = qr{ + <td><b><a\shref="[^"]+"\s.*? + title=" (?<title> [^"]+ ) ">\s* + (?<ch> .+?) + \s*</a> + .+? + (?:\QShared by \E(?<alt> .+?)</td>)? # Unused + .+? + \Q<span style="font-size:x-large;"><b>\E + (?<morse> .+?) + \s*</b> + }sx; + + # Iterate over each match and add it to the hash + my %ch; + while ( $table =~ /$re/g ) { + my @ch = chars_of($+{ch}); + my $morse = morse_ASCII($+{morse}); + @ch{@ch} = ($morse) x @ch; + } + + %ch; +} + +# Get the <table> contents for the table we want +sub get_table { + $_[0] =~ m{ + \Q<span class="mw-headline" id="Letters,_numbers,_punctuation\E + .+? + \Q<table class="wikitable sortable">\E + (?<table>.+?) + </table>\s*</td></tr></tbody></table> + }sx or croak "Can't find span/table"; + + $+{table}; +} + +# Return Perl source for the given hash +sub perl_source { + my %ch = @_; + + my $perl = <<END_PERL +# THIS FILE IS AUTO-GENERATED BY extras/fetch-morse.pl + +use utf8; + +sub morse_hash { + ( +END_PERL +; + for ( sort { fc($a) cmp fc($b) || $a cmp $b } keys %ch ) { + my $key = $_; $key =~ s/'/\\'/; + $perl .= sprintf " %5s => '%s',\n", "'$key'", $ch{$_}; + } + $perl .= " );\n}\n\n1; # END OF $o{output}\n"; + + $perl +} + +# Run tests to make sure everything worked +sub run_tests { + use Test::More; + use lib qw< . >; + + eval { require $o{output} }; + croak "require failed: $@" if $@; + + my %morse = eval { morse_hash(); }; + croak "morse_hash() failed: $@" if $@; + + is $morse{"'"}, '100001', 'Apostrophe'; + is $morse{A}, '10', 'A'; + is $morse{A}, $morse{a}, 'A = a'; + is $morse{0}, '00000', 'Zero'; + + is $morse{ç}, '01011', 'C cedilla'; + is $morse{ç}, $morse{Ć}, 'Cedilla = Acute'; +} + +# Convert the utf-8 morse code to ASCII hyphen and period, and remove spaces +sub morse_ASCII { $_[0] =~ tr/−· /-./dr } + +# Parse the <a> text containing the character(s) a particular morse code +# applies to and return the character(s) in a list. The Wikipedia page +# uses the following formats, so we handle them all: +# 1. A -> ('A') +# 2. A, a -> ('A', 'a') +# 3. Comma [,] -> (',') +# 4. Ampersand [&] -> ('&') # Special case +sub chars_of { + local $_ = shift; + return '-' if /Hyphen/; + return '/' if /Slash/; + return $_ if /^\S+$/; + return split /\s*,\s*/ if /^\S+\s*,\s*\S+$/; + return '&' if /\[&\]$/; + return $1 if /\[([^]])\]$/; + croak "Unknown character format `$_'"; +} + +# Maybe fetch, and then read the HTML from the Wikipedia page and return it +sub get_html { + unlink $o{file} if $o{force}; + + croak "--local specified, but $o{file} does not exist" unless -f $o{file}; + + unless ( $o{local} ) { + my $URL = 'https://en.wikipedia.org/wiki/Morse_code'; + my $code = mirror( $o{url}, $o{file} ); + croak "Could not fetch $URL" if is_error( $code ); + } + + read_text( $o{file} ); +} diff --git a/challenge-035/ryan-thompson/perl5/ch-1.pl b/challenge-035/ryan-thompson/perl5/ch-1.pl new file mode 100755 index 0000000000..b8a9101c2a --- /dev/null +++ b/challenge-035/ryan-thompson/perl5/ch-1.pl @@ -0,0 +1,40 @@ +#!/usr/bin/env perl + +# ch-1.pl - Encode text into binary morse code +# +# Ryan Thompson <rjt@cpan.org> + +use 5.010; +use warnings; +use autodie; +use strict; +use utf8; +no warnings 'uninitialized'; +use Text::Trim; + +# Include the morse code table generated by ../extras/fetch-morse.pl +use lib qw< ../extras >; +eval { require 'morse_code.pl' }; +die "Run fetch-morse.pl in ../extras first" if $@; +my %morse = morse_hash(); + +# Text from stdin or filenames specified on the command line +while ( $_ = <<>> ) { + trim; + s/\s+/ /; # Consolidate internal whitespace (and tabs) to single space + say join '', map { morse_bin($_) } split ''; +} + +# Return morse binary for a character +# See challenge description for the specification of this, but basically +# dot: 1, dash: 111, 0 in between dots/dashes, and 000 between each char +# Undefined characters return an empty string +sub morse_bin { + my $ch = shift; + my %map = ( '.' => '1', '-' => '111' ); + return '0000' if $ch eq ' '; # Word separator, 7 0s but 3 from char sep + return '' if not exists $morse{$ch}; + my $morse = join '0', map { $map{$_} } split '', $morse{$ch}; + + $morse . '000'; # Char separator +} diff --git a/challenge-035/ryan-thompson/perl5/ch-2.pl b/challenge-035/ryan-thompson/perl5/ch-2.pl new file mode 100755 index 0000000000..e271a5aca2 --- /dev/null +++ b/challenge-035/ryan-thompson/perl5/ch-2.pl @@ -0,0 +1,50 @@ +#!/usr/bin/env perl + +# ch-2.pl - Read binary morse code +# +# Ryan Thompson <rjt@cpan.org> + +use 5.016; # fc() +use warnings; +use autodie; +use strict; +use utf8; +no warnings 'uninitialized'; + +binmode STDOUT, ':utf8'; + +# Include the morse code table generated by ../extras/fetch-morse.pl +use lib qw< ../extras >; +eval { require 'morse_code.pl' }; +die "Run fetch-morse.pl in ../extras first" if $@; +my %rmorse = reverse_morse( morse_hash() ); + + +chomp, say morse_bin_to_utf8($_) while $_ = <<>>; + + +# Take morse binary and turn it (back) into utf8 text +sub morse_bin_to_utf8 { + my $r = ''; + for (split /0{7}/, shift) { + $r .= join '', map { $rmorse{$_} } split /000/; + $r .= ' '; + } + + $r; +} + +# Reverse the morse hash. We could just use reverse(), but that would give +# inconsistent results for duplicate values. Plus we may as well convert +# the . and - to 1 and 0 at this stage. +sub reverse_morse { + my %morse = @_; + map { + my $morse = $morse{$_}; + $morse =~ s/\./10/g; + $morse =~ s/\-/1110/g; + $morse =~ s/0$//; # Remove trailing intra-char gap + + $morse => fc $_ + } keys %morse; +} |
