From 71e4cc2d14f0ff3c89d85d5da6b2c4303abcbbc2 Mon Sep 17 00:00:00 2001
From: dcw <d.white@imperial.ac.uk>
Date: Sun, 18 Aug 2019 20:47:42 +0100
Subject: added my solutions to challenge 21

---
 challenge-021/duncan-c-white/README        |  34 +++----
 challenge-021/duncan-c-white/perl5/ch-1.pl |  26 +++++
 challenge-021/duncan-c-white/perl5/ch-2.pl | 146 +++++++++++++++++++++++++++++
 challenge-021/duncan-c-white/perl5/parse   |  73 +++++++++++++++
 4 files changed, 259 insertions(+), 20 deletions(-)
 create mode 100755 challenge-021/duncan-c-white/perl5/ch-1.pl
 create mode 100755 challenge-021/duncan-c-white/perl5/ch-2.pl
 create mode 100755 challenge-021/duncan-c-white/perl5/parse

diff --git a/challenge-021/duncan-c-white/README b/challenge-021/duncan-c-white/README
index bf5b030e87..4d99aac50e 100644
--- a/challenge-021/duncan-c-white/README
+++ b/challenge-021/duncan-c-white/README
@@ -1,25 +1,19 @@
-Challenge 1: "Write a script to accept a string from command line and
-split it on change of character. For example, if the string is "ABBCDEEF",
-then it should split like 'A', 'BB', 'C', 'D', 'EE', 'F'."
+Challenge 1: "Write a script to calculate the value of e, also known
+ as Euler's number and Napier's constant."
 
-My notes: Clearly defined, sounds like a job for regexes.
+My notes: The associated wiki page reminds us that the easiest way of
+  calculating e (the base of natural logarithms) is via:
+  e = 2 + 1/2! + 1/3! + 1/4! + ... 1/n!  Let's use that.
 
 
-Challenge 2: "Write a script to print the smallest pair of Amicable Numbers."
+Challenge 2: "Write a script for URL normalization based on rfc3986. This
+task was shared by Anonymous Contributor.
 
-Amicable numbers are two different numbers so related that the sum of the
-proper divisors of each is equal to the other number. (A proper divisor
-of a number is a positive factor of that number other than the number
-itself. For example, the proper divisors of 6 are 1, 2, and 3.)
+According to Wikipedia, URL normalization is the process by which URLs
+are modified and standardized in a consistent manner. The goal of the
+normalization process is to transform a URL into a normalized URL so
+it is possible to determine if two syntactically different URLs may
+be equivalent."
 
-The smallest pair of amicable numbers is (220, 284). They are amicable
-because the proper divisors of 220 are 1, 2, 4, 5, 10, 11, 20, 22, 44,
-55 and 110, of which the sum is 284; and the proper divisors of 284 are 1,
-2, 4, 71 and 142, of which the sum is 220.
-
-The first ten amicable pairs are: (220, 284), (1184, 1210), (2620,
-2924), (5020, 5564), (6232, 6368), (10744, 10856), (12285, 14595),
-(17296, 18416), (63020, 76084), and (66928, 66992)
-
-My notes: Another clearly described problem.  Obvious method involves
-a bit of caching.
+My notes: The RFC link points to a long list of regex-style changes,
+not all of which have to be implemented.  Easy to implement most.
diff --git a/challenge-021/duncan-c-white/perl5/ch-1.pl b/challenge-021/duncan-c-white/perl5/ch-1.pl
new file mode 100755
index 0000000000..ee09dc106c
--- /dev/null
+++ b/challenge-021/duncan-c-white/perl5/ch-1.pl
@@ -0,0 +1,26 @@
+#!/usr/bin/perl
+#
+# Challenge 1: "Write a script to calculate the value of e, also known
+#  as Euler's number and Napier's constant."
+# 
+# My notes: The associated wiki page reminds us that the easiest way of
+#   calculating e (the base of natural logarithms) is via:
+#   e = 1 + 1/1! + 1/2! + 1/3! + 1/4! + ... 1/n!  Let's use that.
+# 
+
+use strict;
+use warnings;
+#use Function::Parameters;
+#use Data::Dumper;
+
+die "Usage: ch-1.pl [NUMTERMS]\n" if @ARGV>1;
+my $nterms = shift // 30;
+die "ch-1.pl: NUMTERMS must be a positive integer\n" unless $nterms =~ /\A\d+\z/ && $nterms > 0;
+# sum 1/0! + 1/1! + ... + 1/(NUMTERMS-1)!; on entry to each pass $nfact is ($n-1)!
+my( $e, $nfact ) = ( 0, 1 );
+foreach my $n (1..$nterms)
+{
+	$e += 1/$nfact;
+	$nfact *= $n;
+}
+print "e=$e\n";
diff --git a/challenge-021/duncan-c-white/perl5/ch-2.pl b/challenge-021/duncan-c-white/perl5/ch-2.pl
new file mode 100755
index 0000000000..fc978b3a81
--- /dev/null
+++ b/challenge-021/duncan-c-white/perl5/ch-2.pl
@@ -0,0 +1,146 @@
+#!/usr/bin/perl
+#
+# Challenge 2: "Write a script for URL normalization based on rfc3986.
+#
+# According to Wikipedia, URL normalization is the process by which URLs
+# are modified and standardized in a consistent manner. The goal of the
+# normalization process is to transform a URL into a normalized URL so
+# it is possible to determine if two syntactically different URLs may
+# be equivalent."
+# 
+# My notes: The RFC link points to a long list of regex-style changes,
+# not all of which have to be implemented.  Easy to implement most.
+# Later thought: many of the changes only apply to parts of the URL,
+# so I'll reuse part of my solution to challenge 017, part 2.. url splitting
+#
+# ./ch-2.pl HTTP://ed@mit.edu:800/../%7e%64%75%6e%63%61%6e/%5d%20%ff/a/../../b/../c/../default.asp
+# normalized url is http://ed@mit.edu:800/~duncan/
+# 
+
+use strict;
+use warnings;
+use Function::Parameters;
+use Data::Dumper;
+
+# exactly one argument is required: the URL to normalize
+@ARGV == 1 or die "Usage: ch-2.pl URL\n";
+my( $rawurl ) = @ARGV;
+my $normalized = normalize( $rawurl );
+print "normalized url is $normalized\n";
+
+#
+# my %info = parse_url($url);
+#	Parse URL $url of the form
+#	     scheme:[//[userinfo@]host[:port]]path[?query][#fragment]
+#	and return a hash of the pieces present (path is always set, possibly
+#	to ''); return an empty hash if the scheme or host is malformed.
+#	eg. jdbc://user:password@localhost:3306/pwc?profile=true#h1 parses to:
+#        scheme:   jdbc
+#        userinfo: user:password
+#        host:     localhost
+#        port:     3306
+#        path:     /pwc
+#        query:    profile=true
+#        fragment: h1
+#
+fun parse_url( $url )
+{
+	# scheme is ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ), as in RFC 3986
+	$url =~ s/^([A-Za-z][A-Za-z0-9+.\-]*):// || return ();
+	my %hash = ( scheme => $1 );
+	if( $url =~ s|^//|| )
+	{
+		# userinfo ends at the first '@' before any '/', '?' or '#'
+		$hash{userinfo} = $1 if $url =~ s|^([^/?#\@]*)\@||;
+		return () unless $url =~ s|^([\w.\-]+)||;	# '-' is legal in hosts
+		$hash{host} = $1;
+		$hash{port} = $1 if $url =~ s/^:(\d+)//;
+	}
+	# the fragment starts at the first '#', the query at the first '?'
+	$hash{fragment} = $1 if $url =~ s/#(.*)$//;
+	$hash{query} = $1 if $url =~ s/\?(.*)$//;
+	$hash{path} = $url;
+	return %hash;
+}
+
+
+#
+# $path = sanitize_path( $path );
+#	remove '' and '.' path elements, process '..' as if we were
+#	descending a directory tree, keep a trailing '/' if $path had one,
+#	and remove a trailing index.htm, index.html or default.asp entry.
+#	The result always starts with '/'.
+#
+fun sanitize_path( $path )
+{
+	# limit -1 keeps the trailing '' element that marks a trailing slash
+	my @x = split( m|/|, $path, -1 );
+	# a path ending in '/', '/.' or '/..' names a directory: keep its slash
+	my $isdir = @x && $x[-1] =~ m/^\.{0,2}\z/;
+
+	# traverse the path elements, ignoring '.' and '' elements,
+	# pushing any element but a '..' on a stack, and popping the top
+	# element when you see a '..' (a '..' at the root is simply dropped)
+	my @p;
+	foreach my $elem (@x)
+	{
+		next if $elem eq '.' || $elem eq '';
+		if( $elem eq '..' )
+		{
+			pop @p;
+		} else
+		{
+			push @p, $elem;
+		}
+	}
+
+	# assign to the parameter itself: a second 'my $path' here would mask it
+	$path = '/'. join('/', @p );
+	$path .= '/' if $isdir && @p;
+
+	# remove trailing index.htm[l] or default.asp (dots matched literally)
+	$path =~ s{/(?:index\.html?|default\.asp)\z}{/};
+
+	return $path;
+}
+
+
+#
+# my $normalizedurl = normalize( $url );
+#	Normalize $url according to RFC3986.  A $url that parse_url() cannot
+#	split into scheme://host is returned with only its %hh triples tidied.
+#
+fun normalize( $url )
+{
+	# 1. uppercase the hex digits of %hh triples
+	$url =~ s/(%[0-9A-Fa-f]{2})/\U$1/g;
+
+	# 2. decode unnecessary %HH triples, viz:
+	# "ALPHA (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D),
+	#  period (%2E), underscore (%5F), or tilde (%7E)"
+	# (so not %40 '@' or %60 '`', which must stay encoded)
+	$url =~ s/%(4[1-9A-F]|5[0-9A]|6[1-9A-F]|7[0-9A]|3[0-9]|2D|2E|5F|7E)/chr(hex($1))/eg;
+
+	# now parse url into parts; give up if there is no scheme://host
+	my %info = parse_url($url);
+	return $url unless defined $info{host};
+
+	# 3. lowercase scheme and host only: the other parts are case-sensitive
+	$info{$_} = lc $info{$_} for 'scheme', 'host';
+	# 4. remove the port only if it is the default one for this scheme
+	my %defport = ( http => 80, https => 443, ftp => 21 );
+	delete $info{port} if ($info{port} // -1) == ($defport{$info{scheme}} // -2);
+
+	# 5. sanitize path in various ways, eg remove '', '.'and '..' elements
+	my $path = sanitize_path( $info{path} );
+
+	# finally, merge bits back together ("0" is a valid part, so test length)
+	$url = $info{scheme}."://";
+	$url .= $info{userinfo}."@" if length( $info{userinfo} // '' );
+	$url .= $info{host};
+	$url .= ":".$info{port} if defined $info{port};
+	$url .= $path;
+	$url .= "?".$info{query} if length( $info{query} // '' );
+	$url .= "#".$info{fragment} if length( $info{fragment} // '' );
+	return $url;
+}
diff --git a/challenge-021/duncan-c-white/perl5/parse b/challenge-021/duncan-c-white/perl5/parse
new file mode 100755
index 0000000000..0fc5a6da02
--- /dev/null
+++ b/challenge-021/duncan-c-white/perl5/parse
@@ -0,0 +1,73 @@
+#!/usr/bin/perl
+
+# Challenge 2: "Create a script to parse URL and print the components of
+# URL. According to the Wiki page https://en.wikipedia.org/wiki/URL, the URL
+# syntax is as below:
+# 
+#     scheme:[//[userinfo@]host[:port]]path[?query][#fragment]
+# 
+# eg.  jdbc://user:password@localhost:3306/pwc?profile=true#h1
+# 
+#       scheme:   jdbc
+#       userinfo: user:password
+#       host:     localhost
+#       port:     3306
+#       path:     /pwc
+#       query:    profile=true
+#       fragment: h1
+# 
+# My notes: sounds pretty trivial for regexes, if the lexical syntax of
+# each component is defined clearly.  Ok, reading the above wiki page
+# doesn't make it 100% clear, but let's hack it up, that's probably good
+# enough for most cases.
+
+use strict;
+use warnings;
+use Function::Parameters;
+use Data::Dumper;
+
+#
+# my %info = parse_url($url);
+#	Parse URL $url of the form
+#	     scheme:[//[userinfo@]host[:port]]path[?query][#fragment]
+#	and return a hash of the pieces present (path is always set, possibly
+#	to ''); return an empty hash if the scheme or host is malformed.
+#	eg. jdbc://user:password@localhost:3306/pwc?profile=true#h1 parses to:
+#        scheme:   jdbc
+#        userinfo: user:password
+#        host:     localhost
+#        port:     3306
+#        path:     /pwc
+#        query:    profile=true
+#        fragment: h1
+#
+fun parse_url( $url )
+{
+	# scheme is ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ), as in RFC 3986
+	$url =~ s/^([A-Za-z][A-Za-z0-9+.\-]*):// || return ();
+	my %hash = ( scheme => $1 );
+	if( $url =~ s|^//|| )
+	{
+		# userinfo ends at the first '@' before any '/', '?' or '#'
+		$hash{userinfo} = $1 if $url =~ s|^([^/?#\@]*)\@||;
+		return () unless $url =~ s|^([\w.\-]+)||;	# '-' is legal in hosts
+		$hash{host} = $1;
+		$hash{port} = $1 if $url =~ s/^:(\d+)//;
+	}
+	# the fragment starts at the first '#', the query at the first '?'
+	$hash{fragment} = $1 if $url =~ s/#(.*)$//;
+	$hash{query} = $1 if $url =~ s/\?(.*)$//;
+	$hash{path} = $url;
+	return %hash;
+}
+
+
+
+# with no arguments, demonstrate on the example URL from the challenge text
+my @urls = @ARGV ? @ARGV
+	: ( 'jdbc://user:password@localhost:3306/pwc?profile=true#h1' );
+for my $u (@urls)
+{
+	my %parts = parse_url($u);
+	print "$u:\n". Dumper(\%parts);
+}
-- 
cgit