#!/usr/bin/perl
#
# Challenge 2: "Write a script for URL normalization based on rfc3986.
#
# According to Wikipedia, URL normalization is the process by which URLs
# are modified and standardized in a consistent manner. The goal of the
# normalization process is to transform a URL into a normalized URL so
# it is possible to determine if two syntactically different URLs may
# be equivalent.
# 
# My notes: The RFC link points to a long list of regex-syle changes,
# not all of which have to be implemented.  Easy to implement most.
# Later thought: many of the changes only apply to parts of the URL,
# so I'll reuse part of my solution to challenge 017, part 2.. url splitting
#
# ./ch-2.pl HTTP://ed@mit.edu:800/../%7e%64%75%6e%63%61%6e/%5d%20%ff/a/../../b/../c/../default.asp
# normalized url is http://ed@mit.edu:800/~duncan/
# 

use strict;
use warnings;
use Function::Parameters;
use Data::Dumper;

die "Usage: ch-2.pl URL\n" unless @ARGV==1;
my $url = shift;

$url = normalize( $url );
print "normalized url is $url\n";

#
# my %info = parse_url($url);
#	Parse URL $url. Return a hash of the pieces.  If parsing
#	fails, return an empty hash.
#	     scheme:[//[userinfo@]host[:port]]path[?query][#fragment]
#	eg. jdbc://user:password@localhost:3306/pwc?profile=true#h1
#
#	parses to:
#        scheme:   jdbc
#        userinfo: user:password
#        host:     localhost
#        port:     3306
#        path:     /pwc
#        query:    profile=true
#        fragment: h1
#
fun parse_url( $url )
{
	$url =~ s/^([^:]+):// || return ();

	my %hash;
	$hash{scheme} = $1;
	if( $url =~ s|^//|| )
	{
		$hash{userinfo} = $1 if $url =~ s|^(.+)@||;
		return () unless $url =~ s|^([\w\.]+)||;
		$hash{host} = $1;
		$hash{port} = $1 if $url =~ s/^:(\d+)//;
		$hash{fragment} = $1 if $url =~ s/#([^#]+)$//;
		$hash{query} = $1 if $url =~ s/\?([^\?]+)$//;
		$hash{path} = $url;
	}
	return %hash;
}


#
# $path = sanitize_path( $path );
#	remove '' and '.' path elements, and process '..' as if
#	we were descending a directory tree, and also remove trailing
#	inde.html and similar entries.
#
fun sanitize_path( $path )
{
	my @x = split( m|/|, $path );

	# traverse the path elements, ignoring '.' and '' elements,
	# pushing any element but a '..' on a stack,
	# and popping the top element when you see a '..'
	my @p;
	foreach (@x)
	{
		next if $_ eq '.' || $_ eq '';
		if( $_ eq '..' )
		{
			pop @p;
		} else
		{
			push @p, $_;
		}
	}

	my $path = '/'. join('/', @p );

	# remove trailing index.htm[l]? if present
	$path =~ s|/index.html?$|/|;

	# remove trailing default.jsp if present
	$path =~ s|/default.asp$|/|;

	# add trailing slash if missing - no, don't, bad idea
	#$path =~ s|([^/])$|$1/|;

	return $path;
}


#
# my $normalizedurl = normalize( $url );
#	Normalize $url according to RFC3986
#
fun normalize( $url )
{
	# 1. lowercase whole url
	$url = lc($url);

	# 2. uppercase %hh triples
	$url =~ s/(%[0-9a-f][0-9a-f])/\U$1/g;

	# 3. decode unnecessary %HH triples, viz:
	# "ALPHA (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D),
	#  period (%2E), underscore (%5F), or tilde (%7E)"
	$url =~
	s/%(4[0-9A-F]|5[0-9A]|6[0-9A-F]|7[0-9A]|3[0-9]|2D|2E|5F|7E)/chr(hex("0x".$1))/eg;

	# now parse url into parts..
	my %info = parse_url($url);

	# 4. remove default port
	delete $info{port} if $info{port} eq "80";

	my $path = $info{path};

	# 5. sanitize path in various ways, eg remove '', '.'and '..' elements
	$path = sanitize_path( $path );

	# finally, merge bits back together
	$url = $info{scheme}."://";
	$url .= $info{userinfo}."@" if $info{userinfo};
	$url .= $info{host};
	$url .= ":".$info{port} if $info{port};
	$url .= $path;
	$url .= "?".$info{query} if $info{query};
	$url .= "#".$info{fragment} if $info{fragment};
	return $url;
}