aboutsummaryrefslogtreecommitdiff
path: root/challenge-021/duncan-c-white/perl5/ch-2.pl
blob: fc978b3a81a3e439dc336667d072099f83d2fc0e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/usr/bin/perl
#
# Challenge 2: "Write a script for URL normalization based on rfc3986.
#
# According to Wikipedia, URL normalization is the process by which URLs
# are modified and standardized in a consistent manner. The goal of the
# normalization process is to transform a URL into a normalized URL so
# it is possible to determine if two syntactically different URLs may
# be equivalent.
# 
# My notes: The RFC link points to a long list of regex-syle changes,
# not all of which have to be implemented.  Easy to implement most.
# Later thought: many of the changes only apply to parts of the URL,
# so I'll reuse part of my solution to challenge 017, part 2.. url splitting
#
# ./ch-2.pl HTTP://ed@mit.edu:800/../%7e%64%75%6e%63%61%6e/%5d%20%ff/a/../../b/../c/../default.asp
# normalized url is http://ed@mit.edu:800/~duncan/
# 

use strict;
use warnings;
use Function::Parameters;
use Data::Dumper;

die "Usage: ch-2.pl URL\n" unless @ARGV==1;
my $url = shift;

$url = normalize( $url );
print "normalized url is $url\n";

#
# my %info = parse_url($url);
#	Parse URL $url. Return a hash of the pieces.  If parsing
#	fails, return an empty hash.
#	     scheme:[//[userinfo@]host[:port]]path[?query][#fragment]
#	eg. jdbc://user:password@localhost:3306/pwc?profile=true#h1
#
#	parses to:
#        scheme:   jdbc
#        userinfo: user:password
#        host:     localhost
#        port:     3306
#        path:     /pwc
#        query:    profile=true
#        fragment: h1
#
fun parse_url( $url )
{
	$url =~ s/^([^:]+):// || return ();

	my %hash;
	$hash{scheme} = $1;
	if( $url =~ s|^//|| )
	{
		$hash{userinfo} = $1 if $url =~ s|^(.+)@||;
		return () unless $url =~ s|^([\w\.]+)||;
		$hash{host} = $1;
		$hash{port} = $1 if $url =~ s/^:(\d+)//;
		$hash{fragment} = $1 if $url =~ s/#([^#]+)$//;
		$hash{query} = $1 if $url =~ s/\?([^\?]+)$//;
		$hash{path} = $url;
	}
	return %hash;
}


#
# $path = sanitize_path( $path );
#	remove '' and '.' path elements, and process '..' as if
#	we were descending a directory tree, and also remove trailing
#	inde.html and similar entries.
#
fun sanitize_path( $path )
{
	my @x = split( m|/|, $path );

	# traverse the path elements, ignoring '.' and '' elements,
	# pushing any element but a '..' on a stack,
	# and popping the top element when you see a '..'
	my @p;
	foreach (@x)
	{
		next if $_ eq '.' || $_ eq '';
		if( $_ eq '..' )
		{
			pop @p;
		} else
		{
			push @p, $_;
		}
	}

	my $path = '/'. join('/', @p );

	# remove trailing index.htm[l]? if present
	$path =~ s|/index.html?$|/|;

	# remove trailing default.jsp if present
	$path =~ s|/default.asp$|/|;

	# add trailing slash if missing - no, don't, bad idea
	#$path =~ s|([^/])$|$1/|;

	return $path;
}


#
# my $normalizedurl = normalize( $url );
#	Normalize $url according to RFC3986
#
fun normalize( $url )
{
	# 1. lowercase whole url
	$url = lc($url);

	# 2. uppercase %hh triples
	$url =~ s/(%[0-9a-f][0-9a-f])/\U$1/g;

	# 3. decode unnecessary %HH triples, viz:
	# "ALPHA (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D),
	#  period (%2E), underscore (%5F), or tilde (%7E)"
	$url =~
	s/%(4[0-9A-F]|5[0-9A]|6[0-9A-F]|7[0-9A]|3[0-9]|2D|2E|5F|7E)/chr(hex("0x".$1))/eg;

	# now parse url into parts..
	my %info = parse_url($url);

	# 4. remove default port
	delete $info{port} if $info{port} eq "80";

	my $path = $info{path};

	# 5. sanitize path in various ways, eg remove '', '.'and '..' elements
	$path = sanitize_path( $path );

	# finally, merge bits back together
	$url = $info{scheme}."://";
	$url .= $info{userinfo}."@" if $info{userinfo};
	$url .= $info{host};
	$url .= ":".$info{port} if $info{port};
	$url .= $path;
	$url .= "?".$info{query} if $info{query};
	$url .= "#".$info{fragment} if $info{fragment};
	return $url;
}