blob: f7c1dc11442f7f11d0fd7d469e1a463e7a4c5b97 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
|
#!/usr/bin/perl
# Challenge 021
#
# Task #2
# Write a script for URL normalization based on RFC 3986. This task was shared
# by Anonymous Contributor.
#
# According to Wikipedia, URL normalization is the process by which URLs are
# modified and standardized in a consistent manner. The goal of the
# normalization process is to transform a URL into a normalized URL so it is
# possible to determine if two syntactically different URLs may be equivalent.
use Modern::Perl;
# Resolve a single percent-encoded octet.
#
# Parameters:
#   $hex - the two hex digits that follow the "%" sign (either case).
# Returns:
#   The literal character when the octet is one of the RFC 3986 unreserved
#   characters (ALPHA / DIGIT / "-" / "." / "_" / "~"); otherwise the triplet
#   itself, rebuilt as "%" followed by the uppercased hex digits.
sub decode_triplets {
    my($digits) = @_;
    my $char = chr(hex($digits));
    my $is_unreserved = $char =~ /[a-zA-Z0-9\-._~]/;
    return $is_unreserved ? $char : '%'.uc($digits);
}
# Normalize a URI with the syntax-based rules of RFC 3986 (sections 6.2.2
# and 6.2.3).
#
# Parameters:
#   $uri - the URI string to normalize.
# Returns:
#   The normalized string. An undefined argument is returned unchanged.
#
# Normalizations applied:
#   * Percent-encoded triplets: triplets of unreserved characters are decoded
#     and the hex digits of all other triplets are uppercased (the per-triplet
#     decision is delegated to decode_triplets).
#   * For "scheme://authority..." URIs the scheme and host are lowercased;
#     userinfo, path, query and fragment keep their case.
#   * "." and ".." segments are removed from the path only. The authority,
#     query and fragment are never touched by this step, and ".." cannot
#     climb above the root of an absolute path.
#   * An empty path following a non-empty authority becomes "/".
#   * An empty port (a bare ":") or the default port of http, https or ftp
#     is dropped.
sub norm_uri {
    my($uri) = @_;
    return $uri unless defined $uri;

    # Decoding first is safe for the parsing below: unreserved characters are
    # never URI delimiters, so no component boundary can move.
    $uri =~ s/%([0-9A-Fa-f]{2})/decode_triplets($1)/ge;

    # Split into three parts so that each rule only sees the component it is
    # meant for. Every piece is optional, so this match always succeeds.
    my($prefix, $path, $tail) = $uri =~ m{
        \A
        ( (?: [\w+.\-]+ :// [^/?\#]* )? )   # scheme and authority, if any
        ( [^?\#]* )                         # path
        ( .* )                              # query and fragment, untouched
        \z
    }xs;

    if ($prefix ne '') {
        my %default_port = (http => 80, https => 443, ftp => 21);

        my($scheme, $userinfo, $host, $port) = $prefix =~ m{
            \A
            ( [^:]+ ) ://           # scheme
            ( (?: .* \@ )? )        # optional userinfo, case is preserved
            ( .*? )                 # host, may be a bracketed IP literal
            ( (?: : \d* )? )        # optional port
            \z
        }xs;

        $scheme = lc $scheme;
        $host   = lc $host;
        # lc also lowered the hex digits of triplets left in the host.
        $host =~ s/(%[0-9a-f]{2})/\U$1/g;

        my $default = $default_port{$scheme};
        if ($port eq ':' || (defined $default && $port eq ":$default")) {
            $port = '';
        }

        $path = '/' if $path eq '' && "$userinfo$host" ne '';
        $prefix = $scheme . '://' . $userinfo . $host . $port;
    }

    # Remove "." segments: "/./" collapses to "/", a trailing "/." to "/".
    # The lookahead leaves the following slash in place so that consecutive
    # "/./" runs are all caught in a single pass.
    $path =~ s{/\.(?=/)}{}g;
    $path =~ s{/\.\z}{/};

    # Remove each "segment/.." pair, repeating until none is left because a
    # removal can expose a new pair ("/a/b/../.."). A segment that is itself
    # ".." is never consumed, which keeps leading "../" runs of relative
    # paths intact.
    1 while $path =~ s{/(?!\.\./)[^/]*/\.\.(?:/|\z)}{/};

    # Whatever ".." is still at the front of an absolute path would climb
    # above the root; drop it.
    1 while $path =~ s{\A/\.\.(?:/|\z)}{/};

    return $prefix . $path . $tail;
}
# Command-line entry point: normalize the URI given as the first argument
# and print the result on its own line.
# Without an argument there is nothing to normalize; report the usage
# instead of passing undef on and printing a blank line with warnings.
die "Usage: $0 <uri>\n" unless @ARGV;
my $uri = shift;
say norm_uri($uri);
|