1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
|
#!/usr/bin/python3
# Challenge 021
#
# Task #2
# Write a script for URL normalization based on rfc3986. This task was shared by
# Anonymous Contributor.
#
# According to Wikipedia, URL normalization is the process by which URLs are
# modified and standardized in a consistent manner. The goal of the
# normalization process is to transform a URL into a normalized URL so it is
# possible to determine if two syntactically different URLs may be equivalent.
import sys
import re
def decode_triplets(hex):
    """Decode one percent-encoded octet if it stands for an unreserved character.

    ``hex`` is the pair of hexadecimal digits that follows the ``%`` sign.
    When the octet maps to a letter, a digit, ``-``, ``.``, ``_`` or ``~`` the
    decoded character itself is returned.  Any other octet is returned in its
    percent-encoded form with the hexadecimal digits upper-cased.
    """
    char = chr(int(hex, 16))
    is_unreserved = re.match(r"[a-zA-Z0-9\-._~]", char) is not None
    # Only unreserved characters may be decoded without changing the URI.
    return char if is_unreserved else '%' + hex.upper()
def upper_repl(matchobj):
    """Replacement callback for ``re.sub``: return the whole match upper-cased."""
    matched_text = matchobj.group()
    return matched_text.upper()
def sheme_host_repl(matchobj):
    """Replacement callback for ``re.sub``: lower-case groups 1 and 4 of the match.

    Group 1 and group 4 are returned lower-cased with group 2 kept verbatim
    between them.  In the pattern used by ``norm_uri`` these groups are the
    scheme prefix, the optional user-info part and the host up to the first
    ``/``.
    """
    scheme, userinfo, host = matchobj.group(1, 2, 4)
    return "".join((scheme.lower(), userinfo, host.lower()))
def decode_triplets_repl(matchobj):
    """Replacement callback for ``re.sub``: decode the triplet captured in group 1.

    Group 1 of the match (the two hexadecimal digits) is handed to
    ``decode_triplets`` and its result is returned as the replacement text.
    """
    hex_digits = matchobj.group(1)
    return decode_triplets(hex_digits)
# RFC 3986 "unreserved" characters: percent-encoding them is never required.
_UNRESERVED = frozenset(
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "0123456789-._~"
)

# Ports that carry no information because they are the scheme's default.
_DEFAULT_PORTS = {"http": "80", "https": "443"}

# One percent-encoded octet; group 1 holds the two hexadecimal digits.
_PCT_TRIPLET = re.compile(r"%([0-9A-Fa-f]{2})")

# Component splitter adapted from RFC 3986 appendix B.  Groups: scheme,
# authority, path, query (with its "?"), fragment (with its "#").  Every
# group is optional, so the pattern matches any input string.
_URI_PARTS = re.compile(
    r"(?:([A-Za-z][A-Za-z0-9+.\-]*):)?"
    r"(?://([^/?#]*))?"
    r"([^?#]*)"
    r"(\?[^#]*)?"
    r"(#.*)?",
    re.DOTALL,
)


def _normalize_triplets(text):
    """Decode %XX triplets of unreserved characters; upper-case all others."""
    def repl(match):
        digits = match.group(1)
        char = chr(int(digits, 16))
        return char if char in _UNRESERVED else "%" + digits.upper()

    return _PCT_TRIPLET.sub(repl, text)


def _remove_dot_segments(path):
    """Resolve "." and ".." segments of a path (RFC 3986 section 5.2.4)."""
    absolute = path.startswith("/")
    segments = path.split("/")
    if absolute:
        # Drop the empty piece that precedes the leading "/".
        segments = segments[1:]
    kept = []
    for segment in segments:
        if segment == ".":
            continue
        if segment == "..":
            if kept and kept[-1] != "..":
                kept.pop()
            elif not absolute:
                # A relative path may legitimately climb above its start.
                kept.append("..")
            # On an absolute path ".." at the root is simply discarded.
            continue
        kept.append(segment)
    if segments[-1] in (".", ".."):
        # A trailing dot-segment names a directory: keep the final "/".
        kept.append("")
    return ("/" if absolute else "") + "/".join(kept)


def norm_uri(uri):
    """Return *uri* normalized with the syntax-based rules of RFC 3986.

    The URI is split into scheme, authority, path, query and fragment, and
    each component is normalized on its own so that a rule meant for one
    component can never rewrite another:

    * the scheme and the host are converted to lowercase;
    * percent-encoded triplets of unreserved characters are decoded, and the
      hexadecimal digits of all remaining triplets are upper-cased;
    * "." and ".." segments are removed from the path only (never from the
      authority, the query or the fragment);
    * an empty path after a non-empty authority becomes "/";
    * an empty port, or the default port of ``http``/``https``, is removed.

    Args:
        uri: The URI string to normalize.

    Returns:
        The normalized URI string.
    """
    scheme, authority, path, query, fragment = _URI_PARTS.match(uri).groups()
    pieces = []

    if scheme is not None:
        scheme = scheme.lower()
        pieces.append(scheme + ":")

    if authority is not None:
        # User-info cannot contain a raw "@", so the last one ends it.
        userinfo, at_sign, hostport = authority.rpartition("@")
        host, colon, port = hostport.rpartition(":")
        if not colon or re.fullmatch(r"[0-9]*", port) is None:
            # No port: any ":" seen belongs to a bracketed IPv6 literal.
            host, port = hostport, ""
        # Decode first so decoded letters get lower-cased too, then run the
        # triplet pass again to restore upper-case hex digits that lower()
        # has just folded.
        host = _normalize_triplets(_normalize_triplets(host).lower())
        if port and port != _DEFAULT_PORTS.get(scheme):
            host += ":" + port
        pieces.append("//" + _normalize_triplets(userinfo) + at_sign + host)

    # Triplets are decoded before dot-segment removal so that an encoded
    # "%2E%2E" segment is resolved like a literal "..".
    path = _remove_dot_segments(_normalize_triplets(path))
    if authority and not path:
        path = "/"
    pieces.append(path)

    if query is not None:
        pieces.append(_normalize_triplets(query))
    if fragment is not None:
        pieces.append(_normalize_triplets(fragment))
    return "".join(pieces)
if __name__ == "__main__":
    # Run only when executed as a script, so that importing this module
    # neither reads the command line nor prints anything.
    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit("usage: " + sys.argv[0] + " <uri>")
    print(norm_uri(sys.argv[1]))
|