aboutsummaryrefslogtreecommitdiff
path: root/challenge-022/paulo-custodio/python/ch-2.py
blob: 8900cf37cdd9b8eb78e44b5209c389cab448a1c5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/python3

# Challenge 022
#
# Task #2
# Write a script to implement Lempel-Ziv-Welch (LZW) compression algorithm.
# The script should have method to encode/decode algorithm. The wiki page
# explains the compression algorithm very nicely.

import sys

class Dict():
    EOM = '#'
    SYMBOLS = ['A','B','C','D','E','F','G','H','I','J','K','L','M',
               'N','O','P','Q','R','S','T','U','V','W','X','Y','Z']

    def __init__(self):
        self.dict = {}
        self.symbols = []
        for sym in [Dict.EOM, *Dict.SYMBOLS]:
            self.add(sym)

    def __str__(self):
        return "dict="+str(self.dict)+",symbols="+str(self.symbols)

    def last(self):
        return len(self.symbols)-1

    def width(self):
        return len("{:b}".format(self.last()))

    def next_width(self):
        return len("{:b}".format(self.last()+1))

    def add(self, seq):
        seq = seq.upper()

        id = self.last()+1
        self.dict[seq] = id
        self.symbols.append(seq)

    def longest_match(self, text):
        text = text.upper()

        # find longest match
        match_len = 0
        while match_len < len(text) and \
              text[:match_len+1] in self.dict:
            match_len += 1
        w = text[:match_len]
        text = text[match_len:]
        code = self.dict[w]
        old_width = self.width()

        # store new prefix in the dictionary
        if text != "":
            next_prefix = w+text[0]
            self.add(next_prefix)

        # return code and new text
        return ("{:0"+str(old_width)+"b}").format(code), text

def encode(text):
    text = text.upper()+Dict.EOM
    encoded = ""
    dict = Dict()

    while text != "":
        code, text = dict.longest_match(text)
        encoded += code

    return encoded

def decode(encoded):
    text = ""
    dict = Dict()

    while encoded != "":
        width = dict.width()
        code = int(encoded[:width], 2)
        encoded = encoded[width:]
        seq = dict.symbols[code]
        text += seq

        if encoded != "":
            next_width = dict.next_width()
            next_code = int(encoded[:next_width], 2)
            next_seq = dict.symbols[next_code]
            dict.add(seq + next_seq[0])

    # remove end terminator
    text = text[:-1]
    return text

if sys.argv[1] == "encode":
    print(encode(sys.argv[2]))
elif sys.argv[1] == "decode":
    print(decode(sys.argv[2]))
else:
    print("Usage: ch-2.py encode|decode text")