1 files changed, 132 insertions, 0 deletions
diff --git a/challenge-024/paulo-custodio/python/ch-2.py b/challenge-024/paulo-custodio/python/ch-2.py
new file mode 100644
index 0000000000..7b3aff407a
--- /dev/null
+++ b/challenge-024/paulo-custodio/python/ch-2.py
@@ -0,0 +1,132 @@
+#!/usr/bin/python3
+
+# Challenge 024
+#
+# Task #2
+# Create a script to implement full text search functionality using Inverted
+# Index. According to wikipedia:
+#
+# In computer science, an inverted index (also referred to as a postings file
+# or inverted file) is a database index storing a mapping from content, such as
+# words or numbers, to its locations in a table, or in a document or a set of
+# documents (named in contrast to a forward index, which maps from documents to
+# content). The purpose of an inverted index is to allow fast full-text
+# searches, at a cost of increased processing when a document is added to the
+# database.
+
+import sys
+import sqlite3
+import os.path
+import re
+
+DBFILE = "index.db3"
+
+# Create database if it does not exist
+def create_database():
+    if not os.path.isfile(DBFILE):
+        con = sqlite3.connect(DBFILE)
+        cur = con.cursor()
+
+        # Create tables
+        cur.execute('''
+            CREATE TABLE words (
+                id          INTEGER PRIMARY KEY AUTOINCREMENT,
+                word        TEXT UNIQUE
+            );
+            ''')
+        cur.execute('''
+            CREATE TABLE documents (
+                id          INTEGER PRIMARY KEY AUTOINCREMENT,
+                title       TEXT UNIQUE
+            );
+            ''')
+        cur.execute('''
+            CREATE TABLE found (
+                id          INTEGER PRIMARY KEY AUTOINCREMENT,
+                document_id INTEGER,
+                word_id     INTEGER
+            );
+            ''')
+
+        con.commit()
+        con.close()
+
+# get or add value to table
+def get_or_add_id(con, table, column, value):
+    cur = con.cursor()
+    for i in range(2):      # lookup, insert, lookup again
+        # lookup
+        for id, in cur.execute(f"SELECT id FROM {table} WHERE {column} = ?", \
+                              (value,)):
+            return id
+
+        # insert
+        cur.execute(f"INSERT INTO {table}({column}) VALUES(?)", (value,))
+        con.commit()
+    return None # not reached
+
+# get or add document id
+def get_document_id(con, title):
+    return get_or_add_id(con, "documents", "title", title)
+
+# get or add word id
+def get_word_id(con, word):
+    return get_or_add_id(con, "words", "word", word)
+
+# add a found location if not already found
+def add_found(con, document_id, word_id):
+    cur = con.cursor()
+    for id, in cur.execute("""
+                    SELECT id FROM found
+                    WHERE document_id = ?
+                      AND word_id     = ?
+                """, (document_id,word_id)):
+        return  # found
+
+    # location not found, insert in db
+    cur.execute("""
+                INSERT INTO found (document_id, word_id)
+                          VALUES(?,?)
+                """, (document_id,word_id))
+    con.commit()
+
+# add words from document to index
+def add_doc(file):
+    con = sqlite3.connect(DBFILE)
+
+    title = os.path.basename(file)
+    document_id = get_document_id(con, title)
+
+    # read document
+    word_count = 0
+    with open(file) as f:
+        for line in f.readlines():
+            for word in re.findall(r"\w+", line):
+                word = word.lower()
+                word_id = get_word_id(con, word)
+                add_found(con, document_id, word_id)
+                word_count += 1
+    print(f"Indexed {title}")
+
+# search word
+def search(find_word):
+    con = sqlite3.connect(DBFILE)
+    cur = con.cursor()
+    for word, title in cur.execute("""
+                        SELECT word, title
+                        FROM documents, words, found
+                        WHERE word = ?
+                          AND found.document_id = documents.id
+                          AND found.word_id     = words.id
+                        ORDER BY title""", (find_word,)):
+        print(f"{word}  {title}")
+
+create_database()
+if sys.argv[1]=="add":
+    for file in sys.argv[2:]:
+        add_doc(file)
+elif sys.argv[1]=="search":
+    for word in sys.argv[2:]:
+        search(word)
+else:
+    print("Usage: add documents | search words")