aboutsummaryrefslogtreecommitdiff
path: root/challenge-024/paulo-custodio/python/ch-2.py
diff options
context:
space:
mode:
Diffstat (limited to 'challenge-024/paulo-custodio/python/ch-2.py')
-rw-r--r--challenge-024/paulo-custodio/python/ch-2.py132
1 files changed, 132 insertions, 0 deletions
diff --git a/challenge-024/paulo-custodio/python/ch-2.py b/challenge-024/paulo-custodio/python/ch-2.py
new file mode 100644
index 0000000000..7b3aff407a
--- /dev/null
+++ b/challenge-024/paulo-custodio/python/ch-2.py
@@ -0,0 +1,132 @@
+#!/usr/bin/python3
+
+# Challenge 024
+#
+# Task #2
+# Create a script to implement full text search functionality using Inverted
+# Index. According to wikipedia:
+#
+# In computer science, an inverted index (also referred to as a postings file
+# or inverted file) is a database index storing a mapping from content, such as
+# words or numbers, to its locations in a table, or in a document or a set of
+# documents (named in contrast to a forward index, which maps from documents to
+# content). The purpose of an inverted index is to allow fast full-text
+# searches, at a cost of increased processing when a document is added to the
+# database.
+
+import sys
+import sqlite3
+import os.path
+import re
+
+DBFILE = "index.db3"
+
+# Create database if it does not exist
+def create_database():
+ if not os.path.isfile(DBFILE):
+ con = sqlite3.connect(DBFILE)
+ cur = con.cursor()
+
+ # Create tables
+ cur.execute('''
+ CREATE TABLE words (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ word TEXT UNIQUE
+ );
+ ''')
+ cur.execute('''
+ CREATE TABLE documents (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ title TEXT UNIQUE
+ );
+ ''')
+ cur.execute('''
+ CREATE TABLE found (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ document_id INTEGER,
+ word_id INTEGER
+ );
+ ''')
+
+ con.commit()
+ con.close()
+
+# get or add value to table
+def get_or_add_id(con, table, column, value):
+ cur = con.cursor()
+ for i in range(2): # lookup, insert, lookup again
+ # lookup
+ for id, in cur.execute(f"SELECT id FROM {table} WHERE {column} = ?", \
+ (value,)):
+ return id
+
+ # insert
+ cur.execute(f"INSERT INTO {table}({column}) VALUES(?)", (value,))
+ con.commit()
+ return None # not reached
+
+# get or add document id
+def get_document_id(con, title):
+ return get_or_add_id(con, "documents", "title", title)
+
+# get or add word id
+def get_word_id(con, word):
+ return get_or_add_id(con, "words", "word", word)
+
+# add a found location if not already found
+def add_found(con, document_id, word_id):
+ cur = con.cursor()
+ for id, in cur.execute("""
+ SELECT id FROM found
+ WHERE document_id = ?
+ AND word_id = ?
+ """, (document_id,word_id)):
+ return # found
+
+ # location not found, insert in db
+ cur.execute("""
+ INSERT INTO found (document_id, word_id)
+ VALUES(?,?)
+ """, (document_id,word_id))
+ con.commit()
+
+# add words from document to index
+def add_doc(file):
+ con = sqlite3.connect(DBFILE)
+
+ title = os.path.basename(file)
+ document_id = get_document_id(con, title)
+
+ # read document
+ word_count = 0
+ with open(file) as f:
+ for line in f.readlines():
+ for word in re.findall(r"\w+", line):
+ word = word.lower()
+ word_id = get_word_id(con, word)
+ add_found(con, document_id, word_id)
+ word_count += 1
+ print(f"Indexed {title}")
+
+# search word
+def search(find_word):
+ con = sqlite3.connect(DBFILE)
+ cur = con.cursor()
+ for word, title in cur.execute("""
+ SELECT word, title
+ FROM documents, words, found
+ WHERE word = ?
+ AND found.document_id = documents.id
+ AND found.word_id = words.id
+ ORDER BY title""", (find_word,)):
+ print(f"{word} {title}")
+
+create_database()
+if sys.argv[1]=="add":
+ for file in sys.argv[2:]:
+ add_doc(file)
+elif sys.argv[1]=="search":
+ for word in sys.argv[2:]:
+ search(word)
+else:
+ print("Usage: add documents | search words")