diff options
Diffstat (limited to 'challenge-024/paulo-custodio/python/ch-2.py')
| -rw-r--r-- | challenge-024/paulo-custodio/python/ch-2.py | 132 |
1 files changed, 132 insertions, 0 deletions
diff --git a/challenge-024/paulo-custodio/python/ch-2.py b/challenge-024/paulo-custodio/python/ch-2.py new file mode 100644 index 0000000000..7b3aff407a --- /dev/null +++ b/challenge-024/paulo-custodio/python/ch-2.py @@ -0,0 +1,132 @@ +#!/usr/bin/python3 + +# Challenge 024 +# +# Task #2 +# Create a script to implement full text search functionality using Inverted +# Index. According to wikipedia: +# +# In computer science, an inverted index (also referred to as a postings file +# or inverted file) is a database index storing a mapping from content, such as +# words or numbers, to its locations in a table, or in a document or a set of +# documents (named in contrast to a forward index, which maps from documents to +# content). The purpose of an inverted index is to allow fast full-text +# searches, at a cost of increased processing when a document is added to the +# database. + +import sys +import sqlite3 +import os.path +import re + +DBFILE = "index.db3" + +# Create database if it does not exist +def create_database(): + if not os.path.isfile(DBFILE): + con = sqlite3.connect(DBFILE) + cur = con.cursor() + + # Create tables + cur.execute(''' + CREATE TABLE words ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + word TEXT UNIQUE + ); + ''') + cur.execute(''' + CREATE TABLE documents ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + title TEXT UNIQUE + ); + ''') + cur.execute(''' + CREATE TABLE found ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + document_id INTEGER, + word_id INTEGER + ); + ''') + + con.commit() + con.close() + +# get or add value to table +def get_or_add_id(con, table, column, value): + cur = con.cursor() + for i in range(2): # lookup, insert, lookup again + # lookup + for id, in cur.execute(f"SELECT id FROM {table} WHERE {column} = ?", \ + (value,)): + return id + + # insert + cur.execute(f"INSERT INTO {table}({column}) VALUES(?)", (value,)) + con.commit() + return None # not reached + +# get or add document id +def get_document_id(con, title): + return get_or_add_id(con, "documents", "title", title) + +# get or add word id +def get_word_id(con, word): + return get_or_add_id(con, "words", "word", word) + +# add a found location if not already found +def add_found(con, document_id, word_id): + cur = con.cursor() + for id, in cur.execute(""" + SELECT id FROM found + WHERE document_id = ? + AND word_id = ? + """, (document_id,word_id)): + return # found + + # location not found, insert in db + cur.execute(""" + INSERT INTO found (document_id, word_id) + VALUES(?,?) + """, (document_id,word_id)) + con.commit() + +# add words from document to index +def add_doc(file): + con = sqlite3.connect(DBFILE) + + title = os.path.basename(file) + document_id = get_document_id(con, title) + + # read document + word_count = 0 + with open(file) as f: + for line in f.readlines(): + for word in re.findall(r"\w+", line): + word = word.lower() + word_id = get_word_id(con, word) + add_found(con, document_id, word_id) + word_count += 1 + print(f"Indexed {title}") + +# search word +def search(find_word): + con = sqlite3.connect(DBFILE) + cur = con.cursor() + for word, title in cur.execute(""" + SELECT word, title + FROM documents, words, found + WHERE word = ? + AND found.document_id = documents.id + AND found.word_id = words.id + ORDER BY title""", (find_word,)): + print(f"{word} {title}") + +create_database() +if sys.argv[1]=="add": + for file in sys.argv[2:]: + add_doc(file) +elif sys.argv[1]=="search": + for word in sys.argv[2:]: + search(word) +else: + print("Usage: add documents | search words") |
