1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
|
#!/usr/bin/python3
# Challenge 024
#
# Task #2
# Create a script to implement full text search functionality using Inverted
# Index. According to wikipedia:
#
# In computer science, an inverted index (also referred to as a postings file
# or inverted file) is a database index storing a mapping from content, such as
# words or numbers, to its locations in a table, or in a document or a set of
# documents (named in contrast to a forward index, which maps from documents to
# content). The purpose of an inverted index is to allow fast full-text
# searches, at a cost of increased processing when a document is added to the
# database.
import sys
import sqlite3
import os.path
import re
DBFILE = "index.db3"
# Create database if it does not exist
def create_database():
if not os.path.isfile(DBFILE):
con = sqlite3.connect(DBFILE)
cur = con.cursor()
# Create tables
cur.execute('''
CREATE TABLE words (
id INTEGER PRIMARY KEY AUTOINCREMENT,
word TEXT UNIQUE
);
''')
cur.execute('''
CREATE TABLE documents (
id INTEGER PRIMARY KEY AUTOINCREMENT,
title TEXT UNIQUE
);
''')
cur.execute('''
CREATE TABLE found (
id INTEGER PRIMARY KEY AUTOINCREMENT,
document_id INTEGER,
word_id INTEGER
);
''')
cur.execute('''
CREATE UNIQUE INDEX found_index ON found(document_id, word_id);
''')
con.commit()
con.close()
# get or add value to table
def get_or_add_id(con, table, column, value):
cur = con.cursor()
for i in range(2): # lookup, insert, lookup again
# lookup
for id, in cur.execute(f"SELECT id FROM {table} WHERE {column} = ?", \
(value,)):
return id
# insert
cur.execute(f"INSERT INTO {table}({column}) VALUES(?)", (value,))
con.commit()
return None # not reached
# get or add document id
def get_document_id(con, title):
return get_or_add_id(con, "documents", "title", title)
# get or add word id
def get_word_id(con, word):
return get_or_add_id(con, "words", "word", word)
# add a found location if not already found
def add_found(con, document_id, word_id):
cur = con.cursor()
for id, in cur.execute("""
SELECT id FROM found
WHERE document_id = ?
AND word_id = ?
""", (document_id,word_id)):
return # found
# location not found, insert in db
cur.execute("""
INSERT INTO found (document_id, word_id)
VALUES(?,?)
""", (document_id,word_id))
con.commit()
# add words from document to index
def add_doc(file):
con = sqlite3.connect(DBFILE)
title = re.sub(r"\.\w+$", "", os.path.basename(file))
document_id = get_document_id(con, title)
# read document
word_count = 0
with open(file) as f:
for line in f.readlines():
for word in re.findall(r"\w+", line):
word = word.lower()
word_id = get_word_id(con, word)
add_found(con, document_id, word_id)
word_count += 1
print(f"Indexed {title}")
# search word
def search(find_word):
con = sqlite3.connect(DBFILE)
cur = con.cursor()
for word, title in cur.execute("""
SELECT word, title
FROM documents, words, found
WHERE word = ?
AND found.document_id = documents.id
AND found.word_id = words.id
ORDER BY title""", (find_word,)):
print(f"{word} {title}")
create_database()
if sys.argv[1]=="add":
for file in sys.argv[2:]:
add_doc(file)
elif sys.argv[1]=="search":
for word in sys.argv[2:]:
search(word)
else:
print("Usage: add documents | search words")
|