aboutsummaryrefslogtreecommitdiff
path: root/challenge-024/paulo-custodio
diff options
context:
space:
mode:
authorPaulo Custodio <pauloscustodio@gmail.com>2021-01-23 15:41:16 +0000
committerPaulo Custodio <pauloscustodio@gmail.com>2021-01-23 15:41:16 +0000
commite9cb33e05f1a3336cd13945da2d9a899b6f42120 (patch)
tree9e5148191a003cdd9428a4d4ae5ea0bfa45fbd53 /challenge-024/paulo-custodio
parent45a9178d4865a7289d6856d7756468f896a04e49 (diff)
downloadperlweeklychallenge-club-e9cb33e05f1a3336cd13945da2d9a899b6f42120.tar.gz
perlweeklychallenge-club-e9cb33e05f1a3336cd13945da2d9a899b6f42120.tar.bz2
perlweeklychallenge-club-e9cb33e05f1a3336cd13945da2d9a899b6f42120.zip
Add Perl solution to challenge 024
Diffstat (limited to 'challenge-024/paulo-custodio')
-rw-r--r--challenge-024/paulo-custodio/README1
-rw-r--r--challenge-024/paulo-custodio/perl/ch-1.pl0
-rw-r--r--challenge-024/paulo-custodio/perl/ch-2.pl179
-rw-r--r--challenge-024/paulo-custodio/test.pl77
4 files changed, 257 insertions, 0 deletions
diff --git a/challenge-024/paulo-custodio/README b/challenge-024/paulo-custodio/README
new file mode 100644
index 0000000000..87dc0b2fbd
--- /dev/null
+++ b/challenge-024/paulo-custodio/README
@@ -0,0 +1 @@
+Solution by Paulo Custodio
diff --git a/challenge-024/paulo-custodio/perl/ch-1.pl b/challenge-024/paulo-custodio/perl/ch-1.pl
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/challenge-024/paulo-custodio/perl/ch-1.pl
diff --git a/challenge-024/paulo-custodio/perl/ch-2.pl b/challenge-024/paulo-custodio/perl/ch-2.pl
new file mode 100644
index 0000000000..31d780cfa9
--- /dev/null
+++ b/challenge-024/paulo-custodio/perl/ch-2.pl
@@ -0,0 +1,179 @@
+#!/usr/bin/perl
+
+# Challenge 019
+#
+# Task #2
+# Create a script to implement full text search functionality using Inverted
+# Index. According to wikipedia:
+#
+# In computer science, an inverted index (also referred to as a postings file
+# or inverted file) is a database index storing a mapping from content, such as
+# words or numbers, to its locations in a table, or in a document or a set of
+# documents (named in contrast to a forward index, which maps from documents to
+# content). The purpose of an inverted index is to allow fast full-text
+# searches, at a cost of increased processing when a document is added to the
+# database.
+
+# Solution: store the inverted index in a SQLite database, use DBI to access it
+
+use strict;
+use warnings;
+use 5.030;
+use Path::Tiny;
+use DBI;
+use DBD::SQLite;
+use Path::Tiny;
+use constant DBFILE => "index.db3";
+
+use Data::Dump 'dump';
+
+# Create database if index does not exist
+BEGIN {
+ if (! -f DBFILE) {
+ open(my $p, "| sqlite3 ".DBFILE) or die "cannot start sqlite3";
+ say $p <<END;
+CREATE TABLE words (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ word TEXT UNIQUE
+);
+CREATE TABLE documents (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ title TEXT UNIQUE
+);
+CREATE TABLE found (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ document_id INTEGER,
+ word_id INTEGER
+);
+END
+ close($p) or die "sqlite3 failed";
+ }
+};
+
+# main
+my($op, @args) = @ARGV;
+if (@ARGV>=2 && $op =~ /^add/i) {
+ add_doc($_) for @args;
+}
+elsif (@ARGV>=2 && $op =~ /^sea/) {
+ search($_) for @args;
+}
+else {
+ die "Usage: add documents | search words\n";
+}
+
+
+# add words from document to index
+sub add_doc {
+ my($doc) = @_;
+
+ # get title
+ my $title = path($doc)->basename;
+
+ # connect to index database
+ my $dbh = DBI->connect("dbi:SQLite:dbname=".DBFILE,"","",
+ { RaiseError => 1, AutoCommit => 1 });
+
+ # get document id
+ my $document_id = get_document_id($dbh, $title);
+
+ # read document
+ my $word_count = 0;
+ my $text = path($doc)->slurp;
+ while ($text =~ /(\w+)/g) {
+ my $word = lc($1);
+ my $word_id = get_word_id($dbh, $word);
+ add_found($dbh, $document_id, $word_id);
+ $word_count++;
+ }
+
+ say "Indexed $title: found $word_count words";
+
+ # disconnect from database
+ $dbh->disconnect();
+}
+
+# get or add value to table
+sub get_or_add_id {
+ my($dbh, $table, $column, $value) = @_;
+
+ for (1..2) {
+ # search document in db
+ my $sth = $dbh->prepare("SELECT id FROM $table WHERE $column = ?");
+ $sth->execute($value);
+ my($id) = $sth->fetchrow();
+ $sth->finish();
+
+ return $id if defined $id;
+
+ # add value if not found
+ $sth = $dbh->prepare("INSERT INTO $table($column) VALUES(?)");
+ $sth->execute($value);
+ $sth->finish();
+ }
+
+ die "failed to insert $column to $table value '$value'";
+}
+
+# get or add document id
+sub get_document_id {
+ my($dbh, $title) = @_;
+ return get_or_add_id($dbh, "documents", "title", $title);
+}
+
+# get or add word id
+sub get_word_id {
+ my($dbh, $word) = @_;
+ return get_or_add_id($dbh, "words", "word", $word);
+}
+
+# add a found location if not already found
+sub add_found {
+ my($dbh, $document_id, $word_id) = @_;
+
+ # search location in db
+ my $sth = $dbh->prepare("
+ SELECT id FROM found
+ WHERE document_id = ?
+ AND word_id = ?");
+ $sth->execute($document_id, $word_id);
+ my($id) = $sth->fetchrow();
+ $sth->finish();
+
+ return if defined($id);
+
+ # location not found, insert in db
+ $sth = $dbh->prepare("INSERT INTO found (document_id, word_id)
+ VALUES(?,?)");
+ $sth->execute($document_id, $word_id);
+ $sth->finish();
+}
+
+# search word
+sub search {
+ my($word) = @_;
+
+ # connect to index database
+ my $dbh = DBI->connect("dbi:SQLite:dbname=".DBFILE,"","",
+ { RaiseError => 1, AutoCommit => 1 });
+
+ # search locations of each word
+ my $sth = $dbh->prepare("
+ SELECT word, title
+ FROM documents, words, found
+ WHERE word = ?
+ AND found.document_id = documents.id
+ AND found.word_id = words.id
+ ORDER BY title");
+
+ $sth->execute($word);
+
+ while (my($word, $title) = $sth->fetchrow()) {
+ say "$word\t$title";
+ }
+
+ $sth->finish();
+
+ # disconnect from database
+ $dbh->disconnect();
+}
diff --git a/challenge-024/paulo-custodio/test.pl b/challenge-024/paulo-custodio/test.pl
new file mode 100644
index 0000000000..a5930b3da4
--- /dev/null
+++ b/challenge-024/paulo-custodio/test.pl
@@ -0,0 +1,77 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+use 5.030;
+use Path::Tiny;
+use Test::More;
+use WWW::Mechanize;
+use utf8::all; # books are in UTF-8
+
+# Challenge 024
+# Task #1
+# Create a smallest script in terms of size that on execution doesn't throw any
+# error. The script doesn't have to do anything special. You could even come up
+# with smallest one-liner.
+#
+# My solution: an empty file! (0 bytes) - Perl executes it and does not throw an error
+is capture("perl perl/ch-1.pl"), "";
+
+# Task #2
+# Create a script to implement full text search functionality using Inverted
+# Index. According to wikipedia:
+#
+# Download a couple of books from https://www.gutenberg.org/ebooks/
+get_book('The Masque of the Red Death.txt',
+ 'https://www.gutenberg.org/files/1064/1064-0.txt');
+get_book('The Fall of the House of Usher.txt',
+ 'https://www.gutenberg.org/cache/epub/932/pg932.txt');
+get_book('The Cask of Amontillado.txt',
+ 'https://www.gutenberg.org/cache/epub/1063/pg1063.txt');
+get_book('The Raven.txt',
+ 'https://www.gutenberg.org/cache/epub/17192/pg17192.txt');
+
+# show index
+is capture("perl perl/ch-2.pl search death"), <<END;
+death The Fall of the House of Usher.txt
+death The Masque of the Red Death.txt
+death The Raven.txt
+END
+
+is capture("perl perl/ch-2.pl search mystery"), <<END;
+mystery The Fall of the House of Usher.txt
+mystery The Raven.txt
+END
+
+is capture("perl perl/ch-2.pl search imagination"), <<END;
+imagination The Fall of the House of Usher.txt
+imagination The Raven.txt
+END
+
+
+done_testing;
+
+sub capture {
+ my($cmd) = @_;
+ my $out = `$cmd`;
+ $out =~ s/[ \t\v\f\r]*\n/\n/g;
+ return $out;
+}
+
+sub run {
+ my($cmd) = @_;
+ ok 0==system($cmd), $cmd;
+}
+
+sub get_book {
+ my($file, $url) = @_;
+ if (!-f $file) {
+ say "Getting $url --> $file";
+ my $mech = WWW::Mechanize->new();
+ $mech->get($url);
+ path($file)->spew($mech->content);
+
+ # build index
+ run("perl perl/ch-2.pl add '$file'");
+ }
+}