diff options
| author | Paulo Custodio <pauloscustodio@gmail.com> | 2021-01-23 15:41:16 +0000 |
|---|---|---|
| committer | Paulo Custodio <pauloscustodio@gmail.com> | 2021-01-23 15:41:16 +0000 |
| commit | e9cb33e05f1a3336cd13945da2d9a899b6f42120 (patch) | |
| tree | 9e5148191a003cdd9428a4d4ae5ea0bfa45fbd53 /challenge-024/paulo-custodio | |
| parent | 45a9178d4865a7289d6856d7756468f896a04e49 (diff) | |
| download | perlweeklychallenge-club-e9cb33e05f1a3336cd13945da2d9a899b6f42120.tar.gz perlweeklychallenge-club-e9cb33e05f1a3336cd13945da2d9a899b6f42120.tar.bz2 perlweeklychallenge-club-e9cb33e05f1a3336cd13945da2d9a899b6f42120.zip | |
Add Perl solution to challenge 024
Diffstat (limited to 'challenge-024/paulo-custodio')
| -rw-r--r-- | challenge-024/paulo-custodio/README | 1 | ||||
| -rw-r--r-- | challenge-024/paulo-custodio/perl/ch-1.pl | 0 | ||||
| -rw-r--r-- | challenge-024/paulo-custodio/perl/ch-2.pl | 179 | ||||
| -rw-r--r-- | challenge-024/paulo-custodio/test.pl | 77 |
4 files changed, 257 insertions, 0 deletions
diff --git a/challenge-024/paulo-custodio/README b/challenge-024/paulo-custodio/README new file mode 100644 index 0000000000..87dc0b2fbd --- /dev/null +++ b/challenge-024/paulo-custodio/README @@ -0,0 +1 @@ +Solution by Paulo Custodio diff --git a/challenge-024/paulo-custodio/perl/ch-1.pl b/challenge-024/paulo-custodio/perl/ch-1.pl new file mode 100644 index 0000000000..e69de29bb2 --- /dev/null +++ b/challenge-024/paulo-custodio/perl/ch-1.pl diff --git a/challenge-024/paulo-custodio/perl/ch-2.pl b/challenge-024/paulo-custodio/perl/ch-2.pl new file mode 100644 index 0000000000..31d780cfa9 --- /dev/null +++ b/challenge-024/paulo-custodio/perl/ch-2.pl @@ -0,0 +1,179 @@ +#!/usr/bin/perl + +# Challenge 019 +# +# Task #2 +# Create a script to implement full text search functionality using Inverted +# Index. According to wikipedia: +# +# In computer science, an inverted index (also referred to as a postings file +# or inverted file) is a database index storing a mapping from content, such as +# words or numbers, to its locations in a table, or in a document or a set of +# documents (named in contrast to a forward index, which maps from documents to +# content). The purpose of an inverted index is to allow fast full-text +# searches, at a cost of increased processing when a document is added to the +# database. + +# Solution: store the inverted index in a SQLite database, use DBI to access it + +use strict; +use warnings; +use 5.030; +use Path::Tiny; +use DBI; +use DBD::SQLite; +use Path::Tiny; +use constant DBFILE => "index.db3"; + +use Data::Dump 'dump'; + +# Create database if index does not exist +BEGIN { + if (! -f DBFILE) { + open(my $p, "| sqlite3 ".DBFILE) or die "cannot start sqlite3"; + say $p <<END; +CREATE TABLE words ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + word TEXT UNIQUE +); +CREATE TABLE documents ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + title TEXT UNIQUE +); +CREATE TABLE found ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + document_id INTEGER, + word_id INTEGER +); +END + close($p) or die "sqlite3 failed"; + } +}; + +# main +my($op, @args) = @ARGV; +if (@ARGV>=2 && $op =~ /^add/i) { + add_doc($_) for @args; +} +elsif (@ARGV>=2 && $op =~ /^sea/) { + search($_) for @args; +} +else { + die "Usage: add documents | search words\n"; +} + + +# add words from document to index +sub add_doc { + my($doc) = @_; + + # get title + my $title = path($doc)->basename; + + # connect to index database + my $dbh = DBI->connect("dbi:SQLite:dbname=".DBFILE,"","", + { RaiseError => 1, AutoCommit => 1 }); + + # get document id + my $document_id = get_document_id($dbh, $title); + + # read document + my $word_count = 0; + my $text = path($doc)->slurp; + while ($text =~ /(\w+)/g) { + my $word = lc($1); + my $word_id = get_word_id($dbh, $word); + add_found($dbh, $document_id, $word_id); + $word_count++; + } + + say "Indexed $title: found $word_count words"; + + # disconnect from database + $dbh->disconnect(); +} + +# get or add value to table +sub get_or_add_id { + my($dbh, $table, $column, $value) = @_; + + for (1..2) { + # search document in db + my $sth = $dbh->prepare("SELECT id FROM $table WHERE $column = ?"); + $sth->execute($value); + my($id) = $sth->fetchrow(); + $sth->finish(); + + return $id if defined $id; + + # add value if not found + $sth = $dbh->prepare("INSERT INTO $table($column) VALUES(?)"); + $sth->execute($value); + $sth->finish(); + } + + die "failed to insert $column to $table value '$value'"; +} + +# get or add document id +sub get_document_id { + my($dbh, $title) = @_; + return get_or_add_id($dbh, "documents", "title", $title); +} + +# get or add word id +sub get_word_id { + my($dbh, $word) = @_; + return get_or_add_id($dbh, "words", "word", $word); +} + +# add a found location if not already found +sub add_found { + my($dbh, $document_id, $word_id) = @_; + + # search location in db + my $sth = $dbh->prepare(" + SELECT id FROM found + WHERE document_id = ? + AND word_id = ?"); + $sth->execute($document_id, $word_id); + my($id) = $sth->fetchrow(); + $sth->finish(); + + return if defined($id); + + # location not found, insert in db + $sth = $dbh->prepare("INSERT INTO found (document_id, word_id) + VALUES(?,?)"); + $sth->execute($document_id, $word_id); + $sth->finish(); +} + +# search word +sub search { + my($word) = @_; + + # connect to index database + my $dbh = DBI->connect("dbi:SQLite:dbname=".DBFILE,"","", + { RaiseError => 1, AutoCommit => 1 }); + + # search locations of each word + my $sth = $dbh->prepare(" + SELECT word, title + FROM documents, words, found + WHERE word = ? + AND found.document_id = documents.id + AND found.word_id = words.id + ORDER BY title"); + + $sth->execute($word); + + while (my($word, $title) = $sth->fetchrow()) { + say "$word\t$title"; + } + + $sth->finish(); + + # disconnect from database + $dbh->disconnect(); +} diff --git a/challenge-024/paulo-custodio/test.pl b/challenge-024/paulo-custodio/test.pl new file mode 100644 index 0000000000..a5930b3da4 --- /dev/null +++ b/challenge-024/paulo-custodio/test.pl @@ -0,0 +1,77 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use 5.030; +use Path::Tiny; +use Test::More; +use WWW::Mechanize; +use utf8::all; # books are in UTF-8 + +# Challenge 024 +# Task #1 +# Create a smallest script in terms of size that on execution doesn't throw any +# error. The script doesn't have to do anything special. You could even come up +# with smallest one-liner. +# +# My solution: an empty file! (0 bytes) - Perl executes it and does not throw an error +is capture("perl perl/ch-1.pl"), ""; + +# Task #2 +# Create a script to implement full text search functionality using Inverted +# Index. According to wikipedia: +# +# Download a couple of books from https://www.gutenberg.org/ebooks/ +get_book('The Masque of the Red Death.txt', + 'https://www.gutenberg.org/files/1064/1064-0.txt'); +get_book('The Fall of the House of Usher.txt', + 'https://www.gutenberg.org/cache/epub/932/pg932.txt'); +get_book('The Cask of Amontillado.txt', + 'https://www.gutenberg.org/cache/epub/1063/pg1063.txt'); +get_book('The Raven.txt', + 'https://www.gutenberg.org/cache/epub/17192/pg17192.txt'); + +# show index +is capture("perl perl/ch-2.pl search death"), <<END; +death The Fall of the House of Usher.txt +death The Masque of the Red Death.txt +death The Raven.txt +END + +is capture("perl perl/ch-2.pl search mystery"), <<END; +mystery The Fall of the House of Usher.txt +mystery The Raven.txt +END + +is capture("perl perl/ch-2.pl search imagination"), <<END; +imagination The Fall of the House of Usher.txt +imagination The Raven.txt +END + + +done_testing; + +sub capture { + my($cmd) = @_; + my $out = `$cmd`; + $out =~ s/[ \t\v\f\r]*\n/\n/g; + return $out; +} + +sub run { + my($cmd) = @_; + ok 0==system($cmd), $cmd; +} + +sub get_book { + my($file, $url) = @_; + if (!-f $file) { + say "Getting $url --> $file"; + my $mech = WWW::Mechanize->new(); + $mech->get($url); + path($file)->spew($mech->content); + + # build index + run("perl perl/ch-2.pl add '$file'"); + } +} |
