Add Perl solution to challenge 024

author: Paulo Custodio <pauloscustodio@gmail.com> 2021-01-23 15:41:16 +0000
committer: Paulo Custodio <pauloscustodio@gmail.com> 2021-01-23 15:41:16 +0000
commit: e9cb33e05f1a3336cd13945da2d9a899b6f42120 (patch)
tree: 9e5148191a003cdd9428a4d4ae5ea0bfa45fbd53 /challenge-024/paulo-custodio
parent: 45a9178d4865a7289d6856d7756468f896a04e49 (diff)
download: perlweeklychallenge-club-e9cb33e05f1a3336cd13945da2d9a899b6f42120.tar.gz
perlweeklychallenge-club-e9cb33e05f1a3336cd13945da2d9a899b6f42120.tar.bz2
perlweeklychallenge-club-e9cb33e05f1a3336cd13945da2d9a899b6f42120.zip
4 files changed, 257 insertions, 0 deletions
diff --git a/challenge-024/paulo-custodio/README b/challenge-024/paulo-custodio/README
new file mode 100644
index 0000000000..87dc0b2fbd
--- /dev/null
+++ b/challenge-024/paulo-custodio/README
@@ -0,0 +1 @@
+Solution by Paulo Custodio
diff --git a/challenge-024/paulo-custodio/perl/ch-1.pl b/challenge-024/paulo-custodio/perl/ch-1.pl
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/challenge-024/paulo-custodio/perl/ch-1.pl
diff --git a/challenge-024/paulo-custodio/perl/ch-2.pl b/challenge-024/paulo-custodio/perl/ch-2.pl
new file mode 100644
index 0000000000..31d780cfa9
--- /dev/null
+++ b/challenge-024/paulo-custodio/perl/ch-2.pl
@@ -0,0 +1,179 @@
+#!/usr/bin/perl
+
+# Challenge 019
+#
+# Task #2
+# Create a script to implement full text search functionality using Inverted 
+# Index. According to wikipedia:
+# 
+# In computer science, an inverted index (also referred to as a postings file 
+# or inverted file) is a database index storing a mapping from content, such as
+# words or numbers, to its locations in a table, or in a document or a set of
+# documents (named in contrast to a forward index, which maps from documents to
+# content). The purpose of an inverted index is to allow fast full-text 
+# searches, at a cost of increased processing when a document is added to the
+# database.
+
+# Solution: store the inverted index in a SQLite database, use DBI to access it
+
+use strict;
+use warnings;
+use 5.030;
+use Path::Tiny;
+use DBI;
+use DBD::SQLite;
+use Path::Tiny;
+use constant DBFILE => "index.db3";
+
+use Data::Dump 'dump';
+
+# Create database if index does not exist
+BEGIN {
+    if (! -f DBFILE) {
+        open(my $p, "| sqlite3 ".DBFILE) or die "cannot start sqlite3";
+        say $p <<END;
+CREATE TABLE words (
+    id          INTEGER PRIMARY KEY AUTOINCREMENT,
+    word        TEXT UNIQUE
+);
+CREATE TABLE documents (
+    id          INTEGER PRIMARY KEY AUTOINCREMENT,
+    title       TEXT UNIQUE
+);
+CREATE TABLE found (
+    id          INTEGER PRIMARY KEY AUTOINCREMENT,
+    document_id INTEGER,
+    word_id     INTEGER
+);
+END
+        close($p) or die "sqlite3 failed";
+    }
+};
+
+# main 
+my($op, @args) = @ARGV;
+if (@ARGV>=2 && $op =~ /^add/i) {
+	add_doc($_) for @args;
+}
+elsif (@ARGV>=2 && $op =~ /^sea/) {
+	search($_) for @args;
+}
+else {
+	die "Usage: add documents | search words\n";
+}
+
+
+# add words from document to index
+sub add_doc {
+	my($doc) = @_;
+	
+	# get title
+	my $title = path($doc)->basename;
+
+	# connect to index database
+	my $dbh = DBI->connect("dbi:SQLite:dbname=".DBFILE,"","", 
+						   { RaiseError => 1, AutoCommit => 1 });
+
+	# get document id
+	my $document_id = get_document_id($dbh, $title);
+	
+	# read document
+	my $word_count = 0;
+	my $text = path($doc)->slurp;
+	while ($text =~ /(\w+)/g) {
+		my $word = lc($1);
+		my $word_id = get_word_id($dbh, $word);
+		add_found($dbh, $document_id, $word_id);
+		$word_count++;
+	}
+	
+	say "Indexed $title: found $word_count words";
+	
+	# disconnect from database
+	$dbh->disconnect();
+}
+
+# get or add value to table
+sub get_or_add_id {
+	my($dbh, $table, $column, $value) = @_;
+	
+	for (1..2) {
+		# search document in db
+		my $sth = $dbh->prepare("SELECT id FROM $table WHERE $column = ?");
+		$sth->execute($value); 
+		my($id) = $sth->fetchrow();
+		$sth->finish();
+
+		return $id if defined $id;
+		
+		# add value if not found
+		$sth = $dbh->prepare("INSERT INTO $table($column) VALUES(?)");
+		$sth->execute($value); 
+		$sth->finish();
+	}
+	
+	die "failed to insert $column to $table value '$value'";
+}
+
+# get or add document id
+sub get_document_id {
+	my($dbh, $title) = @_;
+	return get_or_add_id($dbh, "documents", "title", $title);
+}
+
+# get or add word id
+sub get_word_id {
+	my($dbh, $word) = @_;
+	return get_or_add_id($dbh, "words", "word", $word);
+}
+
+# add a found location if not already found
+sub add_found {
+	my($dbh, $document_id, $word_id) = @_;
+	
+	# search location in db
+	my $sth = $dbh->prepare("
+		SELECT id FROM found 
+		WHERE document_id = ? 
+		  AND word_id     = ?");
+	$sth->execute($document_id, $word_id); 
+	my($id) = $sth->fetchrow();
+	$sth->finish();
+	
+	return if defined($id);
+	
+	# location not found, insert in db
+	$sth = $dbh->prepare("INSERT INTO found (document_id, word_id) 
+						  VALUES(?,?)");
+	$sth->execute($document_id, $word_id); 
+	$sth->finish();
+}
+
+# search word
+sub search {
+	my($word) = @_;
+	
+	# connect to index database
+	my $dbh = DBI->connect("dbi:SQLite:dbname=".DBFILE,"","", 
+						   { RaiseError => 1, AutoCommit => 1 });
+
+	# search locations of each word
+	my $sth = $dbh->prepare("
+		SELECT word, title
+		FROM documents, words, found 
+		WHERE word = ?
+		  AND found.document_id = documents.id 
+		  AND found.word_id     = words.id
+		ORDER BY title");
+	
+	$sth->execute($word); 
+
+	while (my($word, $title) = $sth->fetchrow()) {
+		say "$word\t$title";
+	}
+
+	$sth->finish();
+
+	# disconnect from database
+	$dbh->disconnect();
+}
diff --git a/challenge-024/paulo-custodio/test.pl b/challenge-024/paulo-custodio/test.pl
new file mode 100644
index 0000000000..a5930b3da4
--- /dev/null
+++ b/challenge-024/paulo-custodio/test.pl
@@ -0,0 +1,77 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+use 5.030;
+use Path::Tiny;
+use Test::More;
+use WWW::Mechanize;
+use utf8::all;		# books are in UTF-8
+
+# Challenge 024
+# Task #1
+# Create a smallest script in terms of size that on execution doesn't throw any
+# error. The script doesn't have to do anything special. You could even come up 
+# with smallest one-liner.
+# 
+# My solution: an empty file! (0 bytes) - Perl executes it and does not throw an error
+is capture("perl perl/ch-1.pl"), "";
+
+# Task #2
+# Create a script to implement full text search functionality using Inverted 
+# Index. According to wikipedia:
+#
+# Download a couple of books from https://www.gutenberg.org/ebooks/
+get_book('The Masque of the Red Death.txt',	
+		 'https://www.gutenberg.org/files/1064/1064-0.txt');
+get_book('The Fall of the House of Usher.txt',	
+		 'https://www.gutenberg.org/cache/epub/932/pg932.txt');
+get_book('The Cask of Amontillado.txt',	
+		 'https://www.gutenberg.org/cache/epub/1063/pg1063.txt');
+get_book('The Raven.txt',	
+		 'https://www.gutenberg.org/cache/epub/17192/pg17192.txt');
+
+# show index
+is capture("perl perl/ch-2.pl search death"), <<END;
+death	The Fall of the House of Usher.txt
+death	The Masque of the Red Death.txt
+death	The Raven.txt
+END
+
+is capture("perl perl/ch-2.pl search mystery"), <<END;
+mystery	The Fall of the House of Usher.txt
+mystery	The Raven.txt
+END
+
+is capture("perl perl/ch-2.pl search imagination"), <<END;
+imagination	The Fall of the House of Usher.txt
+imagination	The Raven.txt
+END
+
+
+done_testing;
+
+sub capture {
+	my($cmd) = @_;
+	my $out = `$cmd`;
+	$out =~ s/[ \t\v\f\r]*\n/\n/g;
+	return $out;
+}
+
+sub run {
+	my($cmd) = @_;
+	ok 0==system($cmd), $cmd;
+}
+
+sub get_book {
+	my($file, $url) = @_;
+	if (!-f $file) {
+		say "Getting $url --> $file";
+		my $mech = WWW::Mechanize->new();
+		$mech->get($url);
+		path($file)->spew($mech->content);
+		
+		# build index
+		run("perl perl/ch-2.pl add '$file'");
+	}
+}
author	Paulo Custodio <pauloscustodio@gmail.com>	2021-01-23 15:41:16 +0000
committer	Paulo Custodio <pauloscustodio@gmail.com>	2021-01-23 15:41:16 +0000
commit	e9cb33e05f1a3336cd13945da2d9a899b6f42120 (patch)
tree	9e5148191a003cdd9428a4d4ae5ea0bfa45fbd53 /challenge-024/paulo-custodio
parent	45a9178d4865a7289d6856d7756468f896a04e49 (diff)
download	perlweeklychallenge-club-e9cb33e05f1a3336cd13945da2d9a899b6f42120.tar.gz perlweeklychallenge-club-e9cb33e05f1a3336cd13945da2d9a899b6f42120.tar.bz2 perlweeklychallenge-club-e9cb33e05f1a3336cd13945da2d9a899b6f42120.zip