#! /opt/local/bin/perl
#
#       inverted_index.pl
#
#       Builds an inverted index (word => list of documents containing that
#       word) from the files named on the command line, reads one line of
#       whitespace-separated search terms from STDIN, and reports
#         - the documents that contain every search term, and
#         - every document that contains at least one term, ordered by how
#           many of the terms it contains.
#
#       usage:  perl inverted_index.pl FILE [FILE ...]
#
#       Provenance: recovered from commit
#       dd6293e6c7c4752f20974bb223deeb8c1c8b8373 by Mohammad S Anwar
#       (Mon, 9 Sep 2019 01:23:00 +0100, "- Added solutions by Colin Crain."),
#       which created this file as challenge-024/colin-crain/perl5/ch-2.pl
#       (98 lines).  The same commit created
#       challenge-024/colin-crain/perl5/ch-1.pl, whose entire content is the
#       single line:
#
#           perl -e “”
#
#       2019 colin crain
## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ##

use warnings;
use strict;
use feature ":5.26";

## ## ## ## ## MAIN

## Run the interactive part only when executed as a script; when the file is
## loaded with require/do (e.g. from a test) only the subs are defined.
main(@ARGV) unless caller;

sub main {
## takes the list of document file names
## prompts for search terms on STDIN and prints both result lists
    my @docs = @_;

    say "enter search terms: ";

    ## EOF on STDIN yields undef; treat that as "no search terms"
    my $line = <STDIN> // '';

    ## split ' ' skips leading whitespace and collapses runs of whitespace,
    ## so no empty-string "terms" are produced
    my @TEST_TERMS = split ' ', $line;

    my $idx = make_index( @docs );

    my ($match, $all) = search_index($idx, @TEST_TERMS);

    say "matched all:\n", (join ", ", $match->@*);
    say "by prevalence:\n", (join "\n ", $all->@*);

    return;
}


## ## ## ## ## SUBS

sub make_index {
## takes a list of file names
## returns a hashref mapping each whitespace-delimited word to an arrayref
## of the files that contain it (each file listed once per word, in the
## order the files were given)
## dies if a file cannot be opened
    my @documents = @_;

    my $idx = {};

    ## a file named twice must not be counted twice, so filter the list first
    for my $file ( uniq( @documents ) ) {
        open (my $fh, "<" , $file) or die "can't open $file to read: $!\n";

        ## slurp the whole file: with $/ undefined a scalar-context read
        ## returns everything, not just the first line
        my $text = do { local $/; <$fh> };
        close $fh;
        $text //= '';

        ## split on runs of whitespace and run the words through a unique
        ## filter, then add the file to each word's document list
        for my $word ( uniq( split ' ', $text ) ) {
            push $idx->{$word}->@*, $file;
        }
    }

    return $idx;
}

sub search_index {
## takes an index hashref and a list of search terms
## returns two arrayrefs, the first of docs with all terms,
## the second a sorted list of documents by term prevalence
## ( most words found first, will include all documents with at least one word)
## both are empty when no terms are given or nothing matches
    my ($idx, @words) = @_;

    ## a repeated search term must not count twice towards prevalence
    my @terms = uniq( @words );

    ## count, per document, how many distinct terms it contains, and
    ## remember the documents in first-seen order
    my %result_count;
    my @unique_result;
    for my $search_term ( @terms ) {
        next unless exists $idx->{$search_term};
        for my $document ( $idx->{$search_term}->@* ) {
            push @unique_result, $document unless $result_count{$document}++;
        }
    }

    ## if the occurrence count equals the search term count, all terms match to the document
    my @all_terms = grep { $result_count{$_} == scalar @terms } @unique_result;

    ## alternately, sort the unique result list on prevalence
    my @by_prevalence = sort { $result_count{$b} <=> $result_count{$a} } @unique_result;

    return (\@all_terms, \@by_prevalence);
}

sub uniq {
## filters a list for unique terms, with first occurrence preserved
## we could grab List::Util::uniq but here we roll our own with the "seen" trick
    my %seen;
    return grep { ! $seen{$_}++ } @_;
}

## true value so the file can also be loaded with require
1;