diff options
| -rw-r--r-- | challenge-109/james-smith/perl/ch-1.pl | 22 | ||||
| -rw-r--r-- | challenge-109/james-smith/perl/ch-2.pl | 5 | ||||
| -rw-r--r-- | challenge-110/james-smith/README.md | 283 | ||||
| -rw-r--r-- | challenge-110/james-smith/blog.txt | 1 | ||||
| -rw-r--r-- | challenge-110/james-smith/perl/ch-1.pl | 14 | ||||
| -rw-r--r-- | challenge-110/james-smith/perl/ch-2.pl | 104 | ||||
| -rw-r--r-- | challenge-110/james-smith/perl/in.txt | 5 | ||||
| -rw-r--r-- | challenge-110/james-smith/perl/phone_nos.txt | 9 |
8 files changed, 427 insertions, 16 deletions
diff --git a/challenge-109/james-smith/perl/ch-1.pl b/challenge-109/james-smith/perl/ch-1.pl index 1b89226a6a..6fccc59edd 100644 --- a/challenge-109/james-smith/perl/ch-1.pl +++ b/challenge-109/james-smith/perl/ch-1.pl @@ -19,7 +19,7 @@ is( chowla_for($_), $answer[ $_ ] ) foreach 1..20; done_testing(); ## We will quickly run benchmarking... -## This suggests the for loop to be approximately 40-50% +## This suggests the for loop to be approximately 40% ## faster than the map solution... ## It is also 9 characters shorter... @@ -30,19 +30,17 @@ cmpthese(1_000_000, { ## ## Rate Map For -## Map 38670/s -- -33% -## For 57670/s 49% -- +## Map 59524/s -- -26% +## For 79936/s 34% -- ## -sub chowla_map { - my ($t,$n) = (0,@_); ## First attempt - the one-liner is to write this as a map, ## we add $t at the end which is the value returned - ( map { (($n%$_) || ($t+=$_)) && () } 2..$n-1 ), $t; +sub chowla_map { + my ($t,$n) = (0,@_); + ( map { (($n%$_) || ($t+=$_)) && () } 2..$n>>1 ), $t; } -sub chowla_for { - my($t,$n)=(0,@_); ## This time we won't write this as a nasty map/reduce solution... ## @@ -55,12 +53,12 @@ sub chowla_for { ## can be rewritten as: ## ($condition)||($fun()) ## * in perl `foreach` and `for` are synonymous - so we can shorten - - ($n%$_)||($t+=$_) for 2..$n-1; - - ## Now a quick "shortening" - if there is no specific return + ## Finally a quick "shortening" - if there is no specific return ## statement - we can just omit the return in the last statement... +sub chowla_for { + my($t,$n)=(0,@_); + ($n%$_)||($t+=$_) for 2..$n>>1; $t; } diff --git a/challenge-109/james-smith/perl/ch-2.pl b/challenge-109/james-smith/perl/ch-2.pl index 436e0fca19..bafd89bff8 100644 --- a/challenge-109/james-smith/perl/ch-2.pl +++ b/challenge-109/james-smith/perl/ch-2.pl @@ -67,8 +67,6 @@ say ''; sub sep { say '------------------------------------------------------------------------'; } sub show { say "@{$_}" foreach @{$_[0]}; } -sub four_square { - ## For a start we make the observation that ## ## $a + 2$b + $c + 2$d + $e + 2$f + $g = $n * 4 where $n is the total of a square @@ -94,6 +92,7 @@ sub four_square { ## ## We push any valid results to the array +sub four_square { my ($t,@n1,@res) = (0,@_); $t+=$_ foreach @n1; foreach my $b ( @n1 ) { @@ -110,7 +109,6 @@ sub four_square { return \@res; } -sub four_square_non_unique { ## Now let us make no assumption about the numbers... ## We choose 3 from the list... ## We then compute n (and check for no remainder) @@ -123,6 +121,7 @@ sub four_square_non_unique { ## will end up with 2 entries in the list ## where you swap the equivalent values... +sub four_square_non_unique { my ($t,$check,@n1,%res) = (0,"@{[sort @_]}",@_); $t+=$_ foreach @n1; foreach my $i ( 0..@n1-1 ) { diff --git a/challenge-110/james-smith/README.md b/challenge-110/james-smith/README.md index 3b736748ea..8e08f1b553 100644 --- a/challenge-110/james-smith/README.md +++ b/challenge-110/james-smith/README.md @@ -1 +1,282 @@ -Solutions by James Smith. +# Challenge 1 - valid phone numbers... + +You are given a text file - Write a script to display all valid phone numbers in the given text file. + +## Solution 1 + +This is what Perl was designed for - pattern matching strings. The format we accept numbers is one of: + +``` + +nn nnnnnnnnnn + (nn) nnnnnnnnnn + nnnn nnnnnnnnnn +``` + +We can write this as a single regex - there are two parts to the pattern - the prefix and the number +itself. The latter is just a simple number match {we don't specify length as in some countries this +is of different lengths}. + +We group the three prefix patterns into a group match with `(`s and `|`s - remembering to add `?:` so +we save memory by not storing the match. + +We wrap this regex in a function call: +``` perl +sub is_valid_phone_number { + return m{\A\s*(?:[+]\d+|00\d+|[(]\d+[)])\s+\d+\s*\Z}; +} +``` +We can then just use this to grep over the lines of the file.... + +``` perl +print grep { is_valid_phone_number($_) } <>; +``` + +note we use `print` rather than `say` as we haven't stripped the return character from the line! + +# Challenge 2 - transposing a file + +You are given a text file. Write a script to transpose the contents of the given file. + +## Some background - I've seen this before! + +Now this is a problem I have come across before - I work at a genomic institute which handles large +amounts of data - some of these are in the forms of very large tab or comma separated value files +containing many 1000s of rows and many 1000s of columns, e.g. gene expression vs samples, genes vs +variations. These files can be 10G or larger in size.... + +Often these files are generated by automatic processes - but when you want to perform the next step +you would rather work down columns rather than across rows...! + +The problem as was posed on a Python mailing list is that this was taking for ever to run.... +Firstly moving back to python 2 helped, but unless the scripts were run on big memory machine {64G+} +then it was still taking a long time... + +As a non-python developer on the list - I looked to see if this could be easily achieved in Perl, to +show the pythonistas that actually Perl was still a better language for this sort of thing. + +Investigating the problem I realised that the method they were using was a slurp and print model.... +The problem with that for such large files was memory. Once slurped, chopped etc the machine was +swapping OR running out of memory. So had to come up with a cleaner script... I will outline 3 +methods of performing this + +## Solution 2a - The simplest solution - load in and split into arrays of arrays. + +The simplest solution is to slurp the data in the file into an array of array(ref)s. + +This is just a case of a `map` which splits the line into it's chunks - we use chomp +to remove any unneeded line endings. + +Once we have an array of arrays we take the first entry out of each row and printing +these in one line. We repeat this until we reach the end of the lines... Again this +is a simple one liner..... + +``` perl +sub transpose_split { + open my $fh, '<', $_[0]; + my @in = map { chomp;[ split /,/ ] } <$fh>; ## Slurp into array + close $fh; + open $fh, '>', $_[1]; + say {$fh} join ',', map {shift @{$_} } @in while @{$in[0]}; ## Generate transpose; + close $fh; +} +``` + +For an 1000 x 1000 file (6.75Mb), this uses approximately 90Mb of memory. This is +approximately 12-15 x space multiplier.. + +## Solution 2b - Load in but don't split into arrays + +This time we want to look at how we can save memory - one of the problems with the +solution above is the huge overhead of storing data in an array. + +To resolve this we can just slurp all the data into the array, and then use a regex +to strip the characters up to the `,` or "end of line" and output them, until the +strings are empty. + +``` perl +sub transpose_regex { + open my $fh, '<', $_[0]; + my @in = <$fh>; ## Slurp into array + close $fh; + open $fh, '>', $_[1]; + say {$fh} join ',', map { s{^(.*?)[,\r\n]+}{}; $1 } @in while $in[0]; + close $fh; +} +``` +This is more efficient - for the 1000 x 1000 example it only uses approx 9Mb of +memory this is on only about a 1.33 x space multiplier... This still becomes more +of an issue as the file size gets larger + +## Solution 2c - The ultimate large file solution - using `tell`, `seek` and `read`; + +Finally a solution for "very" large-files... + +Rather than slurping all the data into memory in one go - we instead pull content of +each of the lines a few bytes at a time, working with that - and going back to the +file system to retrieve more data. + +Perl has some simple functions for moving the "head" around in the file: + + * `tell` - return the current location in the file + * `seek` - set the current location in the file. + +Additionally it has another way of accessing the file content rather than `<$fh>`, +by using the lower level `read` command. This reads contents for the file and +stores it into a scalar variable. + +So this requires a "double" pass solution - first of all we use `<$fh>` to retrieve +every line of data from the file, noting the start and end of the line {in byte offsets} and +remebering the first `$BYTES` bytes from that line {in this example we set this to 1000 bytes}. + +We then use a similar approach to the previous one - where we use a regex to find the value +of the first column. But if we can't find a comma OR the end of the line - we retrieve another +`$BYTES` bytes of data to the string (until we can find a comma or the end of the line). + +If we get to the end of the line we only retrieve the number of bytes left in that line. + +``` perl +sub transpose_seek { + my($prev,@pos) = (0); + open my $fh, '<', $_[0]; + open my $ofh, '>', $_[1]; + ## Loop through the file and get the start/end of each line + push ( @pos, [$prev+$BYTES,tell $fh,substr $_,0,$BYTES]) && + ( ($pos[-1][0]>$pos[-1][1]) && ($pos[-1][0]=$pos[-1][1]), $prev=tell $fh) while <$fh>; + + while( $pos[0][0] < $pos[0][1] || length $pos[0][2] ) { + my @line; + foreach(@pos) { + while( $_->[2] !~ m{,} && $_->[0] < $_->[1] ) { + seek $fh, $_->[0], 0; + read $fh, $_->[2], $_->[1]-$_->[0] > $BYTES ? $BYTES : $_->[1]-$_->[0], length $_->[2]; + $_->[0] = tell $fh; + } + $_->[2] =~ s{^([^,\r\n]+)[,\r\n]*}{}; + push @line, $1; + } + say ${$ofh} join q(,), @line; + } +} +``` + +### Notes on this code... + + * `@pos` is an array of triples: + + * **`[0]`** - the current offset in the file for the next chunk of this line; + + * **`[1]`** - the end of the current line; + + * **`[2]`** - the buffer for this line (initially the first `$BYTES` bytes) + + * Let's look at the three while loops.... + + * The push/while line slurping in data + + We use `while <$fh>` here rather than `foreach` as `while` doesn't accumulate + data into an array, where as `foreach <$fh>` does, saving memory. + + ``` perl + while( <$fh> ) { + push ( @pos, [$prev+$BYTES,tell $fh,substr $_,0,$BYTES]) && + ( + ($pos[-1][0]>$pos[-1][1]) && ($pos[-1][0]=$pos[-1][1]), + $prev=tell $fh + ); + } + ``` + + * This uses a lot of ***1-liner*** tricks - first of all we push the start, end and first + `$BYTES` bytes into the @pos array. This always evaluates to true, so we run the second + bracketed statement. This (a) sets the start of the next block to the end of this current + line if we have overshot, and then (b) updates the `$prev` value so we know where the + next line starts in the file.... + + * Now let us look at the 2nd while loop (outer)... + + ``` perl + while( $pos[0][0] < $pos[0][1] || length $pos[0][2] ) { + + } + ``` + This is checking that (a) we haven't run out of data to be retrieved for the first and + (b) haven't run out of data we have already retrieved. + + + * Finally the 3rd while loop (inner)... + + ``` perl + while( $_->[2] !~ m{,} && $_->[0] < $_->[1] ) { + seek $fh, $_->[0], 0; + read $fh, $_->[2], $_->[1]-$_->[0] > $BYTES ? $BYTES : $_->[1]-$_->[0], length $_->[2]; + $_->[0] = tell $fh; + } + ``` + In this loop we see if the row does not contain a comma AND there is data left... If this is the + case we have to retrieve more data from the file. We do this by first `seek`ing to the location + in the file that we need to get data from. We then retrieve the either $BYTES `bytes` of data (or + all the data left for the row {if it is less than `$BYTES` bytes.} + We then update the location for that particular row (using `tell`). + + Note also we use the 4 parameter version of read. + + `read $fh, $buffer, $bytes, $offset` + + By adding the offset - we can easily append this content onto the end of our buffer string. We have + to use length `$_->[2]` as you can use -ve indecies to read into the buffer with an offset from the + end - but this only works for -1, -2 etc not "-0". + + * We then use the regex trick to get the first column of the data. + + * Memory usage: + + * This script does not load the file all in one go - so really needs a lot less memory + (vs more disc accesses). It is linear in the number of lines, e.g. for the 1000 line file we load in + roughly 1Mb of data at a time, and the memory usage is roughly 1.3Mb. + * Note this is `O(n)` as well as if the rows get longer then the number of bytes used does not increase. + * Having played a bit - the sweet spot of `$BYTES` lies somewhere between 1K and 2K. Smaller makes the + regex in the split more efficient, larger reduces the file IO overhead. + +### Some information about speed/memory etc... + +The following are timings on a single core, 2G RAM, 4G swap machine: + +| Method/size | Time (s) | Kbytes | resident | shared | +| ----------- | -------: | -----: | -------: | -----: | +| Seek small | 0.001 | 16016| 7836| 5228 | +| Regex small | 0.000 | 16016| 7836| 5228 | +| Split small | 0.000 | 16016| 7836| 5228 | +| Seek 1000 | 1.346 | 17388| 9320| 5228 | +| Seek 2000 | 5.841 | 18848| 10636| 5228 | +| Seek 5000 | 54.208 | 23044| 14972| 5228 | +| Regex 1000 | 1.293 | 25492| 17288| 5228 | +| Seek 30000 | 3003.220 | 57312| 43948| 2720 | +| Regex 2000 | 9.040 | 63896| 51376| 3140 | +| Split 1000 | 0.934 | 105784| 93100| 3204 | +| Regex 5000 | 130.411 | 260432| 248016| 3204 | +| Split 2000 | 6.780 | 362028| 349388| 3204 | +| Split 5000 | 527.614 | 2153576| 1423468| 2764 | + +The size is the number of rows/columns - so the "1000" file has 1000 rows and 1000 columns (+row/column labels). + +File sizes: + +| name | size | row size | +| ----- | -----: | ----: | +| small | 61 bytes | 12 | +| 1000 | 6.6 Mbytes | 6.7K | +| 2000 | 27 Mbytes | 13.5K | +| 5000 | 165 Mbytes | 33.6K | +| 30000 | 5.8 Gbytes | 201.0K | + +If we look at the timings by method we can see that for the smaller files the `split` is +the most efficient {but the difference is relatively small}. But as the file size increases +then it soon becomes the least efficient: + +| Size | Split | Regex | Seek | +| -----: | ----: | ----: | ----: | +| small | **0.000** | 0.000 | *0.001* | +| 1000 | **0.934** | 1.293 | *1.346* | +| 2000 | 6.890 | *9.040* | **5.841** | +| 5000 | *527.614* | 130.411 | **54.208** | +| 30000 | - | - | **3003.220** | diff --git a/challenge-110/james-smith/blog.txt b/challenge-110/james-smith/blog.txt new file mode 100644 index 0000000000..017d59262e --- /dev/null +++ b/challenge-110/james-smith/blog.txt @@ -0,0 +1 @@ +https://github.com/drbaggy/perlweeklychallenge-club/tree/master/challenge-110/james-smith diff --git a/challenge-110/james-smith/perl/ch-1.pl b/challenge-110/james-smith/perl/ch-1.pl new file mode 100644 index 0000000000..f47ec05169 --- /dev/null +++ b/challenge-110/james-smith/perl/ch-1.pl @@ -0,0 +1,14 @@ +#!/usr/local/bin/perl + +use strict; + +use warnings; +use feature qw(say); +use Test::More; + +print grep { is_valid_phone_number($_) } <>; + +sub is_valid_phone_number { + return m{\A\s*(?:[+]\d+|00\d+|[(]\d+[)])\s+\d+\s*\Z}; +} + diff --git a/challenge-110/james-smith/perl/ch-2.pl b/challenge-110/james-smith/perl/ch-2.pl new file mode 100644 index 0000000000..4f543ff569 --- /dev/null +++ b/challenge-110/james-smith/perl/ch-2.pl @@ -0,0 +1,104 @@ +#!/usr/local/bin/perl + +use strict; + +use warnings; +use feature qw(say); +use Data::Dumper; +use Const::Fast qw(const); +use Time::HiRes qw(time); +const my $FN_TINY => 'in.txt'; +#const my $FN_SMALL => 'in-large.txt'; +const my $FN_SMALL => 'in-1000.txt'; +const my $FN_MEDIUM => 'in-2000.txt'; +const my $FN_LARGE => 'in-5000.txt'; +const my $FN_MASSIVE => 'in-30000.txt'; +const my $BYTES => 1_024; + +select(STDOUT); $| = 1; + +## For a small file we can use just about any valid technique... +## But here I will look at some different techniques which can +## be more efficient with medium, large data files. +## +## Some of these are a trade off between performance and +## +## At work we have large 2d arrays of data - e.g. genes vs samples, +## variations vs genes - often both dimensions are in the range 20,000 +## lines or more. So we need to look at faster - alternative less +## memory intense solutions +## +## The last function transpose_seek - was my solution to speeding +## up a python script +## +## Python 3 -> Python 2 sped things up by a factor or 4, the perl +## rewrite sped things up by a factor of 100 for the size of arrays +## we were looking at. + +my $t0; + + $t0 = time; transpose_seek( $FN_TINY, 'seek-small' ); say 'Seek small - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info(); + $t0 = time; transpose_regex( $FN_TINY, 'regex-small' ); say 'Regex small - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info(); + $t0 = time; transpose_split( $FN_TINY, 'split-small' ); say 'Split small - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info(); + $t0 = time; transpose_seek( $FN_SMALL, 'seek-1000' ); say 'Seek 1000 - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info(); + $t0 = time; transpose_seek( $FN_MEDIUM, 'seek-2000' ); say 'Seek 2000 - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info(); + $t0 = time; transpose_seek( $FN_LARGE, 'seek-5000' ); say 'Seek 5000 - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info(); + $t0 = time; transpose_regex( $FN_SMALL, 'regex-1000' ); say 'Regex 1000 - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info(); + $t0 = time; transpose_seek( $FN_MASSIVE, 'seek-30000' ); say 'Seek 30000 - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info(); + $t0 = time; transpose_regex( $FN_MEDIUM, 'regex-2000' ); say 'Regex 2000 - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info(); + $t0 = time; transpose_split( $FN_SMALL, 'split-1000' ); say 'Split 1000 - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info(); + $t0 = time; transpose_regex( $FN_LARGE, 'regex-5000' ); say 'Regex 5000 - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info(); + $t0 = time; transpose_split( $FN_MEDIUM, 'split-2000' ); say 'Split 2000 - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info(); + $t0 = time; transpose_split( $FN_LARGE, 'split-5000' ); say 'Split 5000 - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info(); + +sub transpose_split { + open my $fh, '<', $_[0]; + my @in = map { chomp;[ split /,/ ] } <$fh>; ## Slurp into array + close $fh; + open $fh, '>', $_[1]; + say {$fh} join ',', map {shift @{$_} } @in while @{$in[0]}; ## Generate transpose; + close $fh; +} + +sub transpose_regex { + open my $fh, '<', $_[0]; + my @in = <$fh>; ## Slurp into array + close $fh; + open $fh, '>', $_[1]; + say {$fh} join ',', map { s{^(.*?)[,\r\n]+}{}; $1 } @in while $in[0]; + close $fh; +} + +sub transpose_seek { + my($prev,@pos) = (0); + open my $fh, '<', $_[0]; + open my $ofh, '>', $_[1]; + ## Loop through the file and get the start/end of each line + push (@pos, [$prev+$BYTES,tell $fh,substr $_,0,$BYTES]) && (($pos[-1][0]>$pos[-1][1])&&($pos[-1][0]=$pos[-1][1]), $prev=tell $fh) while <$fh>; + + while( $pos[0][0] < $pos[0][1] || length $pos[0][2] ) { + my @line; + foreach(@pos) { + while( $_->[2] !~ m{,} && $_->[0] < $_->[1] ) { + seek $fh, $_->[0], 0; + read $fh, $_->[2], $_->[1]-$_->[0] > $BYTES ? $BYTES : $_->[1]-$_->[0], length $_->[2]; + $_->[0] = tell $fh; + } + $_->[2] =~ s{^([^,\r\n]+)[,\r\n]*}{}; + push @line, $1; + } + say ${$ofh} join q(,), @line; + } +} + +sub get_statm_info { + my $error = ''; + my $ref = {}; + + if( open(_INFO,"</proc/$$/statm") ){ + my @info = split(/\s+/,<_INFO>); + close(_INFO); + return sprintf 'Size: %8d, Resident: %8d, Shared: %8d', $info[0]*4, $info[1]*4, $info[2]*4; + } + return '-'; +} diff --git a/challenge-110/james-smith/perl/in.txt b/challenge-110/james-smith/perl/in.txt new file mode 100644 index 0000000000..716ebdce75 --- /dev/null +++ b/challenge-110/james-smith/perl/in.txt @@ -0,0 +1,5 @@ +name,age,sex +Mohammad,45,m +Joe,20,m +Julie,35,f +Cristina,10,f diff --git a/challenge-110/james-smith/perl/phone_nos.txt b/challenge-110/james-smith/perl/phone_nos.txt new file mode 100644 index 0000000000..4f167ab35b --- /dev/null +++ b/challenge-110/james-smith/perl/phone_nos.txt @@ -0,0 +1,9 @@ +0044 1148820341 + +44 1148820341 + 44-11-4882-0341 +(44) 1148820341 + (44) 1148820341 + 00 1148820341 +00 1243143+1414 + () 342432 +(00) 12321-4324 |
