aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--challenge-109/james-smith/perl/ch-1.pl22
-rw-r--r--challenge-109/james-smith/perl/ch-2.pl5
-rw-r--r--challenge-110/james-smith/README.md283
-rw-r--r--challenge-110/james-smith/blog.txt1
-rw-r--r--challenge-110/james-smith/perl/ch-1.pl14
-rw-r--r--challenge-110/james-smith/perl/ch-2.pl104
-rw-r--r--challenge-110/james-smith/perl/in.txt5
-rw-r--r--challenge-110/james-smith/perl/phone_nos.txt9
8 files changed, 427 insertions, 16 deletions
diff --git a/challenge-109/james-smith/perl/ch-1.pl b/challenge-109/james-smith/perl/ch-1.pl
index 1b89226a6a..6fccc59edd 100644
--- a/challenge-109/james-smith/perl/ch-1.pl
+++ b/challenge-109/james-smith/perl/ch-1.pl
@@ -19,7 +19,7 @@ is( chowla_for($_), $answer[ $_ ] ) foreach 1..20;
done_testing();
## We will quickly run benchmarking...
-## This suggests the for loop to be approximately 40-50%
+## This suggests the for loop to be approximately 40%
## faster than the map solution...
## It is also 9 characters shorter...
@@ -30,19 +30,17 @@ cmpthese(1_000_000, {
##
## Rate Map For
-## Map 38670/s -- -33%
-## For 57670/s 49% --
+## Map 59524/s -- -26%
+## For 79936/s 34% --
##
-sub chowla_map {
- my ($t,$n) = (0,@_);
## First attempt - the one-liner is to write this as a map,
## we add $t at the end which is the value returned
- ( map { (($n%$_) || ($t+=$_)) && () } 2..$n-1 ), $t;
+sub chowla_map {
+ my ($t,$n) = (0,@_);
+ ( map { (($n%$_) || ($t+=$_)) && () } 2..$n>>1 ), $t;
}
-sub chowla_for {
- my($t,$n)=(0,@_);
## This time we won't write this as a nasty map/reduce solution...
##
@@ -55,12 +53,12 @@ sub chowla_for {
## can be rewritten as:
## ($condition)||($fun())
## * in perl `foreach` and `for` are synonymous - so we can shorten
-
- ($n%$_)||($t+=$_) for 2..$n-1;
-
- ## Now a quick "shortening" - if there is no specific return
+ ## Finally a quick "shortening" - if there is no specific return
## statement - we can just omit the return in the last statement...
+sub chowla_for {
+ my($t,$n)=(0,@_);
+ ($n%$_)||($t+=$_) for 2..$n>>1;
$t;
}
diff --git a/challenge-109/james-smith/perl/ch-2.pl b/challenge-109/james-smith/perl/ch-2.pl
index 436e0fca19..bafd89bff8 100644
--- a/challenge-109/james-smith/perl/ch-2.pl
+++ b/challenge-109/james-smith/perl/ch-2.pl
@@ -67,8 +67,6 @@ say '';
sub sep { say '------------------------------------------------------------------------'; }
sub show { say "@{$_}" foreach @{$_[0]}; }
-sub four_square {
-
## For a start we make the observation that
##
## $a + 2$b + $c + 2$d + $e + 2$f + $g = $n * 4 where $n is the total of a square
@@ -94,6 +92,7 @@ sub four_square {
##
## We push any valid results to the array
+sub four_square {
my ($t,@n1,@res) = (0,@_);
$t+=$_ foreach @n1;
foreach my $b ( @n1 ) {
@@ -110,7 +109,6 @@ sub four_square {
return \@res;
}
-sub four_square_non_unique {
## Now let us make no assumption about the numbers...
## We choose 3 from the list...
## We then compute n (and check for no remainder)
@@ -123,6 +121,7 @@ sub four_square_non_unique {
## will end up with 2 entries in the list
## where you swap the equivalent values...
+sub four_square_non_unique {
my ($t,$check,@n1,%res) = (0,"@{[sort @_]}",@_);
$t+=$_ foreach @n1;
foreach my $i ( 0..@n1-1 ) {
diff --git a/challenge-110/james-smith/README.md b/challenge-110/james-smith/README.md
index 3b736748ea..8e08f1b553 100644
--- a/challenge-110/james-smith/README.md
+++ b/challenge-110/james-smith/README.md
@@ -1 +1,282 @@
-Solutions by James Smith.
+# Challenge 1 - valid phone numbers...
+
+You are given a text file - Write a script to display all valid phone numbers in the given text file.
+
+## Solution 1
+
+This is what Perl was designed for - pattern matching strings. The format we accept numbers is one of:
+
+```
+ +nn nnnnnnnnnn
+ (nn) nnnnnnnnnn
+ nnnn nnnnnnnnnn
+```
+
+We can write this as a single regex - there are two parts to the pattern - the prefix and the number
+itself. The latter is just a simple number match {we don't specify length as in some countries this
+is of different lengths}.
+
+We group the three prefix patterns into a group match with `(`s and `|`s - remembering to add `?:` so
+we save memory by not storing the match.
+
+We wrap this regex in a function call:
+``` perl
+sub is_valid_phone_number {
+ return m{\A\s*(?:[+]\d+|00\d+|[(]\d+[)])\s+\d+\s*\Z};
+}
+```
+We can then just use this to grep over the lines of the file....
+
+``` perl
+print grep { is_valid_phone_number($_) } <>;
+```
+
+note we use `print` rather than `say` as we haven't stripped the return character from the line!
+
+# Challenge 2 - transposing a file
+
+You are given a text file. Write a script to transpose the contents of the given file.
+
+## Some background - I've seen this before!
+
+Now this is a problem I have come across before - I work at a genomic institute which handles large
+amounts of data - some of these are in the forms of very large tab or comma separated value files
+containing many 1000s of rows and many 1000s of columns, e.g. gene expression vs samples, genes vs
+variations. These files can be 10G or larger in size....
+
+Often these files are generated by automatic processes - but when you want to perform the next step
+you would rather work down columns rather than across rows...!
+
+The problem as was posed on a Python mailing list is that this was taking for ever to run....
+Firstly moving back to python 2 helped, but unless the scripts were run on big memory machine {64G+}
+then it was still taking a long time...
+
+As a non-python developer on the list - I looked to see if this could be easily achieved in Perl, to
+show the pythonistas that actually Perl was still a better language for this sort of thing.
+
+Investigating the problem I realised that the method they were using was a slurp and print model....
+The problem with that for such large files was memory. Once slurped, chopped etc the machine was
+swapping OR running out of memory. So had to come up with a cleaner script... I will outline 3
+methods of performing this
+
+## Solution 2a - The simplest solution - load in and split into arrays of arrays.
+
+The simplest solution is to slurp the data in the file into an array of array(ref)s.
+
+This is just a case of a `map` which splits the line into it's chunks - we use chomp
+to remove any unneeded line endings.
+
+Once we have an array of arrays we take the first entry out of each row and printing
+these in one line. We repeat this until we reach the end of the lines... Again this
+is a simple one liner.....
+
+``` perl
+sub transpose_split {
+ open my $fh, '<', $_[0];
+ my @in = map { chomp;[ split /,/ ] } <$fh>; ## Slurp into array
+ close $fh;
+ open $fh, '>', $_[1];
+ say {$fh} join ',', map {shift @{$_} } @in while @{$in[0]}; ## Generate transpose;
+ close $fh;
+}
+```
+
+For an 1000 x 1000 file (6.75Mb), this uses approximately 90Mb of memory. This is
+approximately 12-15 x space multiplier..
+
+## Solution 2b - Load in but don't split into arrays
+
+This time we want to look at how we can save memory - one of the problems with the
+solution above is the huge overhead of storing data in an array.
+
+To resolve this we can just slurp all the data into the array, and then use a regex
+to strip the characters up to the `,` or "end of line" and output them, until the
+strings are empty.
+
+``` perl
+sub transpose_regex {
+ open my $fh, '<', $_[0];
+ my @in = <$fh>; ## Slurp into array
+ close $fh;
+ open $fh, '>', $_[1];
+ say {$fh} join ',', map { s{^(.*?)[,\r\n]+}{}; $1 } @in while $in[0];
+ close $fh;
+}
+```
+This is more efficient - for the 1000 x 1000 example it only uses approx 9Mb of
+memory this is on only about a 1.33 x space multiplier... This still becomes more
+of an issue as the file size gets larger
+
+## Solution 2c - The ultimate large file solution - using `tell`, `seek` and `read`;
+
+Finally a solution for "very" large-files...
+
+Rather than slurping all the data into memory in one go - we instead pull content of
+each of the lines a few bytes at a time, working with that - and going back to the
+file system to retrieve more data.
+
+Perl has some simple functions for moving the "head" around in the file:
+
+ * `tell` - return the current location in the file
+ * `seek` - set the current location in the file.
+
+Additionally it has another way of accessing the file content rather than `<$fh>`,
+by using the lower level `read` command. This reads contents for the file and
+stores it into a scalar variable.
+
+So this requires a "double" pass solution - first of all we use `<$fh>` to retrieve
+every line of data from the file, noting the start and end of the line {in byte offsets} and
+remebering the first `$BYTES` bytes from that line {in this example we set this to 1000 bytes}.
+
+We then use a similar approach to the previous one - where we use a regex to find the value
+of the first column. But if we can't find a comma OR the end of the line - we retrieve another
+`$BYTES` bytes of data to the string (until we can find a comma or the end of the line).
+
+If we get to the end of the line we only retrieve the number of bytes left in that line.
+
+``` perl
+sub transpose_seek {
+ my($prev,@pos) = (0);
+ open my $fh, '<', $_[0];
+ open my $ofh, '>', $_[1];
+ ## Loop through the file and get the start/end of each line
+ push ( @pos, [$prev+$BYTES,tell $fh,substr $_,0,$BYTES]) &&
+ ( ($pos[-1][0]>$pos[-1][1]) && ($pos[-1][0]=$pos[-1][1]), $prev=tell $fh) while <$fh>;
+
+ while( $pos[0][0] < $pos[0][1] || length $pos[0][2] ) {
+ my @line;
+ foreach(@pos) {
+ while( $_->[2] !~ m{,} && $_->[0] < $_->[1] ) {
+ seek $fh, $_->[0], 0;
+ read $fh, $_->[2], $_->[1]-$_->[0] > $BYTES ? $BYTES : $_->[1]-$_->[0], length $_->[2];
+ $_->[0] = tell $fh;
+ }
+ $_->[2] =~ s{^([^,\r\n]+)[,\r\n]*}{};
+ push @line, $1;
+ }
+ say ${$ofh} join q(,), @line;
+ }
+}
+```
+
+### Notes on this code...
+
+ * `@pos` is an array of triples:
+
+ * **`[0]`** - the current offset in the file for the next chunk of this line;
+
+ * **`[1]`** - the end of the current line;
+
+ * **`[2]`** - the buffer for this line (initially the first `$BYTES` bytes)
+
+ * Let's look at the three while loops....
+
+ * The push/while line slurping in data
+
+ We use `while <$fh>` here rather than `foreach` as `while` doesn't accumulate
+ data into an array, where as `foreach <$fh>` does, saving memory.
+
+ ``` perl
+ while( <$fh> ) {
+ push ( @pos, [$prev+$BYTES,tell $fh,substr $_,0,$BYTES]) &&
+ (
+ ($pos[-1][0]>$pos[-1][1]) && ($pos[-1][0]=$pos[-1][1]),
+ $prev=tell $fh
+ );
+ }
+ ```
+
+ * This uses a lot of ***1-liner*** tricks - first of all we push the start, end and first
+ `$BYTES` bytes into the @pos array. This always evaluates to true, so we run the second
+ bracketed statement. This (a) sets the start of the next block to the end of this current
+ line if we have overshot, and then (b) updates the `$prev` value so we know where the
+ next line starts in the file....
+
+ * Now let us look at the 2nd while loop (outer)...
+
+ ``` perl
+ while( $pos[0][0] < $pos[0][1] || length $pos[0][2] ) {
+
+ }
+ ```
+ This is checking that (a) we haven't run out of data to be retrieved for the first and
+ (b) haven't run out of data we have already retrieved.
+
+
+ * Finally the 3rd while loop (inner)...
+
+ ``` perl
+ while( $_->[2] !~ m{,} && $_->[0] < $_->[1] ) {
+ seek $fh, $_->[0], 0;
+ read $fh, $_->[2], $_->[1]-$_->[0] > $BYTES ? $BYTES : $_->[1]-$_->[0], length $_->[2];
+ $_->[0] = tell $fh;
+ }
+ ```
+ In this loop we see if the row does not contain a comma AND there is data left... If this is the
+ case we have to retrieve more data from the file. We do this by first `seek`ing to the location
+ in the file that we need to get data from. We then retrieve the either $BYTES `bytes` of data (or
+ all the data left for the row {if it is less than `$BYTES` bytes.}
+ We then update the location for that particular row (using `tell`).
+
+ Note also we use the 4 parameter version of read.
+
+ `read $fh, $buffer, $bytes, $offset`
+
+ By adding the offset - we can easily append this content onto the end of our buffer string. We have
+ to use length `$_->[2]` as you can use -ve indecies to read into the buffer with an offset from the
+ end - but this only works for -1, -2 etc not "-0".
+
+ * We then use the regex trick to get the first column of the data.
+
+ * Memory usage:
+
+ * This script does not load the file all in one go - so really needs a lot less memory
+ (vs more disc accesses). It is linear in the number of lines, e.g. for the 1000 line file we load in
+ roughly 1Mb of data at a time, and the memory usage is roughly 1.3Mb.
+ * Note this is `O(n)` as well as if the rows get longer then the number of bytes used does not increase.
+ * Having played a bit - the sweet spot of `$BYTES` lies somewhere between 1K and 2K. Smaller makes the
+ regex in the split more efficient, larger reduces the file IO overhead.
+
+### Some information about speed/memory etc...
+
+The following are timings on a single core, 2G RAM, 4G swap machine:
+
+| Method/size | Time (s) | Kbytes | resident | shared |
+| ----------- | -------: | -----: | -------: | -----: |
+| Seek small | 0.001 | 16016| 7836| 5228 |
+| Regex small | 0.000 | 16016| 7836| 5228 |
+| Split small | 0.000 | 16016| 7836| 5228 |
+| Seek 1000 | 1.346 | 17388| 9320| 5228 |
+| Seek 2000 | 5.841 | 18848| 10636| 5228 |
+| Seek 5000 | 54.208 | 23044| 14972| 5228 |
+| Regex 1000 | 1.293 | 25492| 17288| 5228 |
+| Seek 30000 | 3003.220 | 57312| 43948| 2720 |
+| Regex 2000 | 9.040 | 63896| 51376| 3140 |
+| Split 1000 | 0.934 | 105784| 93100| 3204 |
+| Regex 5000 | 130.411 | 260432| 248016| 3204 |
+| Split 2000 | 6.780 | 362028| 349388| 3204 |
+| Split 5000 | 527.614 | 2153576| 1423468| 2764 |
+
+The size is the number of rows/columns - so the "1000" file has 1000 rows and 1000 columns (+row/column labels).
+
+File sizes:
+
+| name | size | row size |
+| ----- | -----: | ----: |
+| small | 61 bytes | 12 |
+| 1000 | 6.6 Mbytes | 6.7K |
+| 2000 | 27 Mbytes | 13.5K |
+| 5000 | 165 Mbytes | 33.6K |
+| 30000 | 5.8 Gbytes | 201.0K |
+
+If we look at the timings by method we can see that for the smaller files the `split` is
+the most efficient {but the difference is relatively small}. But as the file size increases
+then it soon becomes the least efficient:
+
+| Size | Split | Regex | Seek |
+| -----: | ----: | ----: | ----: |
+| small | **0.000** | 0.000 | *0.001* |
+| 1000 | **0.934** | 1.293 | *1.346* |
+| 2000 | 6.890 | *9.040* | **5.841** |
+| 5000 | *527.614* | 130.411 | **54.208** |
+| 30000 | - | - | **3003.220** |
diff --git a/challenge-110/james-smith/blog.txt b/challenge-110/james-smith/blog.txt
new file mode 100644
index 0000000000..017d59262e
--- /dev/null
+++ b/challenge-110/james-smith/blog.txt
@@ -0,0 +1 @@
+https://github.com/drbaggy/perlweeklychallenge-club/tree/master/challenge-110/james-smith
diff --git a/challenge-110/james-smith/perl/ch-1.pl b/challenge-110/james-smith/perl/ch-1.pl
new file mode 100644
index 0000000000..f47ec05169
--- /dev/null
+++ b/challenge-110/james-smith/perl/ch-1.pl
@@ -0,0 +1,14 @@
+#!/usr/local/bin/perl
+
+use strict;
+
+use warnings;
+use feature qw(say);
+use Test::More;
+
+print grep { is_valid_phone_number($_) } <>;
+
+sub is_valid_phone_number {
+ return m{\A\s*(?:[+]\d+|00\d+|[(]\d+[)])\s+\d+\s*\Z};
+}
+
diff --git a/challenge-110/james-smith/perl/ch-2.pl b/challenge-110/james-smith/perl/ch-2.pl
new file mode 100644
index 0000000000..4f543ff569
--- /dev/null
+++ b/challenge-110/james-smith/perl/ch-2.pl
@@ -0,0 +1,104 @@
+#!/usr/local/bin/perl
+
+use strict;
+
+use warnings;
+use feature qw(say);
+use Data::Dumper;
+use Const::Fast qw(const);
+use Time::HiRes qw(time);
+const my $FN_TINY => 'in.txt';
+#const my $FN_SMALL => 'in-large.txt';
+const my $FN_SMALL => 'in-1000.txt';
+const my $FN_MEDIUM => 'in-2000.txt';
+const my $FN_LARGE => 'in-5000.txt';
+const my $FN_MASSIVE => 'in-30000.txt';
+const my $BYTES => 1_024;
+
+select(STDOUT); $| = 1;
+
+## For a small file we can use just about any valid technique...
+## But here I will look at some different techniques which can
+## be more efficient with medium, large data files.
+##
+## Some of these are a trade off between performance and
+##
+## At work we have large 2d arrays of data - e.g. genes vs samples,
+## variations vs genes - often both dimensions are in the range 20,000
+## lines or more. So we need to look at faster - alternative less
+## memory intense solutions
+##
+## The last function transpose_seek - was my solution to speeding
+## up a python script
+##
+## Python 3 -> Python 2 sped things up by a factor or 4, the perl
+## rewrite sped things up by a factor of 100 for the size of arrays
+## we were looking at.
+
+my $t0;
+
+ $t0 = time; transpose_seek( $FN_TINY, 'seek-small' ); say 'Seek small - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
+ $t0 = time; transpose_regex( $FN_TINY, 'regex-small' ); say 'Regex small - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
+ $t0 = time; transpose_split( $FN_TINY, 'split-small' ); say 'Split small - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
+ $t0 = time; transpose_seek( $FN_SMALL, 'seek-1000' ); say 'Seek 1000 - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
+ $t0 = time; transpose_seek( $FN_MEDIUM, 'seek-2000' ); say 'Seek 2000 - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
+ $t0 = time; transpose_seek( $FN_LARGE, 'seek-5000' ); say 'Seek 5000 - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
+ $t0 = time; transpose_regex( $FN_SMALL, 'regex-1000' ); say 'Regex 1000 - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
+ $t0 = time; transpose_seek( $FN_MASSIVE, 'seek-30000' ); say 'Seek 30000 - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
+ $t0 = time; transpose_regex( $FN_MEDIUM, 'regex-2000' ); say 'Regex 2000 - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
+ $t0 = time; transpose_split( $FN_SMALL, 'split-1000' ); say 'Split 1000 - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
+ $t0 = time; transpose_regex( $FN_LARGE, 'regex-5000' ); say 'Regex 5000 - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
+ $t0 = time; transpose_split( $FN_MEDIUM, 'split-2000' ); say 'Split 2000 - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
+ $t0 = time; transpose_split( $FN_LARGE, 'split-5000' ); say 'Split 5000 - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
+
+sub transpose_split {
+ open my $fh, '<', $_[0];
+ my @in = map { chomp;[ split /,/ ] } <$fh>; ## Slurp into array
+ close $fh;
+ open $fh, '>', $_[1];
+ say {$fh} join ',', map {shift @{$_} } @in while @{$in[0]}; ## Generate transpose;
+ close $fh;
+}
+
+sub transpose_regex {
+ open my $fh, '<', $_[0];
+ my @in = <$fh>; ## Slurp into array
+ close $fh;
+ open $fh, '>', $_[1];
+ say {$fh} join ',', map { s{^(.*?)[,\r\n]+}{}; $1 } @in while $in[0];
+ close $fh;
+}
+
+sub transpose_seek {
+ my($prev,@pos) = (0);
+ open my $fh, '<', $_[0];
+ open my $ofh, '>', $_[1];
+ ## Loop through the file and get the start/end of each line
+ push (@pos, [$prev+$BYTES,tell $fh,substr $_,0,$BYTES]) && (($pos[-1][0]>$pos[-1][1])&&($pos[-1][0]=$pos[-1][1]), $prev=tell $fh) while <$fh>;
+
+ while( $pos[0][0] < $pos[0][1] || length $pos[0][2] ) {
+ my @line;
+ foreach(@pos) {
+ while( $_->[2] !~ m{,} && $_->[0] < $_->[1] ) {
+ seek $fh, $_->[0], 0;
+ read $fh, $_->[2], $_->[1]-$_->[0] > $BYTES ? $BYTES : $_->[1]-$_->[0], length $_->[2];
+ $_->[0] = tell $fh;
+ }
+ $_->[2] =~ s{^([^,\r\n]+)[,\r\n]*}{};
+ push @line, $1;
+ }
+ say ${$ofh} join q(,), @line;
+ }
+}
+
+sub get_statm_info {
+ my $error = '';
+ my $ref = {};
+
+ if( open(_INFO,"</proc/$$/statm") ){
+ my @info = split(/\s+/,<_INFO>);
+ close(_INFO);
+ return sprintf 'Size: %8d, Resident: %8d, Shared: %8d', $info[0]*4, $info[1]*4, $info[2]*4;
+ }
+ return '-';
+}
diff --git a/challenge-110/james-smith/perl/in.txt b/challenge-110/james-smith/perl/in.txt
new file mode 100644
index 0000000000..716ebdce75
--- /dev/null
+++ b/challenge-110/james-smith/perl/in.txt
@@ -0,0 +1,5 @@
+name,age,sex
+Mohammad,45,m
+Joe,20,m
+Julie,35,f
+Cristina,10,f
diff --git a/challenge-110/james-smith/perl/phone_nos.txt b/challenge-110/james-smith/perl/phone_nos.txt
new file mode 100644
index 0000000000..4f167ab35b
--- /dev/null
+++ b/challenge-110/james-smith/perl/phone_nos.txt
@@ -0,0 +1,9 @@
+0044 1148820341
+ +44 1148820341
+ 44-11-4882-0341
+(44) 1148820341
+ (44) 1148820341
+ 00 1148820341
+00 1243143+1414
+ () 342432
+(00) 12321-4324