8 files changed, 427 insertions, 16 deletions
diff --git a/challenge-109/james-smith/perl/ch-1.pl b/challenge-109/james-smith/perl/ch-1.pl
index 1b89226a6a..6fccc59edd 100644
--- a/challenge-109/james-smith/perl/ch-1.pl
+++ b/challenge-109/james-smith/perl/ch-1.pl
@@ -19,7 +19,7 @@ is( chowla_for($_), $answer[ $_ ] ) foreach 1..20;
 done_testing();
 
 ## We will quickly run benchmarking...
-## This suggests the for loop to be approximately 40-50%
+## This suggests the for loop to be approximately 40%
 ## faster than the map solution...
 ## It is also 9 characters shorter...
 
@@ -30,19 +30,17 @@ cmpthese(1_000_000, {
 
 ##
 ##        Rate  Map  For
-## Map 38670/s   -- -33%
-## For 57670/s  49%   --
+## Map 59524/s   -- -26%
+## For 79936/s  34%   --
 ##
 
-sub chowla_map {
-  my ($t,$n) = (0,@_);
 ## First attempt - the one-liner is to write this as a map,
 ## we add $t at the end which is the value returned
-  ( map { (($n%$_) || ($t+=$_)) && () } 2..$n-1 ), $t;
+sub chowla_map {
+  my ($t,$n) = (0,@_);
+  ( map { (($n%$_) || ($t+=$_)) && () } 2..$n>>1 ), $t;
 }
 
-sub chowla_for {
-  my($t,$n)=(0,@_);
 
   ## This time we won't write this as a nasty map/reduce solution...
   ##
@@ -55,12 +53,12 @@ sub chowla_for {
   ##    can be rewritten as:
   ##      ($condition)||($fun())
   ##  * in perl `foreach` and `for` are synonymous - so we can shorten
-
-  ($n%$_)||($t+=$_) for 2..$n-1;
-
-  ## Now a quick "shortening" - if there is no specific return
+  ## Finally a quick "shortening" - if there is no specific return
   ## statement - we can just omit the return in the last statement...
 
+sub chowla_for {
+  my($t,$n)=(0,@_);
+  ($n%$_)||($t+=$_) for 2..$n>>1;
   $t;
 }
 
diff --git a/challenge-109/james-smith/perl/ch-2.pl b/challenge-109/james-smith/perl/ch-2.pl
index 436e0fca19..bafd89bff8 100644
--- a/challenge-109/james-smith/perl/ch-2.pl
+++ b/challenge-109/james-smith/perl/ch-2.pl
@@ -67,8 +67,6 @@ say '';
 sub sep  { say '------------------------------------------------------------------------'; }
 sub show { say "@{$_}" foreach @{$_[0]}; }
 
-sub four_square {
-
 ## For a start we make the observation that
 ##
 ## $a + 2$b + $c + 2$d + $e + 2$f + $g = $n * 4 where $n is the total of a square
@@ -94,6 +92,7 @@ sub four_square {
 ##
 ## We push any valid results to the array
 
+sub four_square {
   my ($t,@n1,@res) = (0,@_);
   $t+=$_ foreach @n1;
   foreach my $b (        @n1 ) {
@@ -110,7 +109,6 @@ sub four_square {
   return \@res;
 }
 
-sub four_square_non_unique {
 ## Now let us make no assumption about the numbers...
 ## We choose 3 from the list...
 ## We then compute n (and check for no remainder)
@@ -123,6 +121,7 @@ sub four_square_non_unique {
 ##            will end up with 2 entries in the list
 ##            where you swap the equivalent values...
 
+sub four_square_non_unique {
   my ($t,$check,@n1,%res) = (0,"@{[sort @_]}",@_);
   $t+=$_ foreach @n1;
   foreach my $i ( 0..@n1-1 ) {
diff --git a/challenge-110/james-smith/README.md b/challenge-110/james-smith/README.md
index 3b736748ea..8e08f1b553 100644
--- a/challenge-110/james-smith/README.md
+++ b/challenge-110/james-smith/README.md
@@ -1 +1,282 @@
-Solutions by James Smith.
+# Challenge 1 - valid phone numbers...
+
+You are given a text file - Write a script to display all valid phone numbers in the given text file.
+
+## Solution 1
+
+This is what Perl was designed for - pattern matching strings. The format we accept numbers is one of:
+
+```
+   +nn  nnnnnnnnnn
+   (nn) nnnnnnnnnn
+   nnnn nnnnnnnnnn
+```
+
+We can write this as a single regex - there are two parts to the pattern - the prefix and the number
+itself. The latter is just a simple number match {we don't specify length as in some countries this
+is of different lengths}.
+
+We group the three prefix patterns into a group match with `(`s and `|`s - remembering to add `?:` so
+we save memory by not storing the match.
+
+We wrap this regex in a function call:
+``` perl
+sub is_valid_phone_number {
+  return m{\A\s*(?:[+]\d+|00\d+|[(]\d+[)])\s+\d+\s*\Z};
+}
+```
+We can then just use this to grep over the lines of the file....
+
+``` perl
+print grep { is_valid_phone_number($_) } <>;
+```
+
+note we use `print` rather than `say` as we haven't stripped the return character from the line!
+
+# Challenge 2 - transposing a file
+
+You are given a text file. Write a script to transpose the contents of the given file.
+
+## Some background - I've seen this before!
+
+Now this is a problem I have come across before - I work at a genomic institute which handles large
+amounts of data - some of these are in the forms of very large tab or comma separated value files
+containing many 1000s of rows and many 1000s of columns, e.g. gene expression vs samples, genes vs
+variations. These files can be 10G or larger in size....
+
+Often these files are generated by automatic processes - but when you want to perform the next step
+you would rather work down columns rather than across rows...!
+
+The problem as was posed on a Python mailing list is that this was taking for ever to run.... 
+Firstly moving back to python 2 helped, but unless the scripts were run on big memory machine {64G+}
+then it was still taking a long time...
+
+As a non-python developer on the list - I looked to see if this could be easily achieved in Perl, to
+show the pythonistas that actually Perl was still a better language for this sort of thing.
+
+Investigating the problem I realised that the method they were using was a slurp and print model....
+The problem with that for such large files was memory. Once slurped, chopped etc the machine was
+swapping OR running out of memory. So had to come up with a cleaner script... I will outline 3
+methods of performing this
+
+## Solution 2a - The simplest solution - load in and split into arrays of arrays.
+
+The simplest solution is to slurp the data in the file into an array of array(ref)s.
+
+This is just a case of a `map` which splits the line into it's chunks - we use chomp
+to remove any unneeded line endings.
+
+Once we have an array of arrays we take the first entry out of each row and printing
+these in one line. We repeat this until we reach the end of the lines... Again this
+is a simple one liner.....
+
+``` perl
+sub transpose_split {
+  open my $fh, '<', $_[0];
+  my @in =  map { chomp;[ split /,/ ] } <$fh>;                 ## Slurp into array
+  close $fh;
+  open $fh, '>', $_[1];
+  say {$fh} join ',', map {shift @{$_} } @in while @{$in[0]};  ## Generate transpose;
+  close $fh;
+}
+```
+
+For an 1000 x 1000 file (6.75Mb), this uses approximately 90Mb of memory. This is
+approximately 12-15 x space multiplier..
+
+## Solution 2b - Load in but don't split into arrays
+
+This time we want to look at how we can save memory - one of the problems with the
+solution above is the huge overhead of storing data in an array.
+
+To resolve this we can just slurp all the data into the array, and then use a regex
+to strip the characters up to the `,` or "end of line" and output them, until the
+strings are empty.
+
+``` perl
+sub transpose_regex {
+  open my $fh, '<', $_[0];
+  my @in = <$fh>;                                              ## Slurp into array
+  close $fh;
+  open $fh, '>', $_[1];
+  say {$fh} join ',', map { s{^(.*?)[,\r\n]+}{}; $1 } @in while $in[0];
+  close $fh;
+}
+```
+This is more efficient - for the 1000 x 1000 example it only uses approx 9Mb of
+memory this is on only about a 1.33 x space multiplier... This still becomes more
+of an issue as the file size gets larger
+
+## Solution 2c - The ultimate large file solution - using `tell`, `seek` and `read`;
+
+Finally a solution for "very" large-files...
+
+Rather than slurping all the data into memory in one go - we instead pull content of
+each of the lines a few bytes at a time, working with that - and going back to the
+file system to retrieve more data.
+
+Perl has some simple functions for moving the "head" around in the file:
+
+  * `tell` - return the current location in the file
+  * `seek` - set the current location in the file.
+
+Additionally it has another way of accessing the file content rather than `<$fh>`,
+by using the lower level `read` command. This reads contents for the file and
+stores it into a scalar variable.
+
+So this requires a "double" pass solution - first of all we use `<$fh>` to retrieve
+every line of data from the file, noting the start and end of the line {in byte offsets} and
+remebering the first `$BYTES` bytes from that line {in this example we set this to 1000 bytes}.
+
+We then use a similar approach to the previous one - where we use a regex to find the value
+of the first column. But if we can't find a comma OR the end of the line - we retrieve another
+`$BYTES` bytes of data to the string (until we can find a comma or the end of the line).
+
+If we get to the end of the line we only retrieve the number of bytes left in that line.
+
+``` perl
+sub transpose_seek {
+  my($prev,@pos) = (0);
+  open my $fh,  '<', $_[0];
+  open my $ofh, '>', $_[1];
+  ## Loop through the file and get the start/end of each line
+  push ( @pos, [$prev+$BYTES,tell $fh,substr $_,0,$BYTES]) &&
+       ( ($pos[-1][0]>$pos[-1][1]) && ($pos[-1][0]=$pos[-1][1]), $prev=tell $fh) while <$fh>;
+
+  while( $pos[0][0] < $pos[0][1] || length $pos[0][2] ) {
+    my @line;
+    foreach(@pos) {
+      while( $_->[2] !~ m{,} && $_->[0] < $_->[1] ) {
+        seek $fh, $_->[0], 0;
+        read $fh, $_->[2], $_->[1]-$_->[0] > $BYTES ? $BYTES : $_->[1]-$_->[0], length $_->[2];
+        $_->[0] = tell $fh;
+      }
+      $_->[2] =~ s{^([^,\r\n]+)[,\r\n]*}{};
+      push @line, $1;
+    }
+    say ${$ofh} join q(,), @line;
+  }
+}
+```
+
+### Notes on this code...
+
+ * `@pos` is an array of triples:
+
+   * **`[0]`** - the current offset in the file for the next chunk of this line;
+
+   * **`[1]`** - the end of the current line;
+
+   * **`[2]`** - the buffer for this line (initially the first `$BYTES` bytes)
+
+ * Let's look at the three while loops....
+  
+    * The push/while line slurping in data
+
+      We use `while <$fh>` here rather than `foreach` as `while` doesn't accumulate
+      data into an array, where as `foreach <$fh>` does, saving memory.
+
+      ``` perl
+      while( <$fh> ) {
+        push ( @pos, [$prev+$BYTES,tell $fh,substr $_,0,$BYTES]) &&
+             (
+               ($pos[-1][0]>$pos[-1][1]) && ($pos[-1][0]=$pos[-1][1]),
+               $prev=tell $fh
+             );
+      }
+      ```
+
+    * This uses a lot of ***1-liner*** tricks - first of all we push the start, end and first
+      `$BYTES` bytes into the @pos array. This always evaluates to true, so we run the second
+      bracketed statement. This (a) sets the start of the next block to the end of this current
+      line if we have overshot, and then (b) updates the `$prev` value so we know where the
+      next line starts in the file....
+    
+    * Now let us look at the 2nd while loop (outer)...
+
+      ``` perl
+        while( $pos[0][0] < $pos[0][1] || length $pos[0][2] ) {
+
+        }
+      ```
+      This is checking that (a) we haven't run out of data to be retrieved for the first and
+      (b) haven't run out of data we have already retrieved.
+
+      
+    * Finally the 3rd while loop (inner)... 
+
+      ``` perl
+            while( $_->[2] !~ m{,} && $_->[0] < $_->[1] ) {
+              seek $fh, $_->[0], 0;
+              read $fh, $_->[2], $_->[1]-$_->[0] > $BYTES ? $BYTES : $_->[1]-$_->[0], length $_->[2];
+              $_->[0] = tell $fh;
+            } 
+      ```
+      In this loop we see if the row does not contain a comma AND there is data left... If this is the
+      case we have to retrieve more data from the file. We do this by first `seek`ing to the location
+      in the file that we need to get data from. We then retrieve the either $BYTES `bytes` of data (or
+      all the data left for the row {if it is less than `$BYTES` bytes.}
+      We then update the location for that particular row (using `tell`).
+
+      Note also we use the 4 parameter version of read.
+
+         `read $fh, $buffer, $bytes, $offset`
+     
+      By adding the offset - we can easily append this content onto the end of our buffer string. We have
+      to use length `$_->[2]` as you can use -ve indecies to read into the buffer with an offset from the
+      end - but this only works for -1, -2 etc not "-0".
+
+    * We then use the regex trick to get the first column of the data.
+
+  * Memory usage:
+
+    * This script does not load the file all in one go - so really needs a lot less memory
+      (vs more disc accesses). It is linear in the number of lines, e.g. for the 1000 line file we load in
+      roughly 1Mb of data at a time, and the memory usage is roughly 1.3Mb.
+    * Note this is `O(n)` as well as if the rows get longer then the number of bytes used does not increase.
+    * Having played a bit - the sweet spot of `$BYTES` lies somewhere between 1K and 2K. Smaller makes the
+      regex in the split more efficient, larger reduces the file IO overhead.
+
+### Some information about speed/memory etc...
+
+The following are timings on a single core, 2G RAM, 4G swap machine:
+
+| Method/size | Time (s) | Kbytes | resident | shared |
+| ----------- | -------: | -----: | -------: | -----: |
+| Seek small  | 0.001 | 16016| 7836| 5228 |
+| Regex small | 0.000 | 16016| 7836| 5228 |
+| Split small | 0.000 | 16016| 7836| 5228 |
+| Seek 1000   | 1.346 | 17388| 9320| 5228 |
+| Seek 2000   | 5.841 | 18848| 10636| 5228 |
+| Seek 5000   | 54.208 | 23044| 14972| 5228 |
+| Regex 1000  | 1.293 | 25492| 17288| 5228 |
+| Seek 30000  | 3003.220 | 57312| 43948| 2720 |
+| Regex 2000  | 9.040 | 63896| 51376| 3140 |
+| Split 1000  | 0.934 | 105784| 93100| 3204 |
+| Regex 5000  | 130.411 | 260432| 248016| 3204 |
+| Split 2000  | 6.780 | 362028| 349388| 3204 |
+| Split 5000  | 527.614 | 2153576| 1423468| 2764 |
+
+The size is the number of rows/columns - so the "1000" file has 1000 rows and 1000 columns (+row/column labels).
+
+File sizes:
+
+| name  | size | row size |
+| ----- | -----: | ----: |
+| small | 61 bytes | 12 |
+|  1000 | 6.6 Mbytes | 6.7K |
+|  2000 | 27 Mbytes | 13.5K |
+|  5000 | 165 Mbytes | 33.6K |
+| 30000 | 5.8 Gbytes | 201.0K |
+
+If we look at the timings by method we can see that for the smaller files the `split` is
+the most efficient {but the difference is relatively small}. But as the file size increases
+then it soon becomes the least efficient:
+
+| Size   | Split | Regex | Seek |
+| -----: | ----: | ----: | ----: |
+| small  | **0.000** | 0.000 | *0.001* |
+| 1000   | **0.934** | 1.293 | *1.346* |
+| 2000   | 6.890 | *9.040* | **5.841** |
+| 5000   | *527.614* | 130.411 | **54.208** |
+| 30000  | - | - | **3003.220** |
diff --git a/challenge-110/james-smith/blog.txt b/challenge-110/james-smith/blog.txt
new file mode 100644
index 0000000000..017d59262e
--- /dev/null
+++ b/challenge-110/james-smith/blog.txt
@@ -0,0 +1 @@
+https://github.com/drbaggy/perlweeklychallenge-club/tree/master/challenge-110/james-smith
diff --git a/challenge-110/james-smith/perl/ch-1.pl b/challenge-110/james-smith/perl/ch-1.pl
new file mode 100644
index 0000000000..f47ec05169
--- /dev/null
+++ b/challenge-110/james-smith/perl/ch-1.pl
@@ -0,0 +1,14 @@
+#!/usr/local/bin/perl
+
+use strict;
+
+use warnings;
+use feature qw(say);
+use Test::More;
+
+print grep { is_valid_phone_number($_) } <>;
+
+sub is_valid_phone_number {
+  return m{\A\s*(?:[+]\d+|00\d+|[(]\d+[)])\s+\d+\s*\Z};
+}
+
diff --git a/challenge-110/james-smith/perl/ch-2.pl b/challenge-110/james-smith/perl/ch-2.pl
new file mode 100644
index 0000000000..4f543ff569
--- /dev/null
+++ b/challenge-110/james-smith/perl/ch-2.pl
@@ -0,0 +1,104 @@
+#!/usr/local/bin/perl
+
+use strict;
+
+use warnings;
+use feature qw(say);
+use Data::Dumper;
+use Const::Fast qw(const);
+use Time::HiRes qw(time);
+const my $FN_TINY  => 'in.txt';
+#const my $FN_SMALL => 'in-large.txt';
+const my $FN_SMALL   => 'in-1000.txt';
+const my $FN_MEDIUM  => 'in-2000.txt';
+const my $FN_LARGE   => 'in-5000.txt';
+const my $FN_MASSIVE => 'in-30000.txt';
+const my $BYTES => 1_024;
+
+select(STDOUT); $| = 1;
+
+## For a small file we can use just about any valid technique...
+## But here I will look at some different techniques which can
+## be more efficient with medium, large data files.
+##
+## Some of these are a trade off between performance and
+##
+## At work we have large 2d arrays of data - e.g. genes vs samples,
+## variations vs genes - often both dimensions are in the range 20,000
+## lines or more. So we need to look at faster - alternative less
+## memory intense solutions
+##
+## The last function transpose_seek - was my solution to speeding
+## up a python script
+##
+## Python 3 -> Python 2 sped things up by a factor or 4, the perl
+## rewrite sped things up by a factor of 100 for the size of arrays
+## we were looking at.
+
+my $t0;
+
+   $t0 = time; transpose_seek(  $FN_TINY,    'seek-small'  ); say 'Seek small    - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
+   $t0 = time; transpose_regex( $FN_TINY,    'regex-small' ); say 'Regex small   - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
+   $t0 = time; transpose_split( $FN_TINY,    'split-small' ); say 'Split small   - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
+   $t0 = time; transpose_seek(  $FN_SMALL,   'seek-1000'   ); say 'Seek 1000     - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
+   $t0 = time; transpose_seek(  $FN_MEDIUM,  'seek-2000'   ); say 'Seek 2000     - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
+   $t0 = time; transpose_seek(  $FN_LARGE,   'seek-5000'   ); say 'Seek 5000     - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
+   $t0 = time; transpose_regex( $FN_SMALL,   'regex-1000'  ); say 'Regex 1000    - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
+   $t0 = time; transpose_seek(  $FN_MASSIVE, 'seek-30000'  ); say 'Seek 30000    - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
+   $t0 = time; transpose_regex( $FN_MEDIUM,  'regex-2000'  ); say 'Regex 2000    - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
+   $t0 = time; transpose_split( $FN_SMALL,   'split-1000'  ); say 'Split 1000    - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
+   $t0 = time; transpose_regex( $FN_LARGE,   'regex-5000'  ); say 'Regex 5000    - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
+   $t0 = time; transpose_split( $FN_MEDIUM,  'split-2000'  ); say 'Split 2000    - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
+   $t0 = time; transpose_split( $FN_LARGE,   'split-5000'  ); say 'Split 5000    - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
+
+sub transpose_split {
+  open my $fh, '<', $_[0];
+  my @in =  map { chomp;[ split /,/ ] } <$fh>;                 ## Slurp into array
+  close $fh;
+  open $fh, '>', $_[1];
+  say {$fh} join ',', map {shift @{$_} } @in while @{$in[0]};        ## Generate transpose;
+  close $fh;
+}
+
+sub transpose_regex {
+  open my $fh, '<', $_[0];
+  my @in = <$fh>;                                              ## Slurp into array
+  close $fh;
+  open $fh, '>', $_[1];
+  say {$fh} join ',', map { s{^(.*?)[,\r\n]+}{}; $1 } @in while $in[0];
+  close $fh;
+}
+
+sub transpose_seek {
+  my($prev,@pos) = (0);
+  open my $fh, '<', $_[0];
+  open my $ofh, '>', $_[1];
+  ## Loop through the file and get the start/end of each line
+  push (@pos, [$prev+$BYTES,tell $fh,substr $_,0,$BYTES]) && (($pos[-1][0]>$pos[-1][1])&&($pos[-1][0]=$pos[-1][1]), $prev=tell $fh) while <$fh>;
+
+  while( $pos[0][0] < $pos[0][1] || length $pos[0][2] ) {
+    my @line;
+    foreach(@pos) {
+      while( $_->[2] !~ m{,} && $_->[0] < $_->[1] ) {
+        seek $fh, $_->[0], 0;
+        read $fh, $_->[2], $_->[1]-$_->[0] > $BYTES ? $BYTES : $_->[1]-$_->[0], length $_->[2];
+        $_->[0] = tell $fh;
+      }
+      $_->[2] =~ s{^([^,\r\n]+)[,\r\n]*}{};
+      push @line, $1;
+    }
+    say ${$ofh} join q(,), @line;
+  }
+}
+
+sub get_statm_info {
+  my $error = '';
+  my $ref = {};
+
+  if( open(_INFO,"</proc/$$/statm") ){
+    my @info = split(/\s+/,<_INFO>);
+    close(_INFO);
+    return sprintf 'Size: %8d, Resident: %8d, Shared: %8d', $info[0]*4, $info[1]*4, $info[2]*4;
+  }
+  return '-';
+}
diff --git a/challenge-110/james-smith/perl/in.txt b/challenge-110/james-smith/perl/in.txt
new file mode 100644
index 0000000000..716ebdce75
--- /dev/null
+++ b/challenge-110/james-smith/perl/in.txt
@@ -0,0 +1,5 @@
+name,age,sex
+Mohammad,45,m
+Joe,20,m
+Julie,35,f
+Cristina,10,f
diff --git a/challenge-110/james-smith/perl/phone_nos.txt b/challenge-110/james-smith/perl/phone_nos.txt
new file mode 100644
index 0000000000..4f167ab35b
--- /dev/null
+++ b/challenge-110/james-smith/perl/phone_nos.txt
@@ -0,0 +1,9 @@
+0044 1148820341
+ +44 1148820341
+  44-11-4882-0341
+(44) 1148820341
+  (44) 1148820341  
+  00 1148820341
+00 1243143+1414
+ () 342432 
+(00) 12321-4324