From 4fb433346a210a16bc92a7f1f3c7749d69eb502b Mon Sep 17 00:00:00 2001
From: drbaggy <js5@sanger.ac.uk>
Date: Wed, 28 Apr 2021 09:38:44 +0100
Subject: Tidied up seek method

---
 challenge-110/james-smith/README.md    | 15 ++++++++-------
 challenge-110/james-smith/perl/ch-2.pl | 17 +++++++++--------
 2 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/challenge-110/james-smith/README.md b/challenge-110/james-smith/README.md
index 433fbf84f7..22b9d2b788 100644
--- a/challenge-110/james-smith/README.md
+++ b/challenge-110/james-smith/README.md
@@ -172,8 +172,8 @@ sub transpose_seek {
   ## Loop through the file and get the start/end position of each line,
   ## and the first $BYTES characters of each line...
 
-  push ( @pos, [$prev+$BYTES,tell $fh,substr $_,0,$BYTES]) &&
-       ( $prev=tell $fh ) while <$fh>;
+  ( push @pos, [ $prev+$BYTES, tell $fh, substr $_, 0, $BYTES] ) &&
+    ( $prev = tell $fh ) while <$fh>;
 
   ## While we still have "columns" loop through each row and grab the first
   ## entry and output results.
@@ -181,16 +181,17 @@ sub transpose_seek {
   while( $pos[0][0] < $pos[0][1] || length $pos[0][2] ) {
     my @line;
     foreach(@pos) {
+      ## Grab extra data for the row if we have got an incomplete
+      ## field {missing a "," and still data to read}
       while( $_->[2] !~ m{,} && $_->[0] < $_->[1] ) {
         seek $fh, $_->[0], 0;
-        read $fh,
-             $_->[2],         ## "Buffer"
+        read $fh, $_->[2],    ## "Buffer"
              $_->[1]-$_->[0] > $BYTES ? $BYTES : $_->[1]-$_->[0],
-             length $_->[2];  ## Length of "Buffer" so text gets added to end
+             length $_->[2];  ## Length of "Buffer" so text gets
+                              ## added to end
         $_->[0]+=$BYTES;
       }
-      $_->[2] =~ s{^([^,\r\n]+)[,\r\n]*}{};
-      push @line, $1;
+      push @line, $_->[2] =~ s{^([^,\r\n]+)[,\r\n]*}{};
     }
     say {$ofh} join q(,), @line;
   }
diff --git a/challenge-110/james-smith/perl/ch-2.pl b/challenge-110/james-smith/perl/ch-2.pl
index 4afa11e49c..c71cbca113 100644
--- a/challenge-110/james-smith/perl/ch-2.pl
+++ b/challenge-110/james-smith/perl/ch-2.pl
@@ -78,12 +78,12 @@ sub transpose_seek {
 
   open my $fh,  '<', $_[0];
   open my $ofh, '>', $_[1];
-  
+
   ## Loop through the file and get the start/end position of each line,
   ## and the first $BYTES characters of each line...
 
-  push ( @pos, [$prev+$BYTES,tell $fh,substr $_,0,$BYTES]) &&
-       ( $prev=tell $fh ) while <$fh>;
+  ( push @pos, [ $prev+$BYTES, tell $fh, substr $_, 0, $BYTES] ) &&
+    ( $prev = tell $fh ) while <$fh>;
 
   ## While we still have "columns" loop through each row and grab the first
   ## entry and output results.
@@ -91,16 +91,17 @@ sub transpose_seek {
   while( $pos[0][0] < $pos[0][1] || length $pos[0][2] ) {
     my @line;
     foreach(@pos) {
+      ## Grab extra data for the row if we have got an incomplete
+      ## field {missing a "," and still data to read}
       while( $_->[2] !~ m{,} && $_->[0] < $_->[1] ) {
         seek $fh, $_->[0], 0;
-        read $fh,
-             $_->[2],         ## "Buffer"
+        read $fh, $_->[2],    ## "Buffer"
              $_->[1]-$_->[0] > $BYTES ? $BYTES : $_->[1]-$_->[0],
-             length $_->[2];  ## Length of "Buffer" so text gets added to end
+             length $_->[2];  ## Length of "Buffer" so text gets
+                              ## added to end
         $_->[0]+=$BYTES;
       }
-      $_->[2] =~ s{^([^,\r\n]+)[,\r\n]*}{};
-      push @line, $1;
+      push @line, $_->[2] =~ s{^([^,\r\n]+)[,\r\n]*}{};
     }
     say {$ofh} join q(,), @line;
   }
-- 
cgit 


From 52a191f723ab56240a9624d222c35391e95cb185 Mon Sep 17 00:00:00 2001
From: drbaggy <js5@sanger.ac.uk>
Date: Wed, 28 Apr 2021 10:53:56 +0100
Subject: expand out and comment on regex

---
 challenge-110/james-smith/perl/ch-1.pl | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/challenge-110/james-smith/perl/ch-1.pl b/challenge-110/james-smith/perl/ch-1.pl
index f47ec05169..5435cde1b1 100644
--- a/challenge-110/james-smith/perl/ch-1.pl
+++ b/challenge-110/james-smith/perl/ch-1.pl
@@ -9,6 +9,21 @@ use Test::More;
 print grep { is_valid_phone_number($_) } <>;
 
 sub is_valid_phone_number {
+  return m{\A           # Start of line
+           \s*          # Some white-space ?
+           (?:          # Prefix - one of:
+             [+]\d+ |   #   +{digits}
+             00\d+  |   #   00{digits}
+             [(]\d+[)]  #   ({digits})
+           )
+           \s+          # Some white-space
+           \d+          # String of numbers
+           \s*          # Some white-space ?
+           \Z           # End of line
+          }x;
+}
+
+sub is_valid_phone_number_compact {
   return m{\A\s*(?:[+]\d+|00\d+|[(]\d+[)])\s+\d+\s*\Z};
 }
 
-- 
cgit 


From 1bc56a1fa4ec9471b6e6b99eadad97763e3fc2c8 Mon Sep 17 00:00:00 2001
From: drbaggy <js5@sanger.ac.uk>
Date: Wed, 28 Apr 2021 10:54:31 +0100
Subject: stop using array - but printing every result...

---
 challenge-110/james-smith/perl/ch-2.pl | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/challenge-110/james-smith/perl/ch-2.pl b/challenge-110/james-smith/perl/ch-2.pl
index c71cbca113..e02c2735a3 100644
--- a/challenge-110/james-smith/perl/ch-2.pl
+++ b/challenge-110/james-smith/perl/ch-2.pl
@@ -58,7 +58,7 @@ sub transpose_split {
   close $fh;
   ## Generate transpose
   open $fh, '>', $_[1];
-  say {$fh} join ',', map {shift @{$_} } @in while @{$in[0]};        
+  say {$fh} join ',', map {shift @{$_} } @in while @{$in[0]};
   close $fh;
 }
 
@@ -89,7 +89,7 @@ sub transpose_seek {
   ## entry and output results.
 
   while( $pos[0][0] < $pos[0][1] || length $pos[0][2] ) {
-    my @line;
+    my $j='';
     foreach(@pos) {
       ## Grab extra data for the row if we have got an incomplete
       ## field {missing a "," and still data to read}
@@ -101,9 +101,11 @@ sub transpose_seek {
                               ## added to end
         $_->[0]+=$BYTES;
       }
-      push @line, $_->[2] =~ s{^([^,\r\n]+)[,\r\n]*}{};
+      $_->[2] =~ s{^([^,\r\n]+)[,\r\n]*}{};
+      print {$ofh} $j,$1;
+      $j||=',';
     }
-    say {$ofh} join q(,), @line;
+    say {$ofh} '';
   }
 }
 
-- 
cgit 


From 9d6496ca31c312c6010a77493ff156660f318111 Mon Sep 17 00:00:00 2001
From: drbaggy <js5@sanger.ac.uk>
Date: Wed, 28 Apr 2021 13:43:59 +0100
Subject: Sped up/less memory on seek

---
 .vscode/settings.json               |  5 +++++
 challenge-110/james-smith/README.md | 28 ++++++++++++++--------------
 2 files changed, 19 insertions(+), 14 deletions(-)
 create mode 100644 .vscode/settings.json

diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000000..8bab9cab1f
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,5 @@
+{
+    "editor.rulers": [
+        72,80,120,132
+    ]
+}
\ No newline at end of file
diff --git a/challenge-110/james-smith/README.md b/challenge-110/james-smith/README.md
index 22b9d2b788..f39904c08a 100644
--- a/challenge-110/james-smith/README.md
+++ b/challenge-110/james-smith/README.md
@@ -191,7 +191,7 @@ sub transpose_seek {
                               ## added to end
         $_->[0]+=$BYTES;
       }
-      push @line, $_->[2] =~ s{^([^,\r\n]+)[,\r\n]*}{};
+      push @line, @[$_->[2] =~ s{^([^,\r\n]+)[,\r\n]*}{}];
     }
     say {$ofh} join q(,), @line;
   }
@@ -286,19 +286,19 @@ We list these in order of "memory consumption"...
 
 | Method/size | Time (s)  | Kbytes    | resident  | shared |
 | ----------- | --------: | --------: | --------: | -----: |
-| Seek small  |     0.000 |    16,016 |     7,836 |  5,228 |
-| Regex small |     0.000 |    16,016 |     7,836 |  5,228 |
-| Split small |     0.000 |    16,016 |     7,836 |  5,228 |
-| Seek 1000   |     1.346 |    17,388 |     9,320 |  5,228 |
-| Seek 2000   |     5.841 |    18,848 |    10,636 |  5,228 |
-| Seek 5000   |    54.208 |    23,044 |    14,972 |  5,228 |
-| Regex 1000  |     1.293 |    25,492 |    17,288 |  5,228 |
-| Seek 30000  | 3,003.220 |    57,312 |    43,948 |  2,720 |
-| Regex 2000  |     9.040 |    63,896 |    51,376 |  3,140 |
-| Split 1000  |     0.934 |   105,784 |    93,100 |  3,204 |
-| Regex 5000  |   130.411 |   260,432 |   248,016 |  3,204 |
-| Split 2000  |     6.780 |   362,028 |   349,388 |  3,204 |
-| Split 5000  |   527.614 | 2,153,576 | 1,423,468 |  2,764 |
+| Seek small  |     0.000 |    16,024 |     7,836 |  5,228 |
+| Regex small |     0.000 |    16,024 |     7,836 |  5,228 |
+| Split small |     0.000 |    16,024 |     7,836 |  5,228 |
+| Seek 1000   |     1.277 |    17,196 |     9,320 |  5,228 |
+| Seek 2000   |     5.132 |    18,612 |    10,636 |  5,228 |
+| Seek 5000   |    39.498 |    22,308 |    14,208 |  5,228 |
+| Regex 1000  |     1.181 |    24,868 |    17,288 |  5,228 |
+| Seek 30000  | 2,537.705 |    53,364 |    43,948 |  2,720 |
+| Regex 2000  |    10.596 |    58,160 |    51,376 |  3,140 |
+| Split 1000  |     1.054 |   103,620 |    93,100 |  3,204 |
+| Regex 5000  |   128.589 |   258,056 |   248,016 |  3,204 |
+| Split 2000  |     4.490 |   360,036 |   349,388 |  3,204 |
+| Split 5000  |   598.668 | 2,151,664 | 1,423,468 |  2,764 |
 
 The size is the number of rows/columns - so the "1000" file has 1000 rows and 1000 columns (+row/column labels).
 
-- 
cgit 


From 22fb48cb473b6ae376b6ed91431732ea5639ac4f Mon Sep 17 00:00:00 2001
From: drbaggy <js5@sanger.ac.uk>
Date: Wed, 28 Apr 2021 13:59:37 +0100
Subject: pushing new version of ch-2.pl with better notes and white space

---
 challenge-110/james-smith/perl/ch-2.pl | 52 ++++++++++++++++++----------------
 1 file changed, 27 insertions(+), 25 deletions(-)

diff --git a/challenge-110/james-smith/perl/ch-2.pl b/challenge-110/james-smith/perl/ch-2.pl
index e02c2735a3..8ee3bb6690 100644
--- a/challenge-110/james-smith/perl/ch-2.pl
+++ b/challenge-110/james-smith/perl/ch-2.pl
@@ -37,19 +37,19 @@ select(STDOUT); $| = 1;
 
 my $t0;
 
-   $t0 = time; transpose_seek(  $FN_TINY,    'seek-small'  ); say 'Seek small    - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
-   $t0 = time; transpose_regex( $FN_TINY,    'regex-small' ); say 'Regex small   - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
-   $t0 = time; transpose_split( $FN_TINY,    'split-small' ); say 'Split small   - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
-   $t0 = time; transpose_seek(  $FN_SMALL,   'seek-1000'   ); say 'Seek 1000     - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
-   $t0 = time; transpose_seek(  $FN_MEDIUM,  'seek-2000'   ); say 'Seek 2000     - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
-   $t0 = time; transpose_seek(  $FN_LARGE,   'seek-5000'   ); say 'Seek 5000     - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
-   $t0 = time; transpose_regex( $FN_SMALL,   'regex-1000'  ); say 'Regex 1000    - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
-   $t0 = time; transpose_seek(  $FN_MASSIVE, 'seek-30000'  ); say 'Seek 30000    - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
-   $t0 = time; transpose_regex( $FN_MEDIUM,  'regex-2000'  ); say 'Regex 2000    - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
-   $t0 = time; transpose_split( $FN_SMALL,   'split-1000'  ); say 'Split 1000    - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
-   $t0 = time; transpose_regex( $FN_LARGE,   'regex-5000'  ); say 'Regex 5000    - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
-   $t0 = time; transpose_split( $FN_MEDIUM,  'split-2000'  ); say 'Split 2000    - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
-   $t0 = time; transpose_split( $FN_LARGE,   'split-5000'  ); say 'Split 5000    - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
+if($ARGV[0] ==  1) {  $t0 = time; transpose_seek(  $FN_TINY,    'seek-small'  ); say 'Seek small    - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info(); }
+if($ARGV[0] ==  2) {  $t0 = time; transpose_regex( $FN_TINY,    'regex-small' ); say 'Regex small   - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info(); }
+if($ARGV[0] ==  3) {  $t0 = time; transpose_split( $FN_TINY,    'split-small' ); say 'Split small   - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info(); }
+if($ARGV[0] ==  4) {  $t0 = time; transpose_seek(  $FN_SMALL,   'seek-1000'   ); say 'Seek 1000     - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info(); }
+if($ARGV[0] ==  5) {  $t0 = time; transpose_seek(  $FN_MEDIUM,  'seek-2000'   ); say 'Seek 2000     - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info(); }
+if($ARGV[0] ==  6) {  $t0 = time; transpose_seek(  $FN_LARGE,   'seek-5000'   ); say 'Seek 5000     - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info(); }
+if($ARGV[0] ==  7) {  $t0 = time; transpose_regex( $FN_SMALL,   'regex-1000'  ); say 'Regex 1000    - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info(); }
+if($ARGV[0] ==  8) {  $t0 = time; transpose_seek(  $FN_MASSIVE, 'seek-30000'  ); say 'Seek 30000    - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info(); }
+if($ARGV[0] ==  9) {  $t0 = time; transpose_regex( $FN_MEDIUM,  'regex-2000'  ); say 'Regex 2000    - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info(); }
+if($ARGV[0] == 10) {  $t0 = time; transpose_split( $FN_SMALL,   'split-1000'  ); say 'Split 1000    - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info(); }
+if($ARGV[0] == 11) {  $t0 = time; transpose_regex( $FN_LARGE,   'regex-5000'  ); say 'Regex 5000    - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info(); }
+if($ARGV[0] == 12) {  $t0 = time; transpose_split( $FN_MEDIUM,  'split-2000'  ); say 'Split 2000    - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info(); }
+if($ARGV[0] == 13) {  $t0 = time; transpose_split( $FN_LARGE,   'split-5000'  ); say 'Split 5000    - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info(); }
 
 sub transpose_split {
   ## Slurp into array
@@ -74,36 +74,38 @@ sub transpose_regex {
 }
 
 sub transpose_seek {
-  my($prev,@pos) = (0);
-
-  open my $fh,  '<', $_[0];
-  open my $ofh, '>', $_[1];
+  my ( $prev, @pos ) = ( 0 );
 
   ## Loop through the file and get the start/end position of each line,
   ## and the first $BYTES characters of each line...
 
-  ( push @pos, [ $prev+$BYTES, tell $fh, substr $_, 0, $BYTES] ) &&
+  open my $fh,  '<', $_[0];
+
+  ( push @pos, [ $prev+$BYTES, tell $fh, substr $_, 0, $BYTES ] ) &&
     ( $prev = tell $fh ) while <$fh>;
 
   ## While we still have "columns" loop through each row and grab the first
   ## entry and output results.
 
+  open my $ofh, '>', $_[1];  ## Need 2 file handles open at once for this.
+
   while( $pos[0][0] < $pos[0][1] || length $pos[0][2] ) {
-    my $j='';
-    foreach(@pos) {
+    my $j = '';
+    foreach( @pos ) {
       ## Grab extra data for the row if we have got an incomplete
       ## field {missing a "," and still data to read}
       while( $_->[2] !~ m{,} && $_->[0] < $_->[1] ) {
-        seek $fh, $_->[0], 0;
-        read $fh, $_->[2],    ## "Buffer"
+        seek $fh, $_->[0], 0;  ## 0 = from start of file!
+        read $fh,
+             $_->[2],    ## "Buffer"
              $_->[1]-$_->[0] > $BYTES ? $BYTES : $_->[1]-$_->[0],
              length $_->[2];  ## Length of "Buffer" so text gets
                               ## added to end
-        $_->[0]+=$BYTES;
+        $_->[0] += $BYTES;
       }
       $_->[2] =~ s{^([^,\r\n]+)[,\r\n]*}{};
-      print {$ofh} $j,$1;
-      $j||=',';
+      print {$ofh} $j, $1;
+      $j ||= ',';
     }
     say {$ofh} '';
   }
-- 
cgit 


From 10946f7c706e74c55c782187150e3220340dbf3f Mon Sep 17 00:00:00 2001
From: drbaggy <js5@sanger.ac.uk>
Date: Wed, 28 Apr 2021 14:21:31 +0100
Subject: Update README.md

---
 challenge-110/james-smith/README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/challenge-110/james-smith/README.md b/challenge-110/james-smith/README.md
index f39904c08a..9759983712 100644
--- a/challenge-110/james-smith/README.md
+++ b/challenge-110/james-smith/README.md
@@ -302,6 +302,8 @@ We list these in order of "memory consumption"...
 
 The size is the number of rows/columns - so the "1000" file has 1000 rows and 1000 columns (+row/column labels).
 
+As a "guesstimate" for the 30,000 x 30,000 case, for which the seek solution uses roughly 50MB, the regex solution would use 7GB of memory and the split method would use about 75GB of memory... Both of these are more memory+swap than the machine that I'm using has!
+
 **File sizes:**
 
 | name         | rows   | columns | size       | row size |
-- 
cgit