Merge pull request #3974 from drbaggy/master

Changes to notes!
author: Mohammad S Anwar <Mohammad.Anwar@yahoo.com> 2021-04-28 02:16:20 +0100
committer: GitHub <noreply@github.com> 2021-04-28 02:16:20 +0100
commit: c28a1e3774ecf5261b17b9fd40aec26b5dab33f7 (patch)
tree: a03f5f4f5d14ad496e81bd52f3b3211a3c567017
parent: 53eec48290ffb6c4b8e4922e4d33d6a60a6be59d (diff)
parent: eed3560d163299335d7afcd3ed2532d00143db09 (diff)
download: perlweeklychallenge-club-c28a1e3774ecf5261b17b9fd40aec26b5dab33f7.tar.gz
perlweeklychallenge-club-c28a1e3774ecf5261b17b9fd40aec26b5dab33f7.tar.bz2
perlweeklychallenge-club-c28a1e3774ecf5261b17b9fd40aec26b5dab33f7.zip
2 files changed, 119 insertions, 57 deletions
diff --git a/challenge-110/james-smith/README.md b/challenge-110/james-smith/README.md
index 8e08f1b553..433fbf84f7 100644
--- a/challenge-110/james-smith/README.md
+++ b/challenge-110/james-smith/README.md
@@ -1,3 +1,5 @@
+# Perl Weekly Challenge #110
+
 # Challenge 1 - valid phone numbers...
 
 You are given a text file - Write a script to display all valid phone numbers in the given text file.
@@ -20,11 +22,33 @@ We group the three prefix patterns into a group match with `(`s and `|`s - remem
 we save memory by not storing the match.
 
 We wrap this regex in a function call:
+
 ``` perl
 sub is_valid_phone_number {
   return m{\A\s*(?:[+]\d+|00\d+|[(]\d+[)])\s+\d+\s*\Z};
 }
 ```
+
+or "commented" using the "x" modifier...
+
+``` perl
+sub is_valid_phone_number {
+  return m{
+    \A          # Start of line
+    \s*         # Possibly white-space
+    (?:         # Prefix - one of:
+      [+]\d+ |  #   +{digits}
+      00\d+  |  #   00{digits}
+      [(]\d+[)] #   ({digits})
+    )
+    \s+         # Some white-space
+    \d+         # String of numbers
+    \s*         # Possibly white-space
+    \Z          # End of line
+  }x;
+}
+```
+
 We can then just use this to grep over the lines of the file....
 
 ``` perl
@@ -57,7 +81,7 @@ show the pythonistas that actually Perl was still a better language for this sor
 Investigating the problem I realised that the method they were using was a slurp and print model....
 The problem with that for such large files was memory. Once slurped, chopped etc the machine was
 swapping OR running out of memory. So had to come up with a cleaner script... I will outline 3
-methods of performing this
+methods of performing this.
 
 ## Solution 2a - The simplest solution - load in and split into arrays of arrays.
 
@@ -72,11 +96,13 @@ is a simple one liner.....
 
 ``` perl
 sub transpose_split {
+  ## Slurp into array
   open my $fh, '<', $_[0];
-  my @in =  map { chomp;[ split /,/ ] } <$fh>;                 ## Slurp into array
+  my @in =  map { chomp;[ split /,/ ] } <$fh>;
   close $fh;
+  ## Generate transpose;
   open $fh, '>', $_[1];
-  say {$fh} join ',', map {shift @{$_} } @in while @{$in[0]};  ## Generate transpose;
+  say {$fh} join ',', map {shift @{$_} } @in while @{$in[0]};
   close $fh;
 }
 ```
@@ -95,9 +121,11 @@ strings are empty.
 
 ``` perl
 sub transpose_regex {
+  ## Slurp into array
   open my $fh, '<', $_[0];
-  my @in = <$fh>;                                              ## Slurp into array
+  my @in = <$fh>;
   close $fh;
+  ## Generate transpose;
   open $fh, '>', $_[1];
   say {$fh} join ',', map { s{^(.*?)[,\r\n]+}{}; $1 } @in while $in[0];
   close $fh;
@@ -137,24 +165,34 @@ If we get to the end of the line we only retrieve the number of bytes left in th
 ``` perl
 sub transpose_seek {
   my($prev,@pos) = (0);
+
   open my $fh,  '<', $_[0];
   open my $ofh, '>', $_[1];
-  ## Loop through the file and get the start/end of each line
+
+  ## Loop through the file and get the start/end position of each line,
+  ## and the first $BYTES characters of each line...
+
   push ( @pos, [$prev+$BYTES,tell $fh,substr $_,0,$BYTES]) &&
-       ( ($pos[-1][0]>$pos[-1][1]) && ($pos[-1][0]=$pos[-1][1]), $prev=tell $fh) while <$fh>;
+       ( $prev=tell $fh ) while <$fh>;
+
+  ## While we still have "columns" loop through each row and grab the first
+  ## entry and output results.
 
   while( $pos[0][0] < $pos[0][1] || length $pos[0][2] ) {
     my @line;
     foreach(@pos) {
       while( $_->[2] !~ m{,} && $_->[0] < $_->[1] ) {
         seek $fh, $_->[0], 0;
-        read $fh, $_->[2], $_->[1]-$_->[0] > $BYTES ? $BYTES : $_->[1]-$_->[0], length $_->[2];
-        $_->[0] = tell $fh;
+        read $fh,
+             $_->[2],         ## "Buffer"
+             $_->[1]-$_->[0] > $BYTES ? $BYTES : $_->[1]-$_->[0],
+             length $_->[2];  ## Length of "Buffer" so text gets added to end
+        $_->[0]+=$BYTES;
       }
       $_->[2] =~ s{^([^,\r\n]+)[,\r\n]*}{};
       push @line, $1;
     }
-    say ${$ofh} join q(,), @line;
+    say {$ofh} join q(,), @line;
   }
 }
 ```
@@ -179,10 +217,7 @@ sub transpose_seek {
       ``` perl
       while( <$fh> ) {
         push ( @pos, [$prev+$BYTES,tell $fh,substr $_,0,$BYTES]) &&
-             (
-               ($pos[-1][0]>$pos[-1][1]) && ($pos[-1][0]=$pos[-1][1]),
-               $prev=tell $fh
-             );
+             ( $prev=tell $fh );
       }
       ```
 
@@ -209,31 +244,34 @@ sub transpose_seek {
             while( $_->[2] !~ m{,} && $_->[0] < $_->[1] ) {
               seek $fh, $_->[0], 0;
               read $fh, $_->[2], $_->[1]-$_->[0] > $BYTES ? $BYTES : $_->[1]-$_->[0], length $_->[2];
-              $_->[0] = tell $fh;
+              $_->[0] += $BYTES;
             } 
       ```
       In this loop we see if the row does not contain a comma AND there is data left... If this is the
       case we have to retrieve more data from the file. We do this by first `seek`ing to the location
       in the file that we need to get data from. We then retrieve the either $BYTES `bytes` of data (or
       all the data left for the row {if it is less than `$BYTES` bytes.}
-      We then update the location for that particular row (using `tell`).
+      We then update the location for that particular row (by adding `$BYTES` we can ignore the fact
+      that we overshot.
 
       Note also we use the 4 parameter version of read.
 
          `read $fh, $buffer, $bytes, $offset`
      
       By adding the offset - we can easily append this content onto the end of our buffer string. We have
-      to use length `$_->[2]` as you can use -ve indecies to read into the buffer with an offset from the
+      to use `length $_->[2]` as you can use -ve indecies to read into the buffer with an offset from the
       end - but this only works for -1, -2 etc not "-0".
 
-    * We then use the regex trick to get the first column of the data.
+    * We then use the regex trick in 2b to get the first column of the data.
 
   * Memory usage:
 
     * This script does not load the file all in one go - so really needs a lot less memory
       (vs more disc accesses). It is linear in the number of lines, e.g. for the 1000 line file we load in
       roughly 1Mb of data at a time, and the memory usage is roughly 1.3Mb.
+
     * Note this is `O(n)` as well as if the rows get longer then the number of bytes used does not increase.
+
     * Having played a bit - the sweet spot of `$BYTES` lies somewhere between 1K and 2K. Smaller makes the
       regex in the split more efficient, larger reduces the file IO overhead.
 
@@ -241,42 +279,48 @@ sub transpose_seek {
 
 The following are timings on a single core, 2G RAM, 4G swap machine:
 
-| Method/size | Time (s) | Kbytes | resident | shared |
-| ----------- | -------: | -----: | -------: | -----: |
-| Seek small  | 0.001 | 16016| 7836| 5228 |
-| Regex small | 0.000 | 16016| 7836| 5228 |
-| Split small | 0.000 | 16016| 7836| 5228 |
-| Seek 1000   | 1.346 | 17388| 9320| 5228 |
-| Seek 2000   | 5.841 | 18848| 10636| 5228 |
-| Seek 5000   | 54.208 | 23044| 14972| 5228 |
-| Regex 1000  | 1.293 | 25492| 17288| 5228 |
-| Seek 30000  | 3003.220 | 57312| 43948| 2720 |
-| Regex 2000  | 9.040 | 63896| 51376| 3140 |
-| Split 1000  | 0.934 | 105784| 93100| 3204 |
-| Regex 5000  | 130.411 | 260432| 248016| 3204 |
-| Split 2000  | 6.780 | 362028| 349388| 3204 |
-| Split 5000  | 527.614 | 2153576| 1423468| 2764 |
+**Timings:**
+
+We list these in order of "memory consumption"...
+
+| Method/size | Time (s)  | Kbytes    | resident  | shared |
+| ----------- | --------: | --------: | --------: | -----: |
+| Seek small  |     0.000 |    16,016 |     7,836 |  5,228 |
+| Regex small |     0.000 |    16,016 |     7,836 |  5,228 |
+| Split small |     0.000 |    16,016 |     7,836 |  5,228 |
+| Seek 1000   |     1.346 |    17,388 |     9,320 |  5,228 |
+| Seek 2000   |     5.841 |    18,848 |    10,636 |  5,228 |
+| Seek 5000   |    54.208 |    23,044 |    14,972 |  5,228 |
+| Regex 1000  |     1.293 |    25,492 |    17,288 |  5,228 |
+| Seek 30000  | 3,003.220 |    57,312 |    43,948 |  2,720 |
+| Regex 2000  |     9.040 |    63,896 |    51,376 |  3,140 |
+| Split 1000  |     0.934 |   105,784 |    93,100 |  3,204 |
+| Regex 5000  |   130.411 |   260,432 |   248,016 |  3,204 |
+| Split 2000  |     6.780 |   362,028 |   349,388 |  3,204 |
+| Split 5000  |   527.614 | 2,153,576 | 1,423,468 |  2,764 |
 
 The size is the number of rows/columns - so the "1000" file has 1000 rows and 1000 columns (+row/column labels).
 
-File sizes:
+**File sizes:**
 
-| name  | size | row size |
-| ----- | -----: | ----: |
-| small | 61 bytes | 12 |
-|  1000 | 6.6 Mbytes | 6.7K |
-|  2000 | 27 Mbytes | 13.5K |
-|  5000 | 165 Mbytes | 33.6K |
-| 30000 | 5.8 Gbytes | 201.0K |
+| name         | rows   | columns | size       | row size |
+| ------------ | -----: | ------: | ---------: | -------: |
+| in-small.txt |      5 |       3 |  61  bytes |       12 |
+|  in-1000.txt |  1,001 |   1,001 | 6.6 Mbytes |     6.7K |
+|  in-2000.txt |  2,001 |   2,001 |  27 Mbytes |    13.5K |
+|  in-5000.txt |  5,001 |   5,001 | 165 Mbytes |    33.6K |
+| in-30000.txt | 30,001 |  30,001 | 5.8 Gbytes |   201.0K |
 
 If we look at the timings by method we can see that for the smaller files the `split` is
 the most efficient {but the difference is relatively small}. But as the file size increases
 then it soon becomes the least efficient:
 
-| Size   | Split | Regex | Seek |
-| -----: | ----: | ----: | ----: |
-| small  | **0.000** | 0.000 | *0.001* |
-| 1000   | **0.934** | 1.293 | *1.346* |
-| 2000   | 6.890 | *9.040* | **5.841** |
-| 5000   | *527.614* | 130.411 | **54.208** |
-| 30000  | - | - | **3003.220** |
+**Comparisons:**
+
+| Size   | Split       | Regex       | Seek          |
+| -----: | ----------: | ----------: | ------------: |
+| small  |   **0.000** |     0.000   |      *0.000*  |
+| 1000   |   **0.934** |     1.293   |      *1.346*  |
+| 2000   |     6.890   |    *9.040*  |     **5.841** |
+| 5000   |  *527.614*  |   130.411   |    **54.208** |
+| 30000  |         -   |         -   | **3,003.220** |
diff --git a/challenge-110/james-smith/perl/ch-2.pl b/challenge-110/james-smith/perl/ch-2.pl
index 4f543ff569..4afa11e49c 100644
--- a/challenge-110/james-smith/perl/ch-2.pl
+++ b/challenge-110/james-smith/perl/ch-2.pl
@@ -52,18 +52,22 @@ my $t0;
    $t0 = time; transpose_split( $FN_LARGE,   'split-5000'  ); say 'Split 5000    - Time: ',sprintf('%13.6f',time-$t0),' ',get_statm_info();
 
 sub transpose_split {
+  ## Slurp into array
   open my $fh, '<', $_[0];
-  my @in =  map { chomp;[ split /,/ ] } <$fh>;                 ## Slurp into array
+  my @in =  map { chomp;[ split /,/ ] } <$fh>;
   close $fh;
+  ## Generate transpose
   open $fh, '>', $_[1];
-  say {$fh} join ',', map {shift @{$_} } @in while @{$in[0]};        ## Generate transpose;
+  say {$fh} join ',', map {shift @{$_} } @in while @{$in[0]};        
   close $fh;
 }
 
 sub transpose_regex {
+  ## Slurp into array
   open my $fh, '<', $_[0];
-  my @in = <$fh>;                                              ## Slurp into array
+  my @in = <$fh>;
   close $fh;
+  ## Generate transpose
   open $fh, '>', $_[1];
   say {$fh} join ',', map { s{^(.*?)[,\r\n]+}{}; $1 } @in while $in[0];
   close $fh;
@@ -71,34 +75,48 @@ sub transpose_regex {
 
 sub transpose_seek {
   my($prev,@pos) = (0);
-  open my $fh, '<', $_[0];
+
+  open my $fh,  '<', $_[0];
   open my $ofh, '>', $_[1];
-  ## Loop through the file and get the start/end of each line
-  push (@pos, [$prev+$BYTES,tell $fh,substr $_,0,$BYTES]) && (($pos[-1][0]>$pos[-1][1])&&($pos[-1][0]=$pos[-1][1]), $prev=tell $fh) while <$fh>;
+  
+  ## Loop through the file and get the start/end position of each line,
+  ## and the first $BYTES characters of each line...
+
+  push ( @pos, [$prev+$BYTES,tell $fh,substr $_,0,$BYTES]) &&
+       ( $prev=tell $fh ) while <$fh>;
+
+  ## While we still have "columns" loop through each row and grab the first
+  ## entry and output results.
 
   while( $pos[0][0] < $pos[0][1] || length $pos[0][2] ) {
     my @line;
     foreach(@pos) {
       while( $_->[2] !~ m{,} && $_->[0] < $_->[1] ) {
         seek $fh, $_->[0], 0;
-        read $fh, $_->[2], $_->[1]-$_->[0] > $BYTES ? $BYTES : $_->[1]-$_->[0], length $_->[2];
-        $_->[0] = tell $fh;
+        read $fh,
+             $_->[2],         ## "Buffer"
+             $_->[1]-$_->[0] > $BYTES ? $BYTES : $_->[1]-$_->[0],
+             length $_->[2];  ## Length of "Buffer" so text gets added to end
+        $_->[0]+=$BYTES;
       }
       $_->[2] =~ s{^([^,\r\n]+)[,\r\n]*}{};
       push @line, $1;
     }
-    say ${$ofh} join q(,), @line;
+    say {$ofh} join q(,), @line;
   }
 }
 
 sub get_statm_info {
+  ## Support function gets the in memory size of the current process
+  ## we use this to show how efficient memory usage is...
   my $error = '';
   my $ref = {};
 
   if( open(_INFO,"</proc/$$/statm") ){
     my @info = split(/\s+/,<_INFO>);
     close(_INFO);
-    return sprintf 'Size: %8d, Resident: %8d, Shared: %8d', $info[0]*4, $info[1]*4, $info[2]*4;
+    return sprintf 'Size: %8d, Resident: %8d, Shared: %8d',
+      $info[0]*4, $info[1]*4, $info[2]*4;
   }
   return '-';
 }
author	Mohammad S Anwar <Mohammad.Anwar@yahoo.com>	2021-04-28 02:16:20 +0100
committer	GitHub <noreply@github.com>	2021-04-28 02:16:20 +0100
commit	c28a1e3774ecf5261b17b9fd40aec26b5dab33f7 (patch)
tree	a03f5f4f5d14ad496e81bd52f3b3211a3c567017
parent	53eec48290ffb6c4b8e4922e4d33d6a60a6be59d (diff)
parent	eed3560d163299335d7afcd3ed2532d00143db09 (diff)
download	perlweeklychallenge-club-c28a1e3774ecf5261b17b9fd40aec26b5dab33f7.tar.gz perlweeklychallenge-club-c28a1e3774ecf5261b17b9fd40aec26b5dab33f7.tar.bz2 perlweeklychallenge-club-c28a1e3774ecf5261b17b9fd40aec26b5dab33f7.zip