aboutsummaryrefslogtreecommitdiff
path: root/challenge-110/james-smith
diff options
context:
space:
mode:
authordrbaggy <js5@sanger.ac.uk>2021-04-27 10:03:14 +0100
committerdrbaggy <js5@sanger.ac.uk>2021-04-27 10:03:14 +0100
commit0b20bcb44f62e12d33cc4080e45f93a47b0cddb6 (patch)
tree179a7d7d53ed35dad9eec3c51b365a1d46bfa1ed /challenge-110/james-smith
parent03f28cae3ddea3b08a671dd3f20f3d32777aa4db (diff)
downloadperlweeklychallenge-club-0b20bcb44f62e12d33cc4080e45f93a47b0cddb6.tar.gz
perlweeklychallenge-club-0b20bcb44f62e12d33cc4080e45f93a47b0cddb6.tar.bz2
perlweeklychallenge-club-0b20bcb44f62e12d33cc4080e45f93a47b0cddb6.zip
files and scripts
Diffstat (limited to 'challenge-110/james-smith')
-rw-r--r--challenge-110/james-smith/perl/ch-1.pl14
-rw-r--r--challenge-110/james-smith/perl/ch-2.pl104
-rw-r--r--challenge-110/james-smith/perl/in.txt5
-rw-r--r--challenge-110/james-smith/perl/phone_nos.txt9
4 files changed, 132 insertions, 0 deletions
diff --git a/challenge-110/james-smith/perl/ch-1.pl b/challenge-110/james-smith/perl/ch-1.pl
new file mode 100644
index 0000000000..f47ec05169
--- /dev/null
+++ b/challenge-110/james-smith/perl/ch-1.pl
@@ -0,0 +1,14 @@
+#!/usr/local/bin/perl
+
+use strict;
+
+use warnings;
+use feature qw(say);
+use Test::More;
+
+print grep { is_valid_phone_number($_) } <>;
+
+sub is_valid_phone_number {
+ return m{\A\s*(?:[+]\d+|00\d+|[(]\d+[)])\s+\d+\s*\Z};
+}
+
diff --git a/challenge-110/james-smith/perl/ch-2.pl b/challenge-110/james-smith/perl/ch-2.pl
new file mode 100644
index 0000000000..9d3b276faa
--- /dev/null
+++ b/challenge-110/james-smith/perl/ch-2.pl
@@ -0,0 +1,104 @@
+#!/usr/local/bin/perl
+
+use strict;
+
+use warnings;
+use feature qw(say);
+use Data::Dumper;
+use Const::Fast qw(const);
+use Time::HiRes qw(time);
+const my $FN_TINY => 'in.txt';
+#const my $FN2 => 'in-large.txt';
+const my $FN_SMALL => 'in-1000.txt';
+const my $FN_MEDIUM => 'in-2000.txt';
+const my $FN_LARGE => 'in-5000.txt';
+const my $FN_MASSIVE => 'in-30000.txt';
+const my $BYTES => 1_024;
+
+select(STDOUT); $| = 1;
+
+## For a small file we can use just about any valid technique...
+## But here I will look at some different techniques which can
+## be more efficient with medium, large data files.
+##
+## Some of these are a trade off between performance and
+##
+## At work we have large 2d arrays of data - e.g. genes vs samples,
+## variations vs genes - often both dimensions are in the range 20,000
+## lines or more. So we need to look at faster - alternative less
+## memory intense solutions
+##
+## The last function transpose_seek - was my solution to speeding
+## up a python script
+##
+## Python 3 -> Python 2 sped things up by a factor or 4, the perl
+## rewrite sped things up by a factor of 100 for the size of arrays
+## we were looking at.
+
+my $t0;
+
+ $t0 = time; transpose_seek( $FN, 'seek-small' ); say 'Seek small ',time-$t0,' ',get_statm_info();
+ $t0 = time; transpose_regex( $FN, 'regex-small' ); say 'Regex small ',time-$t0,' ',get_statm_info();
+ $t0 = time; transpose_split( $FN, 'split-small' ); say 'Split small ',time-$t0,' ',get_statm_info();
+ $t0 = time; transpose_seek( $FN2, 'seek-1000' ); say 'Seek 1000 ',time-$t0,' ',get_statm_info();
+ $t0 = time; transpose_seek( $FN3, 'seek-2000' ); say 'Seek 2000 ',time-$t0,' ',get_statm_info();
+ $t0 = time; transpose_seek( $FN4, 'seek-5000' ); say 'Seek 5000 ',time-$t0,' ',get_statm_info();
+ $t0 = time; transpose_regex( $FN2, 'regex-1000' ); say 'Regex 1000 ',time-$t0,' ',get_statm_info();
+ $t0 = time; transpose_seek( $FN5, 'seek-30000' ); say 'Seek 30000 ',time-$t0,' ',get_statm_info();
+ $t0 = time; transpose_regex( $FN3, 'regex-2000' ); say 'Regex 2000 ',time-$t0,' ',get_statm_info();
+ $t0 = time; transpose_split( $FN2, 'split-1000' ); say 'Split 1000 ',time-$t0,' ',get_statm_info();
+ $t0 = time; transpose_regex( $FN4, 'regex-5000' ); say 'Regex 5000 ',time-$t0,' ',get_statm_info();
+ $t0 = time; transpose_split( $FN3, 'split-2000' ); say 'Split 2000 ',time-$t0,' ',get_statm_info();
+ $t0 = time; transpose_split( $FN4, 'split-5000' ); say 'Split 5000 ',time-$t0,' ',get_statm_info();
+
+sub transpose_split {
+ open my $fh, '<', $_[0];
+ my @in = map { chomp;[ split /,/ ] } <$fh>; ## Slurp into array
+ close $fh;
+ open $fh, '>', $_[1];
+ say {$fh} join ',', map {shift @{$_} } @in while @{$in[0]}; ## Generate transpose;
+ close $fh;
+}
+
+sub transpose_regex {
+ open my $fh, '<', $_[0];
+ my @in = <$fh>; ## Slurp into array
+ close $fh;
+ open $fh, '>', $_[1];
+ say {$fh} join ',', map { s{^(.*?)[,\r\n]+}{}; $1 } @in while $in[0];
+ close $fh;
+}
+
+sub transpose_seek {
+ my($prev,@pos) = (0);
+ open my $fh, '<', $_[0];
+ open my $ofh, '>', $_[1];
+ ## Loop through the file and get the start/end of each line
+ push (@pos, [$prev+$BYTES,tell $fh,substr $_,0,$BYTES]) && (($pos[-1][0]>$pos[-1][1])&&($pos[-1][0]=$pos[-1][1]), $prev=tell $fh) while <$fh>;
+
+ while( $pos[0][0] < $pos[0][1] || length $pos[0][2] ) {
+ my @line;
+ foreach(@pos) {
+ while( $_->[2] !~ m{,} && $_->[0] < $_->[1] ) {
+ seek $fh, $_->[0], 0;
+ read $fh, $_->[2], $_->[1]-$_->[0] > $BYTES ? $BYTES : $_->[1]-$_->[0], length $_->[2];
+ $_->[0] = tell $fh;
+ }
+ $_->[2] =~ s{^([^,\r\n]+)[,\r\n]*}{};
+ push @line, $1;
+ }
+ say ${$ofh} join q(,), @line;
+ }
+}
+
+sub get_statm_info {
+ my $error = '';
+ my $ref = {};
+
+ if( open(_INFO,"</proc/$$/statm") ){
+ my @info = split(/\s+/,<_INFO>);
+ close(_INFO);
+ return sprintf 'Size: %d, Resident: %d, Shared: %d', $info[0]*4, $info[1]*4, $info[2]*4;
+ }
+ return '-';
+}
diff --git a/challenge-110/james-smith/perl/in.txt b/challenge-110/james-smith/perl/in.txt
new file mode 100644
index 0000000000..716ebdce75
--- /dev/null
+++ b/challenge-110/james-smith/perl/in.txt
@@ -0,0 +1,5 @@
+name,age,sex
+Mohammad,45,m
+Joe,20,m
+Julie,35,f
+Cristina,10,f
diff --git a/challenge-110/james-smith/perl/phone_nos.txt b/challenge-110/james-smith/perl/phone_nos.txt
new file mode 100644
index 0000000000..4f167ab35b
--- /dev/null
+++ b/challenge-110/james-smith/perl/phone_nos.txt
@@ -0,0 +1,9 @@
+0044 1148820341
+ +44 1148820341
+ 44-11-4882-0341
+(44) 1148820341
+ (44) 1148820341
+ 00 1148820341
+00 1243143+1414
+ () 342432
+(00) 12321-4324