#! /usr/bin/perl

################################ doHaul.plx ######################################
#
# Scott McCue
# WHOI/MCG
# Oct 2005
#
# A script to convert a spreadsheet [a single spreadsheet saved to a individual
# file of comma separated values (.csv)] into a flat file for use with the
# Globec database.
#
# Prior to running this script, the target spreadsheet must be saved to a .csv file.
#
# For now, edit the target spreadsheet's name to be $infile (below). An outfile
# name will be generated by the script. A desireable code improvement would be for
# the script to accept a filelist and process the entire list.
#
# Source file for which this was developed: NECdataPolGillnet.xls via M.D. Allison
# 
# From that spreadsheet the worksheets WinterHaulData, FebHaulData, & MayHaulData
# were saved as separate .csv files. The content of these worksheets were unchanged
# with one important exception. See note 1 below.
#
# Note 1. IMPORTANT
#
#  The worksheet called "WinterHaulData" contains an unlabeled column of numbers in
# column C. The others did not. To avoid unecessary exception handling and extra
# programming, this column was deleted from the working form of the worksheet before
# the worksheet was aved out to a .cav file.
#
##################################################################################


# Assign the hashes
%fh = (); %hh = (); @trips = ();

# @infiles=qw(WinterHaulData.csv FebHaulData.csv MayHaulData.csv);

$infile="MayHaulData.csv";

# Create an outfile name.
$twn = $infile; $twn =~ m/(\w+).csv$/; $outfile = sprintf("%s_ff.dat", $1);

# Open inout/output files and get filehandles
open HAULINF, "<$infile" or die "Couldnt open $infile\n";
open OUTF, ">$outfile" or die "Couldnt open $outfile\n";

# Read entire worksheet into array and then close file.
# 'hhd' stands for haulheadersanddata
@hhd = <HAULINF>;  close HAULINF;

# Process each line of the file/array.
# Use one of the classic case-switch like structures from "Programming Perl".
foreach (@hhd) {

   # Case-switch for the worksheet. Data and meta-data are handled by the one block.
   HEADER_OR_DATA_CASE: {

      # Header 1: The (top) line giving the boat name.

       (/Vessel/ or /Irene/ or /Sasquatch/) && do {
            (@vesselsraw) = split(/,/);
            @vessels = grep(/Lady Irene|Sasquatch|M. Brandon/,@vesselsraw);
            $sizev = @vessels;
            last HEADER_OR_DATA_CASE;
       };

      # Header 2: The line giving trip names to this vessel/date combo

       /Trip ID/ && do {
            (@tripsraw) = split(/,/);
            @trips = grep(/GN\d{2,3}/,@tripsraw);
            $sizet = @trips;
            last HEADER_OR_DATA_CASE;
       };

     # Header 3: Date line

       /Date/ && do {
            (@datesraw) = split(/,/);
            @rdates = grep(/\d{1,2}\/\d{1,2}\/\d{2,4}/, @datesraw);
            foreach $rdate (@rdates) {$date = standardize_date_format($rdate); push @dates, $date;}
            $sized = @dates;
            last HEADER_OR_DATA_CASE;
       };

     # Header 4: Haul order
       /Haul,/ && do {
            (@haul_ordersraw) = split(/,/);
	    @haul_orders = grep(/\d{1}/, @haul_ordersraw);
            $sizeh = @haul_orders;
            last HEADER_OR_DATA_CASE;
       };

     # Header 5: Treatment type
       /Treatment/ && do {
            (@treatmentsraw) = split(/,/);
	    @treatments = grep(/[A-D]/, @treatmentsraw);
            @treatmentsA = grep(/^A$/,@treatmentsraw); # print "A: $#treatmentsA\n";
            @treatmentsB = grep(/^B$/,@treatmentsraw); # print "B: $#treatmentsB\n";
            @treatmentsC = grep(/^C$/,@treatmentsraw); # print "C: $#treatmentsB\n"; 
            @treatmentsD = grep(/^D$/,@treatmentsraw); # print "D: $#treatmentsB\n"; 

	    $treatmentsA = @treatmentsA;
	    $treatmentsB = @treatmentsB;
	    $treatmentsC = @treatmentsC;
	    $treatmentsD = @treatmentsD;

            $totaltreatments = $treatmentsA + $treatmentsB + $treatmentsC + $treatmentsD;

            last HEADER_OR_DATA_CASE;
       };

     # Header 6: start time
       /begin/ && do {
            (@starttimesraw) = split(/,/);
            @starttimes = grep(/\d{1,2}\/\d{1,2}\/\d{2,4}\s*\d{1,2}:\d{1,2}|\d{1,2}\/\d{1,2}\/\d{2,4}/,@starttimesraw);
            $sizest = @starttimes;
            last HEADER_OR_DATA_CASE;
       };

     # Header 7: end time
       /end/ && do {
            (@endtimesraw) = split(/,/);
            @endtimes = grep(/\d{1,2}\/\d{1,2}\/\d{2,4}\s*\d{1,2}:\d{1,2}|\d{1,2}\/\d{1,2}\/\d{2,4}/,@endtimesraw);
            $sizeet = @endtimes;
            last HEADER_OR_DATA_CASE;
       };

     # Header 8: durations
       /Duration/ && do {
            (@durationsraw) = split(/,/);
            @durations = grep(/\d/, @durationsraw);
            $sizedu = @durations;
            last HEADER_OR_DATA_CASE;
       };

     # Intermittent header, not in all worksheets: set latitude
       /Set Lat/ && do {
	   (@latsraw) = split(/,/);
           @lats = grep(/\d{5}/, @latsraw);
           $sizelat = @lats;
           last HEADER_OR_DATA_CASE;
       };

     # Intermittent header, not in all worksheets: set longitude
       /Set Lon/ && do {
	   (@lonsraw) = split(/,/);
           @lons = grep(/\d{5}/, @lonsraw);
           $sizelon = @lons;
           last HEADER_OR_DATA_CASE;
       };

     # Footer 1: Count of species caught on that trip/treatment
       /COUNT/ && do {
	    (@countsraw) = split(/,/);
            @counts = grep(/\d/, @countsraw);
            $sizeco = @counts;
            last HEADER_OR_DATA_CASE;
	};

     # Data lines will have either a K or a D in the second column.
     # There won't always be both. Treat each case separately.
  
     # Assumes that the headers have been processed already. They appear
     # earlier in the file.

       /,[Kk],/ && do {
	   ($fishtype, $KorD, @vals) = split(/,/);
            foreach $i (0 .. $#trips) {
              if ($KorD eq 'K' or $KorD eq 'k') {
#              if ($KorD =~ m/[kK]/) {
	       unless ($vals[$i] =~ m/\d/) {$vals[$i] = "nd";}
	       $fh{$trips[$i]}{$treatments[$i]}{$fishtype}{'K'}{Val} = $vals[$i];
	     } # if $KorD
	   }  # foreach

          last HEADER_OR_DATA_CASE;
       };

       /,[Dd],/ && do {
	   ($fishtype, $KorD, @vals) = split(/,/);
            foreach $i (0 .. $#trips) {
              if ($KorD eq 'D' or $KorD eq 'd') {
#              if ($KorD =~ m/[dD]/) {
	       unless ($vals[$i] =~ m/\d/) {$vals[$i] = "nd";}
	       $fh{$trips[$i]}{$treatments[$i]}{$fishtype}{'D'}{Val} = $vals[$i];
	     } # if $KorD
	   }  # foreach

          last HEADER_OR_DATA_CASE;
       };

  }; # HEADER_OR_DATA_CASE

} # foreach line of infile


# Now print out the parsed meta-data and data into the form Dicky expects (she sent an
# example). There are two hashes, %hh (metadata) and %fh (data), that have the fields
# and the relations between them.

# Dicky might be able to force the various date and time formats into what she wants, but
# it's a useful exercise to make it happen here. Subroutines at the end of the script.


foreach $i (0 .. $#trips) {
      $hh{$trips[$i]}{$treatments[$i]}{Vessel} = $vessels[$i];
      $hh{$trips[$i]}{$treatments[$i]}{Date} = standardize_date_format($dates[$i]);
      $hh{$trips[$i]}{$treatments[$i]}{Haul} = $haul_orders[$i];
      $hh{$trips[$i]}{$treatments[$i]}{Start} = standardize_time_format($starttimes[$i]);
      $hh{$trips[$i]}{$treatments[$i]}{End} = standardize_time_format($endtimes[$i]);
      $hh{$trips[$i]}{$treatments[$i]}{Duration} = $durations[$i];
      $hh{$trips[$i]}{$treatments[$i]}{Count} = $counts[$i];
      if (defined($lats[$i])) {
         $hh{$trips[$i]}{$treatments[$i]}{Lat} = $lats[$i];
         $hh{$trips[$i]}{$treatments[$i]}{Lon} = $lons[$i];
     } else {
         $hh{$trips[$i]}{$treatments[$i]}{Lat} = "nd";
         $hh{$trips[$i]}{$treatments[$i]}{Lon} = "nd";
     }
}


print STDERR "Processing $infile to $outfile\n";


printf OUTF "ShipID\tTripID\tDate\tLongitude\tLatitude\tHaul\tTreatment\tSet_begin\tHaul_end\tDuration\tTotal_Species_Caught\tSpecies\K/D\tWeight\n";
printf OUTF "==================================================================================================================================================\n";


foreach $trip (sort keys %fh) {
  foreach $treat (sort keys %{$fh{$trip}}) {
    foreach $fish (sort keys %{$fh{$trip}{$treat}}) {
	foreach $KorD (reverse sort keys %{$fh{$trip}{$treat}{$fish}}) {
	printf OUTF "$hh{$trip}{$treat}{Vessel}\t$trip\t$hh{$trip}{$treat}{Date}\t$hh{$trip}{$treat}{Lon}\t$hh{$trip}{$treat}{Lat}\t$hh{$trip}{$treat}{Haul}\t$treat\t$hh{$trip}{$treat}{Start}\t$hh{$trip}{$treat}{End}\t$hh{$trip}{$treat}{Duration}\t$hh{$trip}{$treat}{Count}\t$fish\t$KorD\t$fh{$trip}{$treat}{$fish}{$KorD}{Val}\n";
#	    printf OUTF "$hh{$trip}{$treat}{Vessel}\t$trip\t$hh{$trip}{$treat}{Date}\t$hh{$trip}{$treat}{Start}\t$hh{$trip}{$treat}{End}\t$hh{$trip}{$treat}{Duration}\n";
    }
    }
}
}

close OUTF;
	
######################################### Subroutines ###############################################
######################################### Subroutines ###############################################
######################################### Subroutines ###############################################

sub standardize_date_format {

    $in = $_[0];

   # Standard form of output is mm/dd/yyyy

   # Order of treatment
   #  1. m/d/yy
   #  2. mm/d/yy
   #  3  m/dd/yy
   #  4. mm/dd/yy
   #  5. m/d/yyyy
   #  6. mm/d/yyyy
   #  7. m/dd/yyyy


    if ($in =~ m/\b(\d{1})\/(\d{1})\/(\d{2})$/) {$yr = sprintf("20%2.2d", $3); $out = sprintf("%2.2d/%2.2d/%4.4d", $1, $2, $yr);}
    elsif ($in =~ m/\b(\d{2})\/(\d{1})\/(\d{2})$/) {$yr = sprintf("20%2.2d", $3); $out = sprintf("%2.2d/%2.2d/%4.4d", $1, $2, $yr);}
    elsif ($in =~ m/\b(\d{1})\/(\d{2})\/(\d{2})$/) {$yr = sprintf("20%2.2d", $3); $out = sprintf("%2.2d/%2.2d/%4.4d", $1, $2, $yr);}
    elsif ($in =~ m/\b(\d{2})\/(\d{2})\/(\d{2})$/) {$yr = sprintf("20%2.2d", $3); $out = sprintf("%2.2d/%2.2d/%4.4d", $1, $2, $yr);}
    elsif ($in =~ m/\b(\d{1})\/(\d{1})\/(\d{4})$/) {$out = sprintf("%2.2d/%2.2d/%4.4d", $1, $2, $3);}
    elsif ($in =~ m/\b(\d{2})\/(\d{1})\/(\d{4})$/) {$out = sprintf("%2.2d/%2.2d/%4.4d", $1, $2, $3);}
    elsif ($in =~ m/\b(\d{1})\/(\d{2})\/(\d{4})$/) {$out = sprintf("%2.2d/%2.2d/%4.4d", $1, $2, $3);}
    elsif ($in =~ m/\b(\d{2})\/(\d{2})\/(\d{4})\b/) {$out = sprintf("%2.2d/%2.2d/%4.4d", $1, $2, $3);}
    else { $out= "nd";}

    return ($out);
}

sub standardize_time_format {

    $in = $_[0];

   # Standard form of output is mm/dd/yyyy hh:mm
   # Split date and time into two portions and treat separately

   # Order of treatment of date
   #  1. m/d/yy
   #  2. mm/d/yy
   #  3  m/dd/yy
   #  4. mm/dd/yy
   #  5. m/d/yyyy
   #  6. mm/d/yyyy
   #  7. m/dd/yyyy
   #  8. mm/dd/yyyy
   #  9. missing

   # Order of treatment for time
   #  1. h:m
   #  2. h:mm
   #  3  hh:m
   #  4  hh:mm
   #  5 missing

 
$in =~ m/(\d{1,2}\/\d{1,2}\/\d{2,4})\s*(\d{1,2}:\d{1,2})|(\d{1,2}\/\d{1,2}\/\d{2,4})/; $indate = $1; $intime = $2;  

#    print "$in\t$indate\t$intime\n";

# Date
    if ($indate =~ m/\b(\d{1})\/(\d{1})\/(\d{2})\b/) {$yr = sprintf("20%2.2d", $3); $outd = sprintf("%2.2d/%2.2d/%4.4d", $1, $2, $yr);}
    elsif ($indate =~ m/\b(\d{2})\/(\d{1})\/(\d{2})\b/) {$yr = sprintf("20%2.2d", $3); $outd = sprintf("%2.2d/%2.2d/%4.4d", $1, $2, $yr);}
    elsif ($indate =~ m/\b(\d{1})\/(\d{2})\/(\d{2})\b/) {$yr = sprintf("20%2.2d", $3); $outd = sprintf("%2.2d/%2.2d/%4.4d", $1, $2, $yr);}
    elsif ($indate =~ m/\b(\d{2})\/(\d{2})\/(\d{2})\b/) {$yr = sprintf("20%2.2d", $3); $outd = sprintf("%2.2d/%2.2d/%4.4d", $1, $2, $yr);}
    elsif ($indate =~ m/\b(\d{1})\/(\d{1})\/(\d{4})\b/) {$outd = sprintf("%2.2d/%2.2d/%4.4d", $1, $2, $3);}
    elsif ($indate =~ m/\b(\d{2})\/(\d{1})\/(\d{4})\b/) {$outd = sprintf("%2.2d/%2.2d/%4.4d", $1, $2, $3);}
    elsif ($indate =~ m/\b(\d{1})\/(\d{2})\/(\d{4})\b/) {$outd = sprintf("%2.2d/%2.2d/%4.4d", $1, $2, $3);}
    elsif ($indate =~ m/\b(\d{2})\/(\d{2})\/(\d{4})\b/) {$outd = sprintf("%2.2d/%2.2d/%4.4d", $1, $2, $3);}
    else { $outd = "nd";}

# Time
    if ($intime =~ m/\b(\d{1}):(\d{1})\b/) {$outt = sprintf("%2.2d:%2.2d", $1, $2);}
    elsif ($intime =~ m/\b(\d{1}):(\d{2})\b/) {$outt = sprintf("%2.2d:%2.2d", $1, $2);}
    elsif ($intime =~ m/\b(\d{2}):(\d{1})\b/) {$outt = sprintf("%2.2d:%2.2d", $1, $2);}
    elsif ($intime =~ m/\b(\d{2}):(\d{2})\b/) {$outt = sprintf("%2.2d:%2.2d", $1, $2);}
    else { $outt = "nd";}

    $outboth = sprintf("%s %s", $outd, $outt);

    return ($outboth);
 
}

