#!/usr/bin/perl # MergeL.pl -- Merge arbitrary lines in arbitrary files #$ver = "v1.0"; # 2002-12-30 JPV #$ver = "v1.1"; # 2002-12-31 JPV Added check for num. of input files. #$ver = "v1.2"; # 2003-01-02 JPV Added filename globbing #$ver = "v1.3"; # 2003-01-05 JPV Fixed "no delimiter" warning #$ver = "v1.3a"; # 2003-01-13 JPV Updated "no delimiter" warning $ver = "v1.3b"; # 2021-04-14 JPV Syntax error tweak ########################################################################## (($myname = $0) =~ s/^.*(\/|\\)//ig); # remove up to last "\" or "/" $Greeting = ("$myname $ver Copyright 2003 JP Vossen (http://www.jpsdomain.org/)\n"); $Greeting .= (" Licensed under the GNU GENERAL PUBLIC LICENSE:\n"); $Greeting .= (" See http://www.gnu.org/copyleft/gpl.html for full text and details.\n"); if ( "@ARGV" =~ /^\?|^-h$|^--help$/ ) { print STDERR ("\n$Greeting\n\tUsage:\t$myname (options) (-o {outfile}) file1 file2 (file3...)\n"); print STDERR ("\t\t$myname (options) (-o {outfile}) file1.in file*.t?t\n\n"); print STDERR (" -o {outfile} = Use outfile as the output file, otherwise use STDOUT.\n"); print STDERR (" -d = Input record delimiter. Default is a TAB. *\n"); print STDERR (" -D = Output record delimiter. Default is a TAB. *\n"); print STDERR (" -k = Key field, starting from zero. Default is zero.\n"); print STDERR (" -K = Only print the keyfield in the output ONCE (first column).\n"); print STDERR (" -w = Do not warn about missing delimiters, and process record anyway.\n"); print STDERR (" -L = No input files have a label (header) row. Default is ALL do.\n"); print STDERR (" -q = Be quiet about it.\n"); print STDERR <<"EON"; # Various notes * Be careful using regex metacharacters like '|' as delimiters! You have to escape them, like -d '\|'... This program is similar to the UNIX join command, but is intended more to process TAB delimited files cut from and pasted back into a spreadsheet. It creates a table or matrix of data, where each line is merged on a unique key (hence MergeL). EON die ("\n"); } use Getopt::Std; # Use Perl5 built-in program argument handler getopts('o:d:D:k:KwLq'); # Define possible args. # Set defaults and use better names $delimiter_in = $opt_d || "\t"; $delimiter_out = $opt_D || "\t"; $keyfield = $opt_k || 0; $nokeyfielddups = $opt_K || undef; $noheaders = $opt_L || undef; if (! $opt_o) { $opt_o = "-"; } # If no output file specified, use STDOUT open (OUTFILE, ">$opt_o") or die ("$myname: error opening '$opt_o' for output: $!\n"); ########################################################################## # Get input if (! $opt_q) { print STDERR ("\n$Greeting\n"); } # Track the length of the records (so we can line up columns later) $MaxLineLen = 0; foreach $file (@ARGV) { if ($file =~ m/\*/) { push (@filearray, glob ("$file")); } elsif ($file =~ m/\?/) { push (@filearray, glob ("$file")); } else { push (@filearray, $file); } # end of glob files } # end of foreach file to glob @ARGV = @filearray if @filearray; if (@ARGV < 2) { die ("$myname: You must have at least 2 input files to merge!\n"); } foreach $infile (@ARGV) { open (INFILE, "$infile") or die ("$myname: error opening '$infile' for input: $!\n"); $ThisFileMaxLineLen = 0; # So far, we don't know how long the records are %filekeys = (); # This is to find duplicate keys in 1 file unless ($noheaders) { # Process the headers if not using -L chomp($aline = ); # If line processing returns successfully, add the header data if (&process_line ("header")) { $header .= $prefix . $aline; } } # end of no headers while ($aline = ) { # Process the records chomp($aline); # If line processing returns successfully, add the record to the hash if (&process_line) { $linehash{@arecord[$keyfield]} .= $prefix . $aline; } } # end of while input $MaxLineLen += $ThisFileMaxLineLen; # Track MaxLineLen for columns } # end of foreach input file ########################################################################## # Write output unless ($noheaders) { if ($nokeyfielddups) { # Keyfield is only printed ONCE, in 1st column print OUTFILE ("Keyfield$delimiter_out$header\n"); } else { # Otherwise print fields as they are in input print OUTFILE ("$header\n"); } # end of if no key field dups } # end of unless noheaders foreach $aline (sort keys %linehash) { if ($nokeyfielddups) { # Keyfield is only printed ONCE, in 1st column print OUTFILE ("$aline$delimiter_out$linehash{$aline}\n"); } else { # Otherwise print fields as they are in input print OUTFILE ("$linehash{$aline}\n"); } # end of if no key field dups } # end of foreach output record if (! $opt_q) { print STDERR ("\n\a$myname finished in ",time()-$^T," seconds.\n"); } # END of Main ########################################################################## #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ sub process_line { # Clunky method to process the correct stuff, since the header is # separate from the record array my $inheader = @_[0]; if ($inheader ne "header") { $inheader = undef; } # Warn about missing field delimiters if (($aline !~ m/$delimiter_in/) and (! $opt_w)) { warn ("$myname: Skipped Record--Missing delimiter(s) in '$infile' record ~$aline~! Try -d or -w.\n"); return (0); # Exit the sub right now and return a failure code } # end of missing key field check # Convert to whatever the output delimiter is. Odd things can # happen if delimiters are regex meta characters... :-( $aline =~ s/$delimiter_in/$delimiter_out/g; ############## Parse the current record ############# # Get record length too. Note -1 "chunking" to get trailing empty fields! $CurrentLineLen = (@arecord = split(/$delimiter_out/, $aline, -1)); # Warn about missing key fields if (@arecord[$keyfield] eq "") { warn ("$myname: Skipped Record--Missing keyfield in '$infile' record ~$aline~!\n"); return (0); # Exit the sub right now and return a failure code } # end of missing key field check # Warn about duplicate key fields if (defined $filekeys{@arecord[$keyfield]}) { # If we get a duplicate, warn about it and return as a failure warn ("$myname: Skipped Record--Duplicate keyfield in '$infile' record ~$aline~! Try -k.\n"); return (0); # Exit the sub right now and return a failure code } else { # Otherwise add this good key to the hash $filekeys{@arecord[$keyfield]}++; } # end of duplicate key field check # Remove all key fields if using -K if ($nokeyfielddups) { my @temparecord = @arecord; # Use a temp record to not screw up the real one splice (@temparecord, $keyfield, 1); # Remove the key field $aline = join ($delimiter_out, @temparecord); # Rebuild "$aline" $CurrentLineLen--; # Decrement line len } # end of no key field dups # Keep track of the longest line (in case lines are irregular length) # We need to line up the fields in the right columns or else the # program is useless. if ($CurrentLineLen > $ThisFileMaxLineLen) { $ThisFileMaxLineLen = $CurrentLineLen; } # end of keep track of longest line # Parse the EXISTING record and learn how long it is # Clunky method to process the correct stuff, since the header is # separate from the record array if ($inheader) { # Note -1 "chunking" to get trailing empty fields! $RecLineLen = split(/$delimiter_out/, $header, -1); } else { # Note -1 "chunking" to get trailing empty fields! $RecLineLen = split(/$delimiter_out/, $linehash{@arecord[$keyfield]}, -1); } # end of if processing a header # Figure out the prefix, so we get the columns lined up correctly # We need to line up the fields in the right columns or else the # program is useless. if ($MaxLineLen == 0) { # We're processing the first file so no prefix is needed yet $prefix = ""; } elsif ($RecLineLen == 0) { # We have a record in THIS file that was NOT in the previous file(s) # so we need to pad out the line to keep columns correct BUT # we do NOT need an extra delimiter as a field separator $prefix = $delimiter_out x ($MaxLineLen - $RecLineLen); } elsif (($MaxLineLen - $RecLineLen) == 0) { # We're processing other files AND # the existing records are already the correct length, SO # we just need to add 1 delimiter to separate fields $prefix = $delimiter_out; } elsif (($MaxLineLen - $RecLineLen) > 0) { # We're processing other files AND the existing record is too short # so we need to pad out the line, to keep columns correct # AND we do need an extra delimiter as a field separator $prefix = $delimiter_out x (($MaxLineLen - $RecLineLen) + 1); } else { # Should never get here... warn ("$myname: Unexpected situation in sub process_line 'prefix' code in\n"); warn ("'$infile' record ~$aline~! Do you have the correct input delimiter (-d)?\n"); } # end of setup prefix if (1==0) { # cheesy debug code warn "\ninfile ~$infile~\n"; warn "aline ~$aline~\n"; warn "ExistingRecord ~$linehash{@arecord[$keyfield]}~\n"; warn "RecLineLen ~$RecLineLen~\n"; warn "MaxLineLen ~$MaxLineLen~\n"; warn ("MaxLn - RecLn ~", ($MaxLineLen - $RecLineLen), "~\n"); warn "CurrentLineLen ~$CurrentLineLen~\n"; warn "ThisFileMaxLineLen ~$ThisFileMaxLineLen~\n"; $junk = $prefix; $junk =~ s/\t/t/g; warn "prefix ~$junk~\n"; } # end of cheesy debug return (1); # Return a success code } # end of sub process_line #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++