#!/usr/bin/perl
# GrepCount.pl -- Count instances of one or more regexps
#
# Reads a list of regular expressions (one per line) from the -g file, counts
# how many input lines each expression matches, and writes the expressions
# ordered from most to least frequently matched.  With -Q only the ordered
# expressions are written, which makes the output usable as an 'egrep -f' file.

use strict;
use warnings;
use Getopt::Std;    # Use Perl5 built-in program argument handler

my $ver = "v1.0";    # 2003-03-21 JP Vossen

##########################################################################
(my $myname = $0) =~ s{^.*[/\\]}{};    # remove up to last "\" or "/"

my $Greeting = "$myname $ver Copyright 2003 JP Vossen (http://www.jpsdomain.org/)\n";
$Greeting .= " Licensed under the GNU GENERAL PUBLIC LICENSE:\n";
$Greeting .= " See http://www.gnu.org/copyleft/gpl.html for full text and details.\n";

# Only a whole argument asks for help.  Matching substrings of the joined
# argument list would fire on ordinary file names such as "my-hosts.log".
if (grep { /\A(?:-h|--?help|[-\/]?\?)\z/ } @ARGV) {
    print STDERR "\n$Greeting\n\n";
    print STDERR <<"EoN";    # Usage notes
Usage: $myname [OPTIONS] (-i [FILE]) (-o [FILE]) -g [FILE] (-q)

   -i {infile}   = Use infile as the input file, otherwise use STDIN.
   -o {outfile}  = Use outfile as the output file, otherwise use STDOUT.
   -g {grepfile} = Grepfile contains the regular exp. to count in the input.
   -Q            = Only print the regular expressions, do NOT print the count. *
   -q            = Be quiet about it.

Count the frequency of a list of regular expressions in arbitrary input data
so that a file for use with 'egrep -f' can have the most frequent expressions
first so it runs faster.

* Do something like the following to take a list of regexps and put them in
  the fastest order.
      $myname -i input.txt -g grep.dat -Qo drops.dat
EoN
    die "\n";
}    # end of usage

# %regex_for = Hash of compiled RegExps to count, keyed by their source text
# %count_of  = Hash of source-text RegExps and the number of lines they matched

my %opt;
getopts('i:o:g:WQq', \%opt);    # Define possible args (-W is accepted but unused).

# A missing or empty -i/-o means STDIN/STDOUT; "-" means the same explicitly.
my $in_path  = (defined $opt{i} && length $opt{i}) ? $opt{i} : "-";
my $out_path = (defined $opt{o} && length $opt{o}) ? $opt{o} : "-";

# -g is mandatory: without it there is nothing to count.
if (!defined $opt{g} || !length $opt{g}) {
    die "$myname: no grep file given; use -g {grepfile} (try -h for usage)\n";
}
my $grep_path = $opt{g};

# Open everything up front so a bad path fails before any work is done.
open(my $grep_fh, '<', $grep_path)
    or die "$myname: error opening '$grep_path' for input: $!\n";
my $in_fh  = open_input($in_path);
my $out_fh = open_output($out_path);

if (!$opt{q}) { print STDERR "\n$Greeting\n"; }

# Load the RegExps to count
my (%regex_for, %count_of);
while (my $pattern = <$grep_fh>) {
    chomp($pattern);

    # pre-compile the regex and load it into the hash; a bad expression is
    # reported with the grep file and line it came from
    my $compiled = eval { qr/$pattern/ };
    if (!defined $compiled) {
        die "$myname: invalid regular expression at '$grep_path' line $.: $@";
    }
    $regex_for{$pattern} = $compiled;
    $count_of{$pattern}  = 0;
}    # end of while regex input
close($grep_fh);

# The pattern list does not change while counting, so build it only once.
my @patterns = keys %regex_for;

while (my $line = <$in_fh>) {
    chomp($line);
    foreach my $pattern (@patterns) {
        # If we have a regex match, count it
        if ($line =~ m/$regex_for{$pattern}/) { $count_of{$pattern}++; }
    }
}    # end of while input

# Write the output, descending sort by # of events.  Every loaded expression
# is listed, so expressions NOT found in the source data are still output.
# Equal counts are ordered by expression text to keep the output repeatable.
foreach my $pattern (sort { $count_of{$b} <=> $count_of{$a} or $a cmp $b } @patterns) {
    if ($opt{Q}) {
        print {$out_fh} "$pattern\n";    # Just the expressions, in order
    }
    else {
        # Expressions that never matched keep an empty count field.
        my $shown = $count_of{$pattern} ? $count_of{$pattern} : "";
        print {$out_fh} "$shown\t$pattern\n";    # Count \t Exp.
    }    # end of if sort-of quiet
}    # end of foreach output

# Buffered write errors (full disk, closed pipe) only surface at close.
close($out_fh) or die "$myname: error closing '$out_path': $!\n";

if (!$opt{q}) {
    print STDERR "\n\a$myname finished in ", time() - $^T, " seconds.\n";
}

# Return a read handle for $path, where "-" means STDIN.  Dies on failure.
sub open_input {
    my ($path) = @_;
    return \*STDIN if $path eq "-";
    open(my $fh, '<', $path)
        or die "$myname: error opening '$path' for input: $!\n";
    return $fh;
}

# Return a write handle for $path, where "-" means STDOUT.  Dies on failure.
sub open_output {
    my ($path) = @_;
    return \*STDOUT if $path eq "-";
    open(my $fh, '>', $path)
        or die "$myname: error opening '$path' for output: $!\n";
    return $fh;
}