#!/bin/perl
#
# stat.pl
# Summarize the log file specified on the command line or read from stdin
#
# (C) Copyright 2000-2002 Diomidis Spinellis
#
# Permission to use, copy, and distribute this software and its
# documentation for any purpose and without fee is hereby granted,
# provided that the above copyright notice appear in all copies and that
# both that copyright notice and this permission notice appear in
# supporting documentation.
#
# THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
#
# $Id: stat.pl 1.6 2002/08/31 17:06:30 dds Exp $
#

use strict;
use warnings;
# Log records may legitimately lack fields (the script itself reports
# "No result for ..."); such values are counted under an empty key, so
# the corresponding warning categories are silenced.
no warnings qw(uninitialized numeric);

# Reporting mode:
#   0      separate results into OK, ERR; merge small domains
#   1      separate results into OK, NET, PTH; merge small domains
#   other  keep the raw result codes and also report every domain (prdom)
my $full = 1;

my $debug = 0;

# Population handed to countstat() for the per-article statistics.
# NOTE(review): presumably the total number of articles examined, including
# those that contain no URL at all -- confirm against the data set.
my $ARTICLE_POPULATION = 2471;

# Parser state shared between the main loop and update_stats().
my $inftp;          # true while the lines of an FTP transfer are being read
my $url;            # URL as it appeared on the most recent "URL" line
my $gurl;           # URL the current record is accounted under
my $moved_url;      # first URL of a redirection chain, if any
my $final_url;      # URL of the last non-redirect HTTP response
my $result;         # numeric result code of the current record
my ($year, $ref, $source);  # fields taken from the most recent "REF" line

# Accumulated statistics.
my %word;           # result code => status text last seen with it
my %errcount;       # result code => number of records
my %count;          # statistic name => bucket => result label => count
my %refcount;       # referring document => number of URLs
my %urlcount;       # URL => number of occurrences

while (<>) {
    # chomp (not chop): a final line without a newline keeps its last character
    chomp;

    # An FTP record has no terminating line of its own; it is accounted for
    # when the next REF line shows up.
    if (/^REF/ && $inftp) {
        update_stats();
        $inftp = 0;
    }

    if (/URL (ftp\:.*)/) {
        $gurl = $url = $1;
        $inftp = 1;
        undef $result;
    }
    elsif (/^getting ([^ ]*) 30\d /) {
        # getting http://www.cs.umd.edu:80/projects/hcil 301 Moved Permanently
        # Only remember the original redirection URL
        $moved_url = $1 unless (defined $moved_url);
    }
    elsif (/^REF .\/(co(\d\d\d\d)\/.*)/) {
        # REF ./co1999/ry136.html
        $year = $2;
        $ref = $1;
        $source = 'computer';
    }
    elsif (/^REF \d+ .\/((\d\d\d\d)\-.*)/) {
        # REF 691 ./1995-38-11/p19-shneiderman/p19-shneiderman.pdf.txt
        $year = $2;
        $ref = $1;
        $source = 'cacm';
    }
    elsif (/^URL (http\:.*)/) {
        # URL http://www.cs.umd.edu/projects/hcil
        $url = $1;
    }
    elsif ($inftp && /^226/) {
        $result = 200;
    }
    elsif ($inftp && /^550.*plain file/) {
        $result = 200;
    }
    elsif ($inftp && /^550/) {
        $result = 404;
    }
    elsif ($inftp && /^ftp:/) {
        $result = 504;
    }
    elsif (/^getting ([^ ]*) ([^3]\d\d) (.*)/ && !$inftp) {
        # getting http://www.cs.umd.edu:80/projects/hcil 200 OK
        $final_url = $1;        # Might be malformed
        $result = $2;
        $word{$result} = $3;
        # A redirected request is accounted under the URL first asked for
        if ($moved_url) {
            $gurl = $moved_url;
            undef $moved_url;
        }
        else {
            $gurl = $final_url;
        }
        update_stats();
    }
}

# Flush an FTP record that was still pending when the input ended; without
# this the last FTP URL of a log would never be counted.
if ($inftp) {
    update_stats();
    $inftp = 0;
}

# Result code, its status text, and how many records produced it
for my $code (sort keys %word) {
    print "$code $word{$code}\t$errcount{$code}\n";
}

# Merge small domains: domains with fewer than 20 records go into "other"
for my $domain (keys %{$count{prdom}}) {
    my $by_result = $count{prdom}{$domain};
    my $total = 0;
    $total += $_ for values %{$by_result};

    for my $label (keys %{$by_result}) {
        if ($total < 20) {
            $count{maindom}{other}{$label} += $by_result->{$label};
        }
        else {
            $count{maindom}{$domain}{$label} = $by_result->{$label};
            if ($full == 1) {
                # Make sure every label exists so the printed columns line up
                $count{maindom}{$domain}{'NET'} += 0;
                $count{maindom}{$domain}{' OK'} += 0;
                $count{maindom}{$domain}{'PTH'} += 0;
            }
        }
    }
}

# Column header: the result labels recorded for the "com" domain
print join("\t", sort keys %{$count{prdom}{com}}), "\n";

for my $stat (sort keys %count) {
    next if ($stat eq 'prdom' && $full <= 1);
    print "$stat\n";
    for my $base (sort keys %{$count{$stat}}) {
        my $total = 0;
        $total += $_ for values %{$count{$stat}{$base}};

        print "$base" if ($full);
        for my $label (sort keys %{$count{$stat}{$base}}) {
            my $n = $count{$stat}{$base}{$label};
            if ($full) {
                print "\t$n";
            }
            elsif ($label eq 'ERR') {
                # Explicit sprintf replaces the removed $# output format
                print "\t\t$base:\t$n ($total)\t",
                    sprintf("%.2f", $n / $total * 100), "\n";
            }
        }
        print "\n" if ($full);
    }
}

countstat("Article URLs", \%refcount, $ARTICLE_POPULATION);
countstat("URLs", \%urlcount, 0);

# update_stats()
#
# Account for the record described by the shared parser state ($gurl,
# $result, $year, $source, $ref, $inftp, $final_url): bump the per-reference
# and per-URL counters and every %count bucket.  Takes no arguments and
# returns nothing.  A URL that cannot be parsed is reported on STDERR and is
# counted only in %refcount and %urlcount.
sub update_stats {
    print STDERR "No result for $url\n" unless defined($result);
    $refcount{$ref}++;
    $urlcount{$gurl}++;

    my ($proto, $prdom, $port, $path);
    if ($inftp) {
        # FTP URLs carry no explicit port in the log
        if (!( ($proto, $prdom, $path) = ($gurl =~ m/^([^:]+):\/\/[^:]*?\.(\w+)(\/.*)/) )) {
            print STDERR "Unable to parse ftp URL [$url]\n";
            return;
        }
        $port = 0;
    }
    else {
        if (!( ($proto, $prdom, $port, $path) = ($gurl =~ m/^([^:]+):\/\/[^:]*?\.(\w+)\:(\d+)(\/.*)/) )) {
            print STDERR "Unable to parse [$gurl]\n";
            return;
        }
    }

    my @dirs = split(/\//, $path);
    my $depth = $#dirs;

    # $final_url is only set by HTTP responses; for an FTP record it would
    # still hold the URL of some earlier HTTP record, so use $gurl instead.
    my $target = $inftp ? $gurl : $final_url;
    my $isfile = ($target =~ m/[^\/]$/) ? 'Y' : 'N';
    my $isdotfile = ($dirs[$#dirs] =~ m/\./) ? 'Y' : 'N';
    my $isuser = ($target =~ m/\/\~/) ? 'Y' : 'N';

    if ($debug && $inftp) {
        print "URL = $gurl Proto=$proto Domain=$prdom Port=$port Path=$path Depth=$depth Result=$result ";
    }

    $errcount{$result}++;

    # Map the raw code to the label used for the selected reporting mode
    my $category = $result;
    if ($full == 1) {
        if ($result == 200) {
            $category = ' OK';
        }
        elsif ($result == 504 || ($result == 901 && $word{$result} =~ /Connection/)) {
            $category = 'NET';
        }
        else {
            $category = 'PTH';
        }
    }
    elsif ($full == 0) {
        $category = 'ERR' unless ($result == 200);
    }

    $count{year}{$year}{$category}++;
    $count{source}{$source}{$category}++;
    $count{proto}{$proto}{$category}++;
    $count{prdom}{$prdom}{$category}++;
    $count{port}{$port}{$category}++;
    $count{depth}{$depth}{$category}++;
    $count{isfile}{$isfile}{$category}++;
    $count{isdotfile}{$isdotfile}{$category}++;
    $count{isuser}{$isuser}{$category}++;
    return;
}

# countstat($ref, $count, $population)
#
# Print descriptive statistics for the values of the hash referenced by
# $count, labelled with $ref.
#
#   $ref         label printed in front of the figures
#   $count       reference to a hash of key => numeric value
#   $population  when larger than the number of hash entries, the missing
#                members are treated as additional zero values for N, mean,
#                median and mode; 0 (or anything not larger than the number
#                of entries) means "use the hash entries only"
#
# min and max are always those of the values actually present in the hash.
# With no values and no population the figures are printed empty instead of
# dying on a division by zero.  Returns nothing.
sub countstat {
    my ($ref, $count, $population) = @_;

    my @vals = sort { $a <=> $b } values %{$count};
    my $sum = 0;
    my %count2;         # value => number of times it occurs
    for my $v (@vals) {
        $sum += $v;
        $count2{$v}++;
    }

    my $observed = scalar @vals;
    my $n = ($population && $population > $observed) ? $population : $observed;
    my $virtual = $n - $observed;       # zero-valued members not in the hash
    $count2{0} += $virtual if ($virtual > 0);

    # The full sorted sample is $virtual zeros followed by @vals
    my $median;
    if ($n > 0) {
        my $mid = int($n / 2);
        $median = ($mid < $virtual) ? 0 : $vals[$mid - $virtual];
    }

    my $min = $vals[0];
    my $max = $vals[$#vals];
    my $mean = $n ? $sum / $n : undef;
    my ($modeval) = sort { $b <=> $a } values %count2;

    print " $ref N=$n min=$min max=$max Mean=$mean Median=$median Modeval=$modeval ",
        join(",", sort { $b <=> $a } values %count2), "\n";

    for my $k (sort keys %{$count}) {
        if (${$count}{$k} == $max) {
            print "Max key=$k\n";
        }
    }
    for my $k (sort { $a <=> $b } keys %count2) {
        if ($count2{$k} == $modeval) {
            print "Mode=$k\n";
        }
    }
    return;
}