#!/usr/local/bin/perl5 -s # Name: mybin/http-sum Author: js-cgi@inwap.com 04-Nov-2001 # Purpose: Summarizes httpd access logs (NCSA common/combined log format) # I wrote this program before finding 'analog' and 'wwwstats' at Best.COM. # It does not do many of the things that those other programs do, but it does # what I want for "www.inwap.com". # # To use this program at Best.COM, first you must enable the log files. # Check to see if the file public_html/.logctl already exists. If it does, # then you should already have one or more public_html/httpd_access.* files. # If not, execute "echo 7z >public_html/.logctl" and wait until the first # log file is created (sometime between 3am and 4am Pacific time). # # Once the log files are available, you can execute this command daily: # bin/http-sum public_html/httpd_access.* # One way of doing this is by using a crontab of "30 07 * * * bin/http-daily". # See also the 'http-log' program in the http://www.inwap.com/mybin/ directory. # # (The above command examples assume that you have a directory called "bin" # in your home directory, and that the programs are stored there. Don't # forget to "chmod +x bin/http-*".) # # Another user at best.com has a list of other log analyzer programs at # http://www.best.com/~mentorms/eureka_i.htm in the section titled # "Secrets of Searching the Web and Promoting your Website". 
# ---------------------------------------------------------------------------
# Main program, part 1: switch handling, configuration file, per-file loop.
#
# Switches are set by "perl -s" (see the #! line):
#   -b / -bandwidth   show bandwidth usage
#   -r / -refs        list referers
#   -h / -hosts       summarize hosts
#   -u / -useragent   summarize browser versions
#   -d / -debug       print one debug line per log entry
# When none of -b -r -h -u is given, all four reports are produced.
# ---------------------------------------------------------------------------
die "Usage: $0 [-b] [-r] [-h] [-u] httpd_access.*\n" if ($ARGV[0] eq "");
$u=$b=$r=$h=1 if !defined($b) && !defined($r) && !defined($h) && !defined($u);
$do_refs  = 1 if $r || $refs;           # Show links referring to this site
$do_hosts = 1 if $h || $hosts;          # Summarize host info
$do_band  = 1 if $b || $bandwidth;      # Show bandwidth usage every 2 hours
$do_ua    = 1 if $u || $useragent;      # Summarize browser versions
$debug    = 1 if $d || $debug;

$output_fil = "http-sum.fil";           # All files, sorted by bytes sent
$output_fnf = "http-sum.fnf";           # Error 404 = File Not Found
$output_ref = "http-sum.ref";           # Back-list of HTTP-REFERER
$output_ver = "http-sum.ver";           # List of USER-AGENT (browser) versions
$ENV{'PATH'} = "/usr/local/bin:$ENV{'PATH'}";   # To find zcat for *.gz

# The configuration file is "<program>.ini" in the current directory if it
# exists there, otherwise "$0.ini" next to the program itself.
$_ = $0; s%.*/%%; s/\.pl$//;            # Name of program minus the .pl extension
$conf_file = -f "$_.ini" ? "$_.ini" : "$0.ini";
%protocol_versions = ();
$login = "";
if (open(IN,$conf_file)) {              # List of things to exclude
    print "Reading $conf_file\n"; $| = 1;
    while (<IN>) {                      # One "key value" pair per line
        s/#.*//;                        # Remove comments
        ($key,$val) = split;
        next if $key eq "";
        print "\t",$_ if /site_url|no_refer|login/;
        $no_count{$val} = 1 if $key eq "no_count";   # Exclude hits from these hosts
        push(@site_url,$val) if $key eq "site_url";  # Exclude links between pages
        push(@no_refer,$val) if $key eq "no_refer";  # Exclude certain referers
        $login = $val if $key eq "login";            # To override $ENV{'LOGNAME'}
    }
    close(IN);                          # IN is reused below for the log files
    print "\n";
} else {
    print "Could not read $conf_file: $!\n";
}
if ($login eq "") {     # If "login" not specified, get it from the site_url
    foreach $_ (@site_url) { m%/~(\w+)% && ($login = $1); }
}
$login = $ENV{'LOGNAME'} unless $login; # For http://server.com/~login/

$hits = $firstdate = 0;
&bandwidth("START") if $do_band;
foreach $infile (@ARGV) {
    # A file that cannot be opened is reported and skipped, so that
    # processfile never reads from a stale or closed IN handle.
    if ($infile =~ /\.gz$/) {
        unless (open(IN,"zcat $infile|")) {
            warn("\nCannot open pipe 'zcat $infile':$!\n");
            next;
        }
    } else {
        unless (open(IN,$infile)) {
            warn("\nCannot read $infile:$!\n");
            next;
        }
    }
    &processfile;
}
&bandwidth("END") if $do_band;          # Output the last line of
# bandwidth usage

# ---------------------------------------------------------------------------
# Main program, part 2: write the "file not found" list, print the summary.
# ---------------------------------------------------------------------------
if (open(FNF,">$output_fnf")) {
    print FNF @FNF; close(FNF);
    @FNF = ();                          # Entries are in date order
    # Re-sort the file on its second field onward.  "-k 2" is the current
    # spelling of the obsolete "+1" key syntax, which newer sort(1)
    # implementations reject.  List-form system() avoids the shell.
    system("sort","-k","2","-o",$output_fnf,$output_fnf) if -f "/usr/bin/sort";
} else {
    warn("Could not create $output_fnf: $!\n");
}
&summarize;
exit;

# processfile: read every line from the already-open IN handle and update the
# global tallies ($hits, %verb, %protocol_versions, %refs, %useragent, %hosts,
# %errs, %ext_hits, %ext_files, %ext_bytes, %files, %bytes, $bytes, @FNF,
# %err_name, %err_date, $firstdate, $lastdate).  Takes no arguments; the
# caller sets $infile and opens IN.  IN is closed on return.
#
# Typical input line:
# uu.net /ftp - [12/Apr/1995:06:36:33 -0700] "GET / HTTP/1.0" 200 2512 "R" "U"
sub processfile {
    ($_ = $infile) =~ s%.*/%%;          # Strip off directory name
    # print STDERR "$_: (",-s $infile,")\n";
    while (<IN>) {
        chomp;
        tr%\\%/%;               # Version 4.02 of Sambar server used backslash
        $hits++;
        $line = $_;
        $host = $status = $size = $file = $referer = $useragent = "";
        if (/(.*) -> (.*)/) {                           # If reading referer_log
            ($referer,$file) = ($1,$2);
        } elsif (/^(\S+)\s+"(.*?)" "(.*)"$/) {          #  "       "
            ($file,$referer,$useragent) = ($1,$2,$3);
        } elsif (/^(\S+)\s+(\S+)$/) {                   #  "       "
            ($file,$referer) = ($1,$2);
        } else {                                        # Reading access_log
            ($host,$user,$auth,$timestamp,$command,$status,$size) =
                /(\S+) (\S+) (\S+) \[(.*)\] "(.*?)" (\d+) (.*)/;
            # Logs created after 1-Apr-96 have HTTP_REFERER and HTTP_USER_AGENT
            $size =~ /(\d+) "([^"]*)" "(.*)"/ &&
                (($size,$referer,$useragent) = ($1,$2,$3));
            $user .= $auth if $^W;                      # Keep "perl -w" happy
            ($verb,$file,$vers) = $command =~ /^(\S+)\s+(.*)\s+(HTTP.*)/i;
            ($verb,$file,$vers) = ($command,"-","-") unless $file;  # For "-"
            $verb{$verb}++;             # Count number of GET and HEAD requests
            $protocol_versions{$vers}++;
        }
        $file =~ s/%7e/~/ig;            # '%7E' and '%7e' are aliases for '~'
        $file =~ s/\s+/+/g;             # Browser should not send blank in URI
        $file =~ s/#.*//;               # Browser should not send "#fragment"
        $file = "/" if $file eq "";     # If requesting top-level index
        $file .= "index.html" if $file =~ m%/$%;    # If no name after slash
        ($fil = $file) =~ s%/index\.s?html?%/%;     # Strip "index.html" for %refs
        $fil .= "#Redirect#" if $status eq "302";
        $referer =~ s%:80/%/%;          # Remove default port number for http
        $referer =~ tr/"/'/;            # Infoseek
        $referer =~ s/%(..)/pack("C", hex($1))/eg;  # Change %7E to ~
        $refs{$fil}{$referer}++ if $do_refs && $referer =~ /:/;  # Skip "-"
        $useragent{$useragent}++ if $do_ua;
        $datetime = substr($timestamp,0,17);    # dd/Mon/yyyy:hh:mm (:ss -zone)
        $firstdate = $datetime unless $firstdate;
        $lastdate = $datetime if $datetime =~ /\d:\d\d:\d\d/;
        $host =~ tr/A-Z/a-z/;
        # $host = nslookup($host) if $host =~ /^[\d.]+$/;
        print "host=$host time='$timestamp' cmd='$command' stat=$status size=$size\n"
            if $debug;
        next if $no_count{$host};       # Host excluded by "no_count" in .ini
        $hosts{$host}++ if $do_hosts;
        &bandwidth($timestamp,$size) if $do_band;   # Update bandwidth usage
        $errs{$status}++;
        # OK status: 200=Successful, 206=Partial 302=Redirect, 304=Use_cache
        if ($status >= 200 && $status <= 399) {
            $file =~ s%^/~$login/%/%o;  # Change /~inwap/ to /
            $file = "$status:$file" if $status != 200 && $status != 304;
            @parts = split('\.',$file);
            $ext = pop(@parts);
            $ext = '""' unless @parts;
            $ext = "cgi" if $file =~ m%\.cgi(\?|/)%;    # test.cgi/extra/arguments
            $ext =~ s/\?[&,0-9]*$//;    # imagemap or webcam time-of-day
            $ext = substr($ext,-8) if length($ext) > 8; # Files with no "."
            $ext_hits{$ext}++;          # Count number of *.html, *.gif, *.map
            $ext_files{$ext}++ unless $files{$file}++;  # Files with same ext
            $ext_bytes{$ext} += $size;  # Bytes sent for *.html, *.gif
            $bytes{$file} += $size;
            $bytes += $size;            # Total bytes sent
        } elsif ($status == 404) {      # 404 = file not found
            push(@FNF,sprintf("%s %-25s %s\n",substr($datetime,0,6),$file,$referer));
        } else {
            $err_name{"$status $file"}++;
            $err_date{"$status $file"} = $datetime;     # Last occurrence
        }
    }                                   # End of input
    close(IN);                          # Also reaps the zcat child for *.gz
}

#use Socket;
#sub nslookup ($) {             # Return name assigned to an IP address
#  my $num = shift;
#  return $num2name{$num} if exists $num2name{$num};
#  my $iaddr = inet_aton($num); # Convert to 32-bit IPv4 address
#  $num2name{$num} = lc(gethostbyaddr($iaddr, AF_INET)) || $num;
#}

# commas: return the integer part of the argument with a comma inserted
# every three digits, e.g. 1234567 -> "1,234,567".
sub commas {
    local($_) = int($_[0]);
    1 while s/(.*\d)(\d\d\d)/$1,$2/;    # Repeat until no 4-digit run is left
    $_;
}

# FIXME(review): the text of this file is damaged at this point.  Everything
# between 'print <' and '= 26;' has been lost -- it looks like the
# here-document printed for "START", the remainder of bandwidth(), and the
# opening of a following time-conversion sub (its name, its argument parsing,
# and the definitions of %monthnum and $timelocal are not visible).  The
# surviving tokens are kept verbatim below; restore this span from a pristine
# copy of the script rather than guessing at it.
sub bandwidth {
    local($timestamp,$bytes) = @_;
    if ($timestamp eq "START") {
        print <= 26;
    $mon = $monthnum{$month} || 0;
    $off = "-0400" if $off eq "-0004";  # Bug in iServer config
    ($off_hr,$off_min) = $off =~ /(\d\d)(\d\d)/;
    $off = $off =~ /^-/ ? -1 : 1;       # Keep only the sign of the zone offset
    $off = $off * ($off_hr * 3600 + $off_min * 60);     # Offset in seconds
    if ($timelocal) {                   # Translate EST to GMT to PST on Unix
        $time = &timegm($sec,$min,$hour,$day,$mon,$yyyy-1900);
        $time -= $off;                  # Set to true GMT time
        ($sec,$min,$hour,$day,$mon,$year,$wday) = localtime($time);
        $yyyy = $year + 1900;
    } else {                            # Use time and date from log file
        $wday = 7;                      # Default to three blanks on Win95
        foreach $_ (-1 .. 90) {         # Use trial and error to find it
            @lt = localtime($^T - $_ * 24 * 60 * 60);
            (($wday = $lt[6]),last) if $lt[3] == $day && $lt[4] == $mon;
        }
    }
    ($sec,$min,$hour,$day,$mon,$year,$wday);    # Return adjusted time & date
}

# summarize: print all reports to STDOUT from the global tallies built by
# processfile, and write the $output_fil, $output_ref and $output_ver files.
# Takes no arguments.
sub summarize {
    print "\n$hits hits from $firstdate to $lastdate\n\n";
    &sum_hosts if $do_hosts;

    # Summary by file extension (*.html, *.gif) ###########################
    $format1 = "%7s %10s %5s %15s\n";
    @ext = ();                  # Summary by file type (*.html vs *.jpg)
    $filehits = $files = 0;
    foreach (keys %ext_hits) {
        push(@ext,sprintf($format1,
            $ext_hits{$_},$_,$ext_files{$_},&commas($ext_bytes{$_})));
        $files += $ext_files{$_};
        $filehits += $ext_hits{$_};
    }
    # The hit count is right-justified in a fixed-width column, so a plain
    # string sort orders the lines by number of hits.
    @ext = reverse(sort(@ext));
    unshift(@ext,sprintf($format1,"hits","file_type","files","bytes_sent"));
    push(@ext,sprintf($format1," Total",$filehits,$files,&commas($bytes)));
    print @ext,"\n";

    # Create a list of files that have been transferred #########################
    if ($filehits > 0 && open(OUT,">$output_fil")) {
        $format3 = "%15s %7s %8s %s\n"; # Bytes sent, Hits, Size, Name
        printf OUT $format3,&commas($bytes),$filehits,"-","$firstdate to $lastdate\n";
        printf OUT $format3,"Bytes sent","Hits","Size","Filename";
        @temp = ();
        while (($nam,$cnt) = each %files) {
            $size = int($bytes{$nam}/$cnt);     # Average bytes per hit
            push(@temp,sprintf($format3,&commas($bytes{$nam}),$cnt,$size,$nam));
        }
        print OUT (reverse sort @temp); # Most popular file first
        close(OUT);
        $format4 = "%7s %15s %s\n";     # Hits, Bytes sent, Filename
        @temp = ();
        while (($nam,$cnt) = each %files) {
            push(@temp,sprintf($format4,$cnt,&commas($bytes{$nam}),$nam));
        }
        @temp = (reverse sort @temp);
        $#temp = 49 if $#temp > 49;     # Truncate the array
        print "\nTop 50 files, based on number of hits\n";
        printf $format4,"Hits","Bytes sent","Filename";
        print @temp;
        @temp = ();
    } else {
        warn "Can't create $output_fil: $!\n" if $filehits > 0;
    }

    # Command and Error summary ###############################################
    print "\n\nCommand summary:\n";
    foreach (sort keys %verb) { printf "%7s %s\n",$verb{$_},$_; }; print "\n";
    &lookup_status_codes;               # Fills in %status
    print "\nError summary:\n";
    foreach (sort keys %errs) { printf "%7s %s: %s\n",$errs{$_},$_,$status{$_}; };
    print "\n";
    print "\n\nHTTP protocol versions used:\n";
    # NOTE(review): entries are printed with no separator between them; the
    # original spacing of this format string may have been lost -- confirm.
    foreach (sort keys %protocol_versions) {
        printf "%-7s %6d",$_,$protocol_versions{$_};
    };
    print "\n\n";

    # Create a list of files that have NOT been transferred #####################
    @temp = ();
    while (($key,$val) = each %err_name) {      # Files with errors
        push(@temp,sprintf("%7s %s %s\n",$val,$err_date{$key},$key));
    }
    print (reverse sort @temp);

    # Create a list of external pages referring to our pages ###################
    @ref_line = ();
    foreach $uri (keys %refs) {
        @refs = sort { $refs{$uri}{$b} <=> $refs{$uri}{$a} } keys %{$refs{$uri}};
        $count = 0; $line = "";
        foreach $ref (@refs) {          # Do largest counts first
            # site_url and no_refer entries from the .ini are used as patterns
            next if grep($ref =~ /$_/,@site_url,@no_refer);
            $cnt = $refs{$uri}{$ref};
            next if $cnt < 4;           # Ignore 3 or fewer references
            $count += $cnt;
            $line .= "\t$cnt:$ref";
        }
        push @ref_line,sprintf("%6d:%s%s\n",$count,$uri,$line) if $count;
    }
    unlink($output_ref);
    if (@ref_line && open(OUT,">$output_ref")) {
        @ref_line = reverse sort @ref_line;     # Put highest counts first
        $_ = $site_url[0]; s%/$%%;
        print OUT "$_\n$firstdate to $lastdate\n",@ref_line;
        close(OUT);
    } else {
        print (@ref_line ? "cannot write $output_ref: $!\n" : "no referer\n");
    }

    # Create a list of browser versions ###################
    if ($do_ua && scalar keys %useragent > 2) {
        while (($_,$count) = each %useragent) { # Reduce browser variants
            s/Mozilla.*compatible;/:/ && s/; / (/;      # MSIE
            s/ \[..\] / /;                              # [en] [fr] [es]
            s%(Mozilla/\d+\.\d+)Gold%$1%;       # Treat Gold same as nonGold
            s/Update \S+;\s+//;                 # Ignore minor updates
            s/ via .*proxy//;
            $browser{$_} += $count;             # Merge subversions together
            # Count Netscape versus MSIE
            if (/^(\D+\d+)/) {          # Stop at first period in version
                $ver = $1;
                $ver =~ s/: MSIE/MS Internet Explorer/;
                $ver =~ s/Mozilla/Netscape/;
                $ns_ms{$ver} += $count;
            }
        };
        undef %useragent;
        while (($_,$count) = each %ns_ms) {
            if (/\W*\d+$/) {
                $product = $`;          # Name of browser, minus version number
                $prod_ver{$product}++;  # Number of different versions
                $prod_count{$product} += $count;
            }
        }
        foreach $_ (keys %prod_ver) {   # Summarize multiple versions
            $ns_ms{$_."__"} = $prod_count{$_} if $prod_ver{$_} > 1;
        }
        print " List of USER_AGENT (browser) versions. (':' = 'Mozilla* compatible')\n";
        @browser = (); $total = 0;
        while (($key,$val) = each %browser) {
            push(@browser,sprintf("%6d %s\n",$val,$key));
            $total += $val;             # Count of hits with HTTP-REFERER
        }
        @browser = sort @browser;
        foreach $_ (sort keys %ns_ms) {
            printf("%8d %4.1f%% %s\n",$ns_ms{$_},$ns_ms{$_}*100/$total,$_);
        }
        print "\nPercent, cumulative percent, count for this version\n";
        # Remember whether the file could be opened so that nothing is
        # printed to an unopened handle; STDOUT output is unaffected.
        $ver_open = open(VER,">$output_ver");
        warn("Could not output to $output_ver: $!\n") unless $ver_open;
        $format = "%4.1f %3s %s";
        $_ = sprintf("%6d %s\n",$total,"hits that reported USER-AGENT");
        $_ = " % cum $_";
        print VER $_ if $ver_open;
        print $_;
        $i = $cum = 0;
        while($_ = pop(@browser)) {     # List most hits first
            ($count) = split(' ',$_);
            $cum += $count;
            $percent = (100 * $count) / $total;
            $cumper = int((100 * $cum) / $total + 0.5);
            $_ = sprintf($format,$percent,$cumper,$_);
            print VER $_ if $ver_open;
            print $_ if ++$i < 50 && $percent > 0.5;
        }
        close(VER) if $ver_open;
    }
}

# sum_hosts: print the top 50 top-level domains, second-level domains
# ("companies") and individual hosts, side by side, from %hosts.
# Fills the globals %toplevel and %level2.  Takes no arguments.
sub sum_hosts {
    # Look for top-level domains (au ca edu com gov jp mil mx net org uk us)
    while (($key,$val) = each %hosts) {
        if ($key =~ /[a-zA-Z]/) {
            $key =~ s/(.*)\.(\d+)\.(\d+)\.(\d+)\.in-addr\.arpa/$4.$3.$2.$1/i;
            # Lexicals are used here so that the sort variables $a/$b and
            # the -b/-d switch variables are not overwritten.
            my ($top,$second,$third) = reverse(split('\.',$key));
            $toplevel{substr($top,0,5)} += $val;  # Watch out for "unknown_host"
            $_ = "$second.$top";
            $_ = "$third.$_" if length($_) < 7; # demon.co.uk school.ac.uk egg.or.jp
            $level2{$_} += $val;        # This is typically the company name
        } else {
            my ($oct1,$oct2,$oct3) = split('\.',$key);
            my $class = $oct1 < 128 ? $oct1 : ($oct1 < 192 ? "-B-" : "-C-");
            $toplevel{$class} += $val;  # Class B = 128-191, class C = 192-223
            $level2{"$oct1.$oct2.$oct3"} += $val;
        }
    }
    # Create a list of all hosts who have accessed our Web pages
    $format2 = "%7s %s";        # No \n at end, up to 9,999,999 hits
    @toplevel = ();
    while (($key,$val) = each %toplevel) {
        push(@toplevel,sprintf($format2,$val,$key));
    }
    @toplevel = sort @toplevel; $toplevel = scalar(@toplevel);
    @level2 = ();
    while (($key,$val) = each %level2) {
        push(@level2,sprintf($format2,$val,$key));
    }
    @level2 = sort @level2; $level2 = scalar(@level2);
    @hosts = ();
    while (($key,$val) = each %hosts) {
        # Long host names keep only their last 31 characters
        push(@hosts,sprintf($format2,$val,
            length($key) > 34 ? "..." . substr($key,-31) : $key));
    }
    @hosts = sort @hosts; $hosts = scalar(@hosts);
    printf "%7s == %s ==\n",$hits,
        "total hits from $toplevel domains, $level2 companies, $hosts hosts";
    for $_ (1 .. 50) {          # Output top 50 in each category
        # The lists are sorted ascending, so pop() yields the largest first
        printf "%-13s %-23s %s\n",pop(@toplevel),pop(@level2),pop(@hosts);
        last if scalar(@toplevel)+scalar(@level2)+scalar(@hosts) == 0;
    }
}

#############################################################
# lookup_status_codes: fill the global %status with a description for each
# HTTP status code; used by the "Error summary" in summarize.
sub lookup_status_codes {
    %status = (    # http://www.w3.org/hypertext/WWW/Protocols/HTTP/HTRESP.html
        "000","Unknown error (timeout)",        # TCP connected, but no request came
        "100","Continue",                       #
        "101","Switching Protocols",            #
        "200","Success",                        # normal response to query
        "201","Created",                        # a FORM has created a new document
        "202","Accepted",                       # (code is defined but never used)
        "203","Non-Authoritative Info",         # From private cache
        "204","No response",                    # from FORM or ISMAP, no change
        "205","Reset Content",                  #
        "206","Partial Content",                # http/1.1 - resume interrupted file
        "300","Multiple Choices",               #
        "301","Moved Permanently",              # permanent change
        "302","Redirect; Directory URL needs slash or .htaccess redirection",
        "303","See Other",                      # ?
        "304","Not Modified; use cache",        # use local copy (If-Modified-Since)
        "305","Use Proxy",                      #
        "400","Bad Request",                    # illegal syntax
        "401","Unauthorized",                   # authorization password not provided
        "402","Payment Required",               # "ChargeTo:" header is missing
        "403","Forbidden",                      # file is protected from reading
        "404","Not found",                      # no such file or directory
        "405","Method Not Allowed",             #
        "406","None Acceptable",                #
        "407","Proxy Auth Required",            #
        "408","Request Timeout",                #
        "409","Conflict",                       #
        "410","Gone",                           #
        "411","Length Required",                #
        "412","Unless True",                    #
        "499","Download aborted from user's end",       # Sambar server only
        "500","Internal Server error or limit exceeded",
        "501","Not implemented",                # facility not supported
        "502","Bad Gateway",                    #
        "503","Service Unavailable",            #
        "504","Gateway Timeout",                #
    );
}