#!/usr/local/bin/perl5
# Name: mybin/http-ref   Author: js-cgi@inwap.com   17-Nov-2001
# Purpose: Creates http-sum.htm from http-sum.ref (written by http-sum).
# Uses canonical name of referring host, condensing entries if needed.
# See the "http-*" utilities at http://www.inwap.com/mybin/ .
#
# Usage: http-ref [-d] http-sum.ref public_html/referer.html
#   -d   print host name resolution details to STDERR
#
# Input format (as consumed below): line 1 is the title, line 2 is the date
# range, and every following line holds tab-separated "count:text" items.
# The first item is the local page; the remaining items are referring URLs.
#
# NOTE(review): the copy of this script that was reviewed had lost its HTML
# markup, the "<<EOM" heredoc opener and the "<IN>" readline.  The tags
# emitted below are a reconstruction of what the surviving text implies --
# confirm them against a pristine copy before relying on the exact output.

use strict;
use warnings;
use Socket;

$|++;                           # Unbuffer the currently selected handle

my $usage    = "Usage: $0 http-sum.ref public_html/referer.html\n";
my $debug    = (@ARGV && $ARGV[0] eq "-d") ? shift : "";
my $in_file  = shift || die $usage;
my $out_file = shift || die $usage;

# Maps every host name, alias and dotted-quad address seen so far to the
# canonical name chosen for it.
my %canonical = ();

# c_host: Returns the canonical name of a host.
# If "guiar.com" and "www.guiar.com" are two names for the same IP address,
# the two are combined under whichever of the names was seen first.
#   Argument: host name as it appeared in the referring URL.
#   Returns:  canonical name, or the lower-cased argument if the name cannot
#             be resolved (so that dead hosts stay distinct in the report).
sub c_host {
    my $host = shift;
    return $canonical{$host} if exists $canonical{$host};

    print STDERR " $host = " if $debug;
    my ($cname, $aliases, undef, undef, @iaddr) = gethostbyname($host);

    if (!defined $cname) {
        # Lookup failed.  Returning undef here used to erase the host from
        # the rebuilt URL; keep the name we were given instead.
        print STDERR "(unresolved)\n" if $debug;
        return $canonical{$host} = lc $host;
    }
    $aliases = "" unless defined $aliases;

    foreach my $packed (@iaddr) {
        my $quad = inet_ntoa($packed);  # Addr in nnn.nnn.nnn.nnn form
        if (exists $canonical{$quad} and $canonical{$quad} ne $cname) {
            # Watch out for "www.guiar.com" and "guiar.com", which are not
            # listed as CNAMEs for each other, yet have the same IP address.
            # (But "virtualhost1.tripod.com" and "virtualhost2.tripod.com"
            # are in fact different, even though they share IP addresses.)
            my $old = $canonical{$quad};
            if ($old eq "www.$cname" or $cname eq "www.$old") {
                $cname = $old;          # Use previously seen name
                print STDERR "(IP matches $old) " if $debug;
            }
        }
        $canonical{$quad} = $cname;
        print STDERR "[$quad] " if $debug;
    }

    foreach my $name ($host, $cname, split / /, $aliases) {
        print STDERR "$name "
            if $debug && !exists $canonical{$name} && $name ne $host;
        $canonical{$name} = $cname;
    }
    print STDERR "\n" if $debug;
    return $canonical{$host};
}

# html_escape: Returns its argument with the HTML-special characters
# & < > " replaced by entities.  Referer URLs and requested paths come from
# remote clients, so they must not reach the page verbatim.
sub html_escape {
    my $text = shift;
    $text = "" unless defined $text;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
}

# URI of "/" relative to the output document, derived from how deep the
# output file sits below public_html.
my $prefix = "";
if ($out_file =~ m%public_html/(.*)%) {
    my @dirs = split '/', $1;
    pop @dirs;                  # Last component is "referer.html"
    $prefix = "../" x @dirs;
}

open my $in_fh, '<', $in_file
    or die "Cannot open input file $in_file: $!\n";
my @lines = <$in_fh>;
close $in_fh;
chomp @lines;

my $title = shift @lines;       # First line is the title (or the site name)
my $dates = shift @lines;       # Second line has start date to end date
$title = "" unless defined $title;
$dates = "" unless defined $dates;
$title = "Other sites that refer to $title" unless $title =~ / /;
print STDERR "$title - $dates\n";

my @out = (<<EOM);
<html><head><title>$title</title></head>
<body>
<h1>$title</h1>

<p>$dates</p>

EOM

foreach my $line (@lines) {
    # Each line has two or more items separated by tabs.
    # Each item consists of a count, a colon, and some text.
    my ($count_page, @refs) = split /\t/, $line;
    next unless defined $count_page;    # Blank line

    my ($count, $page) = split /:/, $count_page, 2;
    $page = "" unless defined $page;
    my $redir = ($page =~ s/#Redirect#//) ? "#Redirect#" : "";

    # NOTE(review): $prefix was computed but unused in the reviewed copy, so
    # it is assumed to have lived in a stripped href -- confirm.
    (my $relative = $page) =~ s{^/+}{};
    my $page_href = $prefix . $relative;
    $page_href = "./" if $page_href eq "";
    $page_href = html_escape($page_href);
    my $page_text = html_escape($page);
    push @out,
        qq'\n<p>$count &nbsp; <a href="$page_href">$page_text</a>$redir &nbsp;\n';

    # Some browsers do not convert hostnames to lowercase in the REFERER
    # field.  If http-sum found 5 references from "www.yahoo.com" and 3 from
    # "WWW.YAHOO.COM", we need to combine them to 8 references from one host.
    my %cnt = ();
    foreach my $ref (@refs) {
        my ($n, $url) = split /:/, $ref, 2;
        next unless defined $url && length $url;

        # Split into "scheme://", host and the rest.  A referer with no
        # "//" or no host is counted under its original text, not merged
        # with every other unparsable entry.
        if ($url =~ m%(.*?//)([^/]*)(.*)%) {
            my ($proto, $host, $uri) = ($1, $2, $3);
            if (length $host) {
                $host =~ s/:80$//;      # Remove default port number
                $host = c_host($host);  # Check for non-canonical name usage
                $url  = "$proto$host$uri";
            }
        }
        $cnt{$url} += $n;               # Accumulate count for canonical name
    }

    # Descending by count; ties broken by URL so the output is reproducible.
    foreach my $url (sort { $cnt{$b} <=> $cnt{$a} or $a cmp $b } keys %cnt) {
        my $href = html_escape($url);
        push @out, qq'\t<a href="$href">$cnt{$url}</a>\n';
    }
}

open my $out_fh, '>', $out_file or die "Cannot write to $out_file: $!\n";
print {$out_fh} @out or warn "Problems writing $out_file: $!\n";
close $out_fh        or warn "Problems closing $out_file: $!\n";

# Show the result.  List-form pipe open keeps the file name away from the
# shell (backticks would interpret spaces and metacharacters in it).
if (open my $ls_fh, '-|', 'ls', '-l', $out_file) {
    print STDERR <$ls_fh>;
    close $ls_fh;
}
# All done