Nathan Pralle - www.nathanpralle.com
Kickass Phone Rates from 3U
PLACES TO GO:  
Back to Software Print Version
UNIQUE VISITORS

Scripts.com Freshmeat The Perl Archive HotScripts

#!/usr/bin/perl -w
use strict;
use Socket;

####################################################################
# UNIQUE VISITORS - A very simple utility to parse Apache log files
# and print a listing of unique visiting domains.
# Copyleft 2005 - Nathan E. Pralle
#
# DESCRIPTION: This utility parses an Apache log file and determines
#              how many unique domains visited your site, then compiles
#              a list of them (good for directing to an email or logfile).
#              It uses a cache file to speed up the process as well (not
#              having to do DNS lookups all the time).  Hosts are grouped
#              by their parent domain, e.g. blah.mchsi.com and dink.mchsi.com
#              are both counted under mchsi.com (shown as 2 hits for mchsi.com).
#
# SYNTAX:      perl unique_visitors.pl <days>
#              Where <days> is the number of days back you want to look.
#              I usually run mine at 12:30am and use a '1' to get the previous day.
#
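# EXTRA:       You must have a file available called "unique_domains.dat".
#              The script opens it by bare file name, i.e. relative to the
#              current working directory, so run the script from the
#              directory that holds the file (e.g. 'cd' there first in cron).
#              The file must be readable and writable by the user running
#              the script.  This is the DNS cache file.  I recommend:
#              'touch unique_domains.dat;chmod 777 unique_domains.dat'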
#
# CONTACT:     Questions, comments, etc.
#              http://www.nathanpralle.com/contact.html
####################################################################

########################
# CONFIGURATION OPTIONS

#full path to your Apache access_log file
my $logfile="/path/to/access_log";
#######################

# Number of days back to report on (first command-line argument).
# Defaults to 0, i.e. today.
my $offset=shift||0;
if($offset!~/\A\d+\z/){
	die("usage: perl unique_visitors.pl <days>  (days must be a whole number, got '$offset')\n");
}

# The date being reported on: its pieces are used in the report header,
# and the dd/Mon/yyyy form is what each access_log timestamp is matched against.
my($day,$shortmonth,$year)=target_date($offset,time);
my $formatted="$day/$shortmonth/$year";

# target_date(DAYS_BACK, NOW)
#   DAYS_BACK - whole number of days to go back from NOW
#   NOW       - epoch seconds to count back from
# Returns a three-element list: zero-padded two-digit day of month,
# three-letter month abbreviation, and four-digit year, all in local time.
# Going through epoch seconds lets localtime() handle month lengths,
# leap years and the December/January year rollover.
sub target_date{
	my $days_back=shift;
	my $now=shift;
	my @months=qw/Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec/;
	my @today=localtime($now);
	# Anchor at (roughly) local noon of the current day before stepping back
	# in 24-hour units: a one-hour daylight-saving shift then cannot push the
	# result across midnight into the wrong day, even when run just after 00:00.
	my $noon=$now-($today[2]*3600+$today[1]*60+$today[0])+12*3600;
	my @then=localtime($noon-$days_back*86400);
	return(sprintf("%02d",$then[3]),$months[$then[4]],$then[5]+1900);
}

# ---- Load the DNS cache, scan the log, and tally hits per domain ----
my %iparray;        # domain (or bare address) => number of matching log lines
my $counter=0;      # total log lines found for the target date
my $hitcounter=0;   # how many of those lines were answered from the cache
my %domainlookups;  # client address => hostname (or the address itself if unresolvable)

# Cache file format: one "address<TAB>hostname" pair per line.  A missing or
# unreadable cache is not fatal; every address is simply looked up again.
if(open(my $dom,'<','unique_domains.dat')){
	while(my $entry=<$dom>){
		chomp $entry;
		my($ip,$domain)=split(/\t/,$entry);
		# skip blank or truncated lines instead of caching undef values
		next unless defined $domain && length $ip;
		$domainlookups{$ip}=$domain;
	}
	close($dom);
}
else{
	warn("can't read unique_domains.dat ($!); starting with an empty cache\n");
}

open(my $log,'<',$logfile)||die("can't open logfile $logfile: $!\n");
# Apache writes the timestamp as [dd/Mon/yyyy:hh:mm:ss zone].  Anchoring on
# the opening bracket and the colon keeps the date from matching text that
# merely appears inside a request URL or referrer.
my $datematch=qr/\[\Q$formatted\E:/;
# Read line by line so a large log never has to fit in memory.
while(my $line=<$log>){
	next unless $line=~$datematch;
	$counter++;
	# the client address is the first space-separated field of the line
	my($client)=split(/ /,$line);
	my $host=$domainlookups{$client};
	if(defined $host){
		$hitcounter++;
	}
	else{
		# inet_aton yields undef for anything it cannot turn into an IPv4
		# address (e.g. an IPv6 client); skip the reverse lookup in that case.
		my $iaddr=inet_aton($client);
		$host=gethostbyaddr($iaddr,AF_INET) if defined $iaddr;
	}
	my $domainstring;
	if($host){
		$domainlookups{$client}=$host;
		$domainstring=domain_of($host);
	}
	else{
		# unresolvable: report the raw address and cache it as its own
		# "hostname" so it is not looked up again next run
		$domainstring=$client;
		$domainlookups{$client}=$client;
	}
	$iparray{$domainstring}++;
}
close($log);

# domain_of(HOST)
# Reduces a hostname to the domain it is reported under:
#   - a name ending in a digit is treated as a bare address and returned as is;
#   - if the last label is shorter than 3 characters (e.g. a country code)
#     the last three labels are kept, otherwise the last two;
#   - a name with fewer labels than that is returned whole, rather than
#     wrapping around and repeating labels.
sub domain_of{
	my $host=shift;
	return $host if $host=~/[0-9]$/;
	my @labels=split(/\./,$host);
	return $host unless @labels;
	my $keep=length($labels[-1])<3 ? 3 : 2;
	$keep=scalar(@labels) if $keep>scalar(@labels);
	return join('.',@labels[-$keep..-1]);
}
# Write the DNS cache back out (one "address<TAB>hostname" pair per line) so
# the next run can skip lookups for every address seen so far.
open(my $cache,'>','unique_domains.dat')||die("Can't open datafile for writing: $!\n");
foreach my $keyitem (sort keys %domainlookups){
	# entries without a hostname carry no information; don't write them
	next unless defined $domainlookups{$keyitem};
	print $cache "$keyitem\t$domainlookups{$keyitem}\n";
}
# Buffered write errors (e.g. a full disk) only surface at close, so check it.
close($cache)||die("Can't finish writing datafile: $!\n");

# Report: a header line with the total number of matching log lines, then one
# line per domain with its zero-padded hit count, smallest count first.
print "Visitors on $shortmonth $day, $year:  $counter ($hitcounter cached hits)\n\n";
foreach my $key (sort hashValueAscendingNum (keys(%iparray))){
	print pad($iparray{$key},4)."    $key\n";
}
exit;

# pad(VALUE, WIDTH)
# Returns VALUE with "0" characters prepended until it is at least WIDTH
# characters long.  A VALUE that is already that long is returned unchanged.
sub pad{
	my($value,$width)=@_;
	$value="0".$value until length($value)>=$width;
	return $value;
}

		
# Comparator for sort(): orders two keys of %iparray (supplied by sort in
# $a and $b) numerically by their stored counts, smallest first.
sub hashValueAscendingNum{
	my($left,$right)=@iparray{$a,$b};
	return $left<=>$right;
}


This site and all content (C)2002-2008 Nathan E. Pralle (www.nathanpralle.com).