Newer
Older
Digital_Repository / Repositories / statistics / scripts / eprints-usage_src.php
<?php

// NJS 2005-12-09 Switched to GeoIP from GeoIP:IPfree.
// geoip.inc presumably supplies the geoip_* functions and GEOIP_STANDARD
// constant used in this script (none of them are defined in this file).
include("geoip.inc");

// Open the GeoIP database once; the resulting handle ($gi) is reused for
// every country lookup made while processing the logs.
// NOTE(review): "##GEOIP_DATABASE##" looks like a placeholder that is
// replaced with the real database path at install time -- confirm.
$gi = geoip_open("##GEOIP_DATABASE##",GEOIP_STANDARD);

	/*

	Apache log for ePrints uses this format:
	LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"" combined

	If the log format differs the regular expression matching would need to be adjusted.		
	
	Parse:
		ip
		date YYYY MM DD
		archive ID

	*/

// Web server log files
// $log_dir is prepended to each log file name as-is (no separator is
// inserted), so it needs to end with a directory separator.
// NOTE(review): the ##...## values look like install-time placeholders --
// confirm against the build/install script.
$log_dir = '##APACHE_LOG_LOCATION##';
// Maps archive name => log file name. The archive name is stored in the
// archive_name column of the view table and is also used as the name of the
// ePrints database that titles are looked up in.
$log_file = array(
	'otago_eprints' => '##APACHE_LOG_NAME##',
);


// eprintstats db
// Connection details for the statistics database this script writes to
// (tables "view" and "lastproc").
// NOTE(review): credentials are hard-coded in this source file; consider
// loading them from a file kept outside version control.
$sqlserver = 'localhost';
$sqluser = 'eprintstatspriv';
$sqlpass = 'AuldGrizzel';
$sqldatabase = 'eprintstats';

// SQL details of your ePrints installation
// Used only to read eprint titles. No database name is configured here: the
// database selected is the archive name (the keys of $log_file).
$sqlserver2 = 'localhost';
$sqluser2 = 'otago_eprints';
$sqlpass2 = 'DrSyntaxRidesAgain';

/* NJS 2005-12-16
IP address ranges for your local Intranet(s). You can have multiple
ranges of IP addresses, each with a different "country name", so that
they will appear as separate entries in the by country stats pages. Note
that all sets are assigned the country code "T5", so they will all use
the flag icon for your local installation. If this isn't what you want,
you'll have to hack this yourself :)

Each address range is keyed off the name that will appear in the
statistics database (the "country name"). Each entry in the range is
either a single IP address, or an array specifying a lower and upper
bound for a contiguous IP address range (see example below).

All IP addresses must be converted to long values using the ip2long()
function before being stored.

Note that address ranges may overlap. The script will use the first
range that matches a given IP, so list the ranges in the correct order
of precedence for your needs.

Example:

$local_IPs = array(
	'Repository Admin' => array(
		ip2long('192.168.1.5'),
		ip2long('192.168.1.22'),
		array(
			'lower' => ip2long('192.168.1.30'),
			'upper' => ip2long('192.168.1.35'),
		),
	),
	'Our Intranet' => array(
		array(
			'lower' => ip2long('192.168.1.0'),
			'upper' => ip2long('192.168.255.255'),
		),
	),
);

'Repository Admin' covers the IP addresses 192.168.1.5, 192.168.1.22 and
the range 192.168.1.30 to 192.168.1.35, inclusive. 'Our Intranet' covers
the range 192.168.1.0 to 192.168.255.255, inclusive. A machine will only
match the 'Our Intranet' range if it first fails to match the
'Repository Admin' range.
*/
// Local address sets, keyed by the "country name" recorded for them.
// They are consulted in the order they are added here; the first set that
// matches an address wins.
$local_IPs = array();

// Individual addresses, each stored as a long.
$local_IPs['Repository Admin'] = array(
	ip2long('139.80.75.110'),  // Nigel @ Uni
	ip2long('60.234.209.74'),  // Nigel @ home
	ip2long('139.80.92.138'),  // Monica & Jeremy
	ip2long('139.80.92.151'),  //   @ Uni
	ip2long('203.89.162.155'), // Monica @ home
	ip2long('139.80.81.50'),   // eprints.otago.ac.nz
);

// One contiguous range: 139.80.0.0 to 139.80.127.255, inclusive.
$local_IPs['Otago Intranet'] = array(
	array(
		'lower' => ip2long('139.80.0.0'),
		'upper' => ip2long('139.80.127.255'),
	),
);

###########################################
##
## No configuration required below here.
##
###########################################

// Connect to the statistics database. Nothing below can work without it, so
// stop immediately if either the connection or the database selection fails
// (previously a failed connection was only noticed indirectly).
$connect = mysql_pconnect ($sqlserver,$sqluser,$sqlpass);
if (!$connect) {
	die("Could not connect");
}
$db = mysql_select_db($sqldatabase,$connect) or die("Could not connect");

// First get the date of last update.
// $datetestA is that date as a timestamp; log entries dated earlier than it
// are skipped later on. It stays 0 (process everything) when no run has
// been recorded yet or the lookup query fails.
$datetestA = 0;
$query = "select lastproc from lastproc order by timeinsert desc limit 1";
$result = mysql_query($query,$connect);
// Only ask for a row count when the query actually produced a result set.
$num_rows = $result ? mysql_num_rows($result) : 0;
if ($num_rows > 0) {
	$row = mysql_fetch_assoc($result);
	$lastproc = $row["lastproc"];
	$datetestA = strtotime($lastproc);
}

// Connection to the ePrints database server; getePrintName() uses it to look
// up titles. Without it no view could be recorded, yet the last-processed
// date written after this loop would still advance, so stop here instead.
$connect2 = mysql_connect($sqlserver2,$sqluser2,$sqlpass2);
if (!$connect2) {
	die("Could not connect to ePrints database server");
}
$counter = 1;

/*
	Main pass: read every configured log file line by line and insert one
	row into the "view" table for each successful (HTTP 200) GET of an
	eprint abstract or document.
*/
foreach($log_file as $archivename=>$archivelog) {
	$logf = $log_dir . $archivelog;
	$archive_name = $archivename;
	$handle = fopen($logf, "r");
	if (!$handle) {
		// feof() never reports end-of-file for an invalid handle, so the
		// read loop below would never terminate. Skip this log instead.
		print "Could not open log file " . $logf . "\n";
		continue;
	}
	while (!feof($handle)) {
		$buffer = fgets($handle, 4096);
		if ($buffer === FALSE) {
			// End of file or read error: nothing more to get from this log.
			break;
		}
		// NJS 2005-11-25 Added regexp for EPrints short URLs.
		// In each pattern: $matches[1] = client address, $matches[2] = the
		// bracketed timestamp, $matches[3] = the eprint id.
		if	((preg_match("/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*/i",$buffer,$matches)) ||
			(preg_match("/^(\S{1,}\.\S{1,}\.\S{1,}\.\S{1,}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*/i",$buffer,$matches)) ||
			(preg_match("/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/(\d{1,4}).*? HTTP\/1..\" 200 .*/i",$buffer,$matches)))
		{
			$counter++;
			$country_code = '';
			$country_name = '';
			$insertid = '';
			$eprint_name = '';
			$view_type = '';
			$uniquebits = '';
			$ip = $matches[1];
			
			/* NJS 2005-12-16
				Determine country code and name.
				Check whether the IP number falls into any of the local
				intranet ranges. If so, then use that.
			*/
			$ip_long = ip2long($ip);
			$found_country = FALSE;
			foreach ($local_IPs as $name => $addresses)
			{
				foreach ($addresses as $ip_range)
				{
					if (is_array($ip_range)) // check against lower/upper bounds
					{
						$found_country = (($ip_long >= $ip_range['lower']) 
							&& ($ip_long <= $ip_range['upper']));
					}
					else if (is_long($ip_range)) // data type sanity check
					{
						$found_country = ($ip_long == $ip_range);
					}
					else // something is seriously broken, ignore this entry
					{
						print "Unsupported data type " . gettype($ip_range) .
							" (value " . $ip_range .
							") in \$local_IPs (expected long).\n";
						continue; 
					}
					
					// Stop at the first entry that matches. Leaving the
					// loop unconditionally here would mean only the first
					// entry of each set is ever tested.
					if ($found_country)
					{
						break;
					}
				}
				
				if ($found_country)
				{
					$country_code = 'T5';
					$country_name = $name;
					break;
				}
			}
			
			// Otherwise, fall back to GeoIP.
			if (!$found_country)
			{
				$country_code = geoip_country_code_by_addr($gi, $ip);
				$country_name = geoip_country_name_by_addr($gi, $ip);
			}
			// end NJS 2005-12-16
			
			$date = $matches[2];
			$archive = $matches[3];
			// The complete raw log line is stored in the uniquebits column.
			$uniquebits = $buffer;
			// Reduce the timestamp to its date part: drop everything from
			// the first ':' onwards, then turn the '/' separators into
			// spaces so strtotime() can parse it.
			$date = preg_replace("/:.*/","",$date);
			$date = preg_replace("/\//", " ", $date);
			$when = getdate(strtotime($date));
			$request_date = $when["year"]."-".$when["mon"]."-".$when["mday"];
			$datetestB = strtotime($request_date);
			// Skip entries dated before the last recorded processing date.
			if ($datetestB < $datetestA)
				continue;
			
			// NJS 2005-11-25 Added regexp for EPrints short URLs.
			// A request for <id>/<two digits>/... is a document download;
			// anything else that matched is counted as an abstract view.
			if(preg_match("/GET \/archive\/0{1,8}\d{1,4}\/\d\d\//i",$buffer) || preg_match("/GET \/\d{1,4}\/\d\d\//i",$buffer)) {
				$view_type = "download";
			} else {
				$view_type = "abstract";
			}
			// Titles are cached per eprint id so each one is looked up in
			// the ePrints database at most once per run.
			if(isset($eprintname[$archive])) {
				$eprint_name = $eprintname[$archive];
			} else {
				$eprint_name = getePrintName($archive_name,$archive);
				$eprintname[$archive] = $eprint_name;
			}
			if($eprint_name=='') {
				// Do nothing.
			} else {
				// Every string value is escaped before being placed in the
				// statement: the log line is untrusted input (referrer and
				// user agent are client-supplied) and some GeoIP country
				// names contain apostrophes. The id is forced to an integer.
				$query = "
				INSERT into view (uniquebits,archive_name,ip,request_date,archiveid,country_code,country_name,view_type,eprint_name)
				values('".mysql_real_escape_string($uniquebits,$connect)
					."','".mysql_real_escape_string($archive_name,$connect)
					."','".mysql_real_escape_string($ip,$connect)
					."','".mysql_real_escape_string($request_date,$connect)
					."',".((int) $archive)
					.",'".mysql_real_escape_string($country_code,$connect)
					."','".mysql_real_escape_string($country_name,$connect)
					."','".mysql_real_escape_string($view_type,$connect)
					."','".mysql_real_escape_string($eprint_name,$connect)."')";
				$result = mysql_query($query,$connect);
				$insertid = mysql_insert_id($connect);
			}

		} else {
			// print "NO match" . "\n";
		}
	}
	fclose($handle);
}

	/*
		Keep track of where we are. Should avoid duplication of results
		if the script is run more than once on the same log file
	*/

// $request_date holds the date of the last matching log line that was read.
// It is only set once at least one line has matched; with nothing matched
// there is no date to record, so no row is written.
if (isset($request_date)) {
	$query = "INSERT into lastproc (lastproc) values('".$request_date."')";
	$result = mysql_query($query,$connect);
}

#print "Records counted: $counter\n";
#print "Last count: $request_date\n";
// Only close handles that were actually opened.
if ($connect2) {
	mysql_close($connect2);
}
if ($connect) {
	mysql_close($connect);
}

/*
	Look up the title of an eprint.

	$db       - name of the ePrints database to select on the shared
	            $connect2 connection.
	$eprintid - numeric id of the eprint (matched against archive.eprintid).

	Returns the title with surrounding whitespace trimmed and every run of
	whitespace collapsed to a single space. Returns an empty string when the
	database cannot be selected, the query fails, or no such eprint exists.
*/
function getePrintName($db,$eprintid) {
	global $connect2;
	$sqldatabase = $db;
	if (!mysql_select_db($sqldatabase,$connect2)) {
		return '';
	}
	// The id is placed directly into the SQL text, so force it to an integer.
	$query3 = "select title from archive where eprintid = " . (int) $eprintid;
	$result3 = mysql_query($query3,$connect2);
	if (!$result3) {
		return '';
	}
	$row = mysql_fetch_assoc($result3);
	mysql_free_result($result3);
	if (!$row) {
		// No eprint with this id.
		return '';
	}
	$title = trim($row["title"]);
	return preg_replace("/\s+/"," ",$title);
}

?>