- Switched to timestamp based comparison for counting eprints rather than
date based, in order to fix the "hyper-inflated stats" bug.
1 parent b783392 commit 7f571df8b6f147d67ad52e2cc03b95de7a4ba3d5
nstanger authored on 28 Apr 2006
Showing 1 changed file
View
110
Repositories/statistics/scripts/eprints-usage_src.php
<?php
 
/* NJS 2006-04-28
In earlier versions of this script, which eprints to count was
determined by comparing the request date of the eprint against the
"lastproc" date of this script (i.e., minimum time unit one day).
This was fine if you only ran the script once per day, but if you ran
it more often than that, requests whose $request_date == $lastproc
were counted multiple times. For example, if you ran this script five
times per day, all the downloads that occurred during that day would
be counted EVERY TIME this script ran, thus overinflating your stats
by a factor of up to five :(
The solution is to use the full time stamp for comparison rather than
just the date. This timestamp MUST include time zone information so
that things don't get screwed up by daylight saving time. As long as
this is done consistently, there's no need to do things like convert
to GMT, for example.
The very first thing we need to do is grab the current time stamp
with time zone, which will later be stored in the database as the
"lastproc" time. This needs to happen first so that we don't "lose"
any requests that occur while the script is running.
*/
$start_time = date('Y-m-d H:i:s O');
 
 
// NJS 2005-12-09 Switched to GeoIP from GeoIP:IPfree.
include("geoip.inc");
 
$connect = mysql_pconnect ($sqlserver,$sqluser,$sqlpass);
$db = mysql_select_db($sqldatabase,$connect) or die("Could not connect");
 
// First get the date of last update
$query = "select lastproc from lastproc order by timeinsert desc limit 1";
// NJS 2006-04-28 Changed this from order by timeinsert to order by id.
// The id is always guaranteed to increase temporally, but is otherwise
// time-independent and thus not affected by things like daylight savings.
$query = "select lastproc from lastproc order by id desc limit 1";
$result = mysql_query($query,$connect);
$num_rows = mysql_num_rows($result);
if ($num_rows > 0) {
$row = mysql_fetch_assoc($result);
$datetestA = 0;
}
 
$connect2 = mysql_connect($sqlserver2,$sqluser2,$sqlpass2);
$counter = 1;
$counter = 0;
foreach($log_file as $archivename=>$archivelog) {
$logf = $log_dir . $archivelog;
$archive_name = $archivename;
$handle = fopen($logf, "r");
$date = $matches[2];
$archive = $matches[3];
$uniquebits = $buffer;
$date = preg_replace("/:.*/","",$date);
$date = preg_replace("/\//", " ", $date);
$when = getdate(strtotime($date));
$request_date = $when["year"]."-".$when["mon"]."-".$when["mday"];
$datetestB = strtotime($request_date);
/* NJS 2006-04-25
IMPORTANT: if you run this script more than once per day,
it will count multiple times downloads whose
$request_date == $lastproc. For example, if you ran this
script five times per day, all the downloads that
occurred during that day would be counted EVERY TIME this
script ran, thus overinflating your stats by a factor of
up to five :( This happens because $lastproc has one day
as its base unit.
 
If finer granularity for stats updates is desired, the
solution would be to use the full timestamp rather than
just the date.
/* NJS 2006-04-28
Switched to timestamp rather than date-based comparison.
First, clean up the Apache request date into something
that strtotime understands. Note that the Apache log
dates include time zone info by default.
*/
$date = preg_replace("/:/"," ",$date,1); // Change first ":" to " ".
$date = preg_replace("/\//", " ", $date); // Change all "/" to " ".
$datetestB = strtotime($date);
// Convert to properly formatted date string.
$request_date = date('Y-m-d H:i:s O', $datetestB);
 
if ($datetestB < $datetestA)
continue;
// NJS 2005-11-25 Added regexp for EPrints short URLs.
Keep track of where we are. Should avoid duplication of results
if the script is run more than once on the same log file
*/
 
$query = "INSERT into lastproc (lastproc) values('".$request_date."')";
// NJS 2006-04-28 Switched value inserted to $start_time instead of $request_date.
$query = "INSERT into lastproc (lastproc) values('".$start_time."')";
$result = mysql_query($query,$connect);
 
#print "Records counted: $counter\n";
#print "Last count: $request_date\n";
mysql_close($connect2);
mysql_close($connect);
 
// Look up the title corresponding to the specified eprint id.
/**
 * Look up the title corresponding to the specified eprint id.
 *
 * The "archive" table is consulted first. Only when it has no matching row
 * is the "deletion" table queried, in which case the title is tagged with
 * " [deleted]". If neither table knows the id, a placeholder is returned.
 *
 * Uses the global $connect2 link for all queries.
 *
 * @param string $db       Name of the database to select on $connect2.
 * @param mixed  $eprintid Eprint id to look up.
 *                         NOTE(review): interpolated into the SQL unescaped;
 *                         presumably always numeric - confirm against callers.
 * @return string The title with surrounding whitespace trimmed and internal
 *                whitespace runs collapsed to single spaces, followed by
 *                " [deleted]" when it came from the deletion table, or
 *                "Unknown item [<id>]" when the id is in neither table.
 */
function getePrintName($db,$eprintid) {
    global $connect2;

    mysql_select_db($db,$connect2);

    $query3 = "select title from archive where eprintid = $eprintid";
    $result3 = mysql_query($query3,$connect2);
    $title = '';
    $suffix = '';

    // NJS 2006-04-25 Added check for empty result, probably a deleted item.
    // Look in the deletion table for details.
    if (mysql_num_rows($result3) == 0) {
        $query3 = "select title from deletion where eprintid = $eprintid";
        $result3 = mysql_query($query3,$connect2);
        // If it's not in deletion, then we have no clue what it is.
        if (mysql_num_rows($result3) == 0) {
            $title = "Unknown item [$eprintid]";
        }
        else {
            $suffix = ' [deleted]';
        }
    }

    // $title is still empty only when one of the two queries found a row;
    // $result3 then refers to whichever query succeeded.
    if ($title == '') {
        $row = mysql_fetch_assoc($result3);
        // Normalise whitespace: strip the ends, squeeze internal runs.
        $title = preg_replace("/\s+/"," ",trim($row["title"]));
    }

    return $title . $suffix;
}
 
?>