- Fixed bug that caused duplicate entries in about 1% of cases.
- Simplified unidentified bot regexps.
- Refactored "datetest[AB]" variables to more sensible names.
- Corrected $referer to $user_agent.
1 parent f167d5e commit b54de2baed9018b6ec1fd6296dda7a80faa16a88
nstanger authored on 30 Jan 2007
Showing 1 changed file
View
118
Repositories/statistics/scripts/eprints-usage_src.php
any requests that occur while the script is running.
*/
$start_time = date('Y-m-d H:i:s O');
 
/* NJS 2007-01-30
A further twist! The original script ignored log lines that had a
date falling before $lastproc, i.e., if log line date < $lastproc
then it's already been dealt with. This is all fine. However, it
didn't bother checking for log lines that were written after the
script started running (i.e. log line date >= $start_time).
Why is this a problem? We're reading the live Apache log file, so
it's quite likely that new lines will be written to it after the
script has started (i.e., after $start_time). Suppose $start_time is
'2006-06-15 14:03:15 +1200', $lastproc is '2006-06-15 12:03:15 +1200'
(i.e., the script is run every two hours) and the log file contains
lines with the following dates:
'2006-06-15 10:03:15 +1200' [1] <-- written before $lastproc
'2006-06-15 12:03:14 +1200' [2] <-- written before $lastproc
'2006-06-15 13:03:15 +1200' [3] <-- written before $start_time
'2006-06-15 14:03:14 +1200' [4] <-- written before $start_time
'2006-06-15 14:03:15 +1200' [5] <-- written at $start_time
'2006-06-15 14:03:16 +1200' [6] <-- written after $start_time
 
During this run, dates [1] and [2] are both < $lastproc and thus
ignored. The remaining four dates ([3]--[6]) are >= $lastproc and
thus processed.
 
Two hours later, the script runs again, this time with $start_time
set to '2006-06-15 16:03:15 +1200' and $lastproc to '2006-06-15
14:03:15 +1200'. Dates [1] through [4] are all < $lastproc and
thus ignored. However, dates [5] and [6] are both >= $lastproc
and are processed a second time, resulting in a duplicate entry
in the database.
The solution is to ignore any log line entries that occur at or after
(>=) $start_time. In the example above, this would mean that in the
first run, dates [1], [2], [5] and [6] would be ignored and dates [3]
and [4] processed. In the second run, dates [1]--[4] would be ignored
and dates [5] and [6] processed.
*/
$test_starttime = strtotime($start_time);
 
 
// NJS 2005-12-09 Switched to GeoIP from GeoIP:IPfree.
include("geoip.inc");
 
// Language Observatory Project (http://www.language-observatory.org/)
'/UbiCrawler/',
'/http:\/\/gii\.nagaokaut\.ac\.jp\/~ubi\//',
// Unidentified
'/bot@bot\.bot/', // see (http://www.webmasterworld.com/search_engine_spiders/3186855.htm)
'/bot\/1.0/',
'/[Bb]ot/',
'/[Cc]rawler/',
'/[Ss]pider/',
'/larbin/', // also larbinSpider
'/HTTrack/',
'/nicebot/',
'/Snapbot/',
'/sogou spider/',
'/TMCrawler/',
'/voyager/',
'/AcadiaUniversityWebCensusClient/',
'/BeijingCrawler/',
'/FeedChecker/',
'/g2Crawler \(?nobody@airmail\.net\)?/',
'/gsa-crawler/',
'/JobSpider_BA/',
'/KnowItAll\(knowitall@cs\.washington\.edu\)/',
'/Mediapartners-Google/',
'/obeys UserAgent NimbleCrawler For problems contact: crawler@healthline\.com/',
'/psycheclone/',
'/RAMPyBot - www.giveRAMP.com/',
'/Robo Crawler/',
'/ScSpider/',
'/snap\.com beta crawler v0/',
'/spider (tspyyp@tom\.com)/',
'/topicblogs/',
'/Twiceler www\.cuill\.com\/robots\.html/',
'/WebFilter Robot/',
'/WILF \(cybermetrics\.wlv\.ac\.uk\/robots\.htm\)/',
'/Bot,Robot,Spider,Crawler,aromano@cli\.di\.unipi\.it/',
),
);
 
###########################################
// Open a persistent connection to the MySQL server, then select the
// statistics database. Only the result of mysql_select_db() is checked
// here (the script aborts with "Could not connect" if it fails); the
// return value of mysql_pconnect() itself is not tested.
$connect = mysql_pconnect ($sqlserver,$sqluser,$sqlpass);
$db = mysql_select_db($sqldatabase,$connect) or die("Could not connect");
 
// First get the date of last update
// NJS 2006-04-28 Changed this from order by timeinsert to order by id.
// The ID is always guaranteed to increase temporally, but is otherwise
// time-independent and thus not affected by things like daylight savings.
/* NJS 2006-04-28
Changed this from order by timeinsert to order by id. The ID is
always guaranteed to increase temporally, but is otherwise
time-independent and thus not affected by things like daylight
savings.
*/
// Fetch the most recent "last processed" value: the lastproc column of the
// row with the highest id in the lastproc table.
$query = "SELECT lastproc FROM lastproc ORDER BY id DESC LIMIT 1";
$result = mysql_query($query,$connect);
$num_rows = mysql_num_rows($result);
if ($num_rows > 0) {
$row = mysql_fetch_assoc($result);
$lastproc = $row["lastproc"];
$datetestA = strtotime($lastproc);
// NJS 2007-01-30 Refactored $datetestA to more meaningful $test_lastproc.
// Unix timestamp of the last processed time; log lines dated before this
// are skipped later in the script.
$test_lastproc = strtotime($lastproc);
}
else {
// No row found: fall back to timestamp 0 as the cutoff, so the
// "log date < last processed" test should not exclude any log line.
$datetestA = 0;
$test_lastproc = 0;
}
 
// NJS 2006-06-14: Generalised connection list for multiple archives.
$eprints_connections = array();
$handle = fopen($logf, "r");
while (!feof($handle)) {
$buffer = fgets($handle, 4096);
// NJS 2005-11-25 Added regexp for EPrints short URLs.
// NJS 2007-01-26 Added referer match to all regexps to enable bot detection.
// NJS 2007-01-26 Added user-agent match to all regexps to enable bot detection.
// NJS 2007-01-29 Added missing regexp for EPrints short URLs with domain names rather than IP addresses.
if ((preg_match("/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*(\"[^\"]+\")?$/i",$buffer,$matches)) ||
(preg_match("/^(\S{1,}\.\S{1,}\.\S{1,}\.\S{1,}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*(\"[^\"]+\")?$/i",$buffer,$matches)) ||
(preg_match("/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/(\d{1,4}).*? HTTP\/1..\" 200 .*(\"[^\"]+\")?$/i",$buffer,$matches)) ||
$view_type = '';
$uniquebits = '';
/* NJS 2007-01-29
Moved date checking to the start of the loop, as there's
no point in doing any of the regexp checks if we've already
processed this log entry and are going to discard it anyway.
Moved date checking to the start of the loop, as there's
no point in doing any of the regexp checks if we've already
processed this log entry and will discard it anyway.
*/
$date = $matches[2];
/* NJS 2006-04-28
Switched to timestamp rather than date-based comparison.
dates include time zone info by default.
*/
$date = preg_replace("/:/"," ",$date,1); // Change first ":" to " ".
$date = preg_replace("/\//", " ", $date); // Change all "/" to " ".
$datetestB = strtotime($date);
 
if ($datetestB < $datetestA)
// NJS 2007-01-30 Refactored $datetestB to more meaningful
// $test_logdate.
$test_logdate = strtotime($date);
 
// NJS 2007-01-30 Added test for log dates >= $start_time.
if ( ( $test_logdate < $test_lastproc ) ||
( $test_logdate >= $test_starttime ) )
continue;
// Convert to properly formatted date string.
$request_date = date('Y-m-d H:i:s O', $datetestB);
$request_date = date('Y-m-d H:i:s O', $test_logdate);
/* NJS 2005-12-16
Determine country code and name.
Check whether the IP number falls into any of the local
intranet ranges. If so, then use that.
Determine country code and name.
Check whether the IP number falls into any of the local
intranet ranges. If so, then use that.
*/
$ip = $matches[1];
$ip_long = ip2long($ip);
$found_country = FALSE;
}
// end NJS 2005-12-16
/* NJS 2007-01-26
Check whether this is a bot reference.
Check whether this is a bot reference.
*/
$referer = $matches[4];
$user_agent = $matches[4];
$found_country = FALSE;
foreach ($bot_patterns as $id => $patterns)
{
foreach ($patterns as $pat)
{
if (preg_match($pat, $referer, $matches2))
if (preg_match($pat, $user_agent, $matches2))
{
$found_country = TRUE;
break;
}