diff --git a/Repositories/statistics/scripts/eprints-usage_src.php b/Repositories/statistics/scripts/eprints-usage_src.php index 79e56bb..72ce7ec 100755 --- a/Repositories/statistics/scripts/eprints-usage_src.php +++ b/Repositories/statistics/scripts/eprints-usage_src.php @@ -24,6 +24,46 @@ */ $start_time = date('Y-m-d H:i:s O'); +/* NJS 2007-01-30 + A further twist! The original script ignored log lines that had a + date falling before $lastproc, i.e., if log line date < $lastproc + then it's already been dealt with. This is all fine. However, it + didn't bother checking for log lines that were written after the + script started running (i.e. log line date >= $start_time). + + Why is this a problem? We're reading the live Apache log file, so + it's quite likely that new lines will be written to it after the + script has started (i.e., after $start_time). Suppose $start_time is + '2006-06-15 14:03:15 +1200', $lastproc is '2006-06-15 12:03:15 +1200' + (i.e., the script is run every two hours) and the log file contains + lines with the following dates: + + '2006-06-15 10:03:15 +1200' [1] <-- written before $lastproc + '2006-06-15 12:03:14 +1200' [2] <-- written before $lastproc + '2006-06-15 13:03:15 +1200' [3] <-- written before $start_time + '2006-06-15 14:03:14 +1200' [4] <-- written before $start_time + '2006-06-15 14:03:15 +1200' [5] <-- written at $start_time + '2006-06-15 14:03:16 +1200' [6] <-- written after $start_time + + During this run, dates [1] and [2] are both < $lastproc and thus + ignored. The remaining four dates ([3]--[6]) are >= $lastproc and + thus processed. + + Two hours later, the script runs again, this time with $start_time + set to '2006-06-15 16:03:15 +1200' and $lastproc to '2006-06-15 + 14:03:15 +1200'. Dates [1] through [4] are all < $lastproc and + thus ignored. However, dates [5] and [6] are both >= $lastproc + and are processed a second time, resulting in a duplicate entry + in the database. 
+ + The solution is to ignore any log line entries that occur at or after + (>=) $start_time. In the example above, this would mean that in the + first run, dates [1], [2], [5] and [6] would be ignored and dates [3] + and [4] processed. In the second run, dates [1]--[4] would be ignored + and dates [5] and [6] processed. +*/ +$test_starttime = strtotime($start_time); + // NJS 2005-12-09 Switched to GeoIP from GeoIP:IPfree. include("geoip.inc"); @@ -377,35 +417,18 @@ '/UbiCrawler/', '/http:\/\/gii\.nagaokaut\.ac\.jp\/~ubi\//', // Unidentified - '/bot@bot\.bot/', // see (http://www.webmasterworld.com/search_engine_spiders/3186855.htm) - '/bot\/1.0/', + '/[Bb]ot/', + '/[Cc]rawler/', + '/[Ss]pider/', '/larbin/', // also larbinSpider '/HTTrack/', - '/nicebot/', - '/Snapbot/', - '/sogou spider/', - '/TMCrawler/', '/voyager/', '/AcadiaUniversityWebCensusClient/', - '/BeijingCrawler/', '/FeedChecker/', - '/g2Crawler \(?nobody@airmail\.net\)?/', - '/gsa-crawler/', - '/JobSpider_BA/', '/KnowItAll\(knowitall@cs\.washington\.edu\)/', '/Mediapartners-Google/', - '/obeys UserAgent NimbleCrawler For problems contact: crawler@healthline\.com/', '/psycheclone/', - '/RAMPyBot - www.giveRAMP.com/', - '/Robo Crawler/', - '/ScSpider/', - '/snap\.com beta crawler v0/', - '/spider (tspyyp@tom\.com)/', '/topicblogs/', - '/Twiceler www\.cuill\.com\/robots\.html/', - '/WebFilter Robot/', - '/WILF \(cybermetrics\.wlv\.ac\.uk\/robots\.htm\)/', - '/Bot,Robot,Spider,Crawler,aromano@cli\.di\.unipi\.it/', ), ); @@ -419,19 +442,23 @@ $db = mysql_select_db($sqldatabase,$connect) or die("Could not connect"); // First get the date of last update -// NJS 2006-04-28 Changed this from order by timeinsert to order by id. -// The is always guaranteed to increase temporally, but is otherwise -// time-independent and thus not affected by things like daylight savings. +/* NJS 2006-04-28 + Changed this from order by timeinsert to order by id. 
The ID is + always guaranteed to increase temporally, but is otherwise + time-independent and thus not affected by things like daylight + savings. +*/ $query = "SELECT lastproc FROM lastproc ORDER BY id DESC LIMIT 1"; $result = mysql_query($query,$connect); $num_rows = mysql_num_rows($result); if ($num_rows > 0) { $row = mysql_fetch_assoc($result); $lastproc = $row["lastproc"]; - $datetestA = strtotime($lastproc); + // NJS 2007-01-30 Refactored $datetestA to more meaningful $test_lastproc. + $test_lastproc = strtotime($lastproc); } else { - $datetestA = 0; + $test_lastproc = 0; } // NJS 2006-06-14: Generalised connection list for multiple archives. @@ -448,7 +475,7 @@ while (!feof($handle)) { $buffer = fgets($handle, 4096); // NJS 2005-11-25 Added regexp for EPrints short URLs. - // NJS 2007-01-26 Added referer match to all regexps to enable bot detection. + // NJS 2007-01-26 Added user-agent match to all regexps to enable bot detection. // NJS 2007-01-29 Added missing regexp for EPrints short URLs with domain names rather than IP addresses. if ((preg_match("/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*(\"[^\"]+\")?$/i",$buffer,$matches)) || (preg_match("/^(\S{1,}\.\S{1,}\.\S{1,}\.\S{1,}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*(\"[^\"]+\")?$/i",$buffer,$matches)) || @@ -464,9 +491,9 @@ $uniquebits = ''; /* NJS 2007-01-29 - Moved date checking to the start of the loop, as there's - no point in doing any of the regexp checks if we've already - processed this log entry and are going to discard it anyway. + Moved date checking to the start of the loop, as there's + no point in doing any of the regexp checks if we've already + processed this log entry and will discard it anyway. */ $date = $matches[2]; /* NJS 2006-04-28 @@ -477,18 +504,22 @@ */ $date = preg_replace("/:/"," ",$date,1); // Change first ":" to " ". $date = preg_replace("/\//", " ", $date); // Change all "/" to " ". 
- $datetestB = strtotime($date); + // NJS 2007-01-30 Refactored $datetestB to more meaningful + // $test_logdate. + $test_logdate = strtotime($date); - if ($datetestB < $datetestA) + // NJS 2007-01-30 Added test for log dates >= $start_time. + if ( ( $test_logdate < $test_lastproc ) || + ( $test_logdate >= $test_starttime ) ) continue; // Convert to properly formatted date string. - $request_date = date('Y-m-d H:i:s O', $datetestB); + $request_date = date('Y-m-d H:i:s O', $test_logdate); /* NJS 2005-12-16 - Determine country code and name. - Check whether the IP number falls into any of the local - intranet ranges. If so, then use that. + Determine country code and name. + Check whether the IP number falls into any of the local + intranet ranges. If so, then use that. */ $ip = $matches[1]; $ip_long = ip2long($ip); @@ -533,15 +564,15 @@ // end NJS 2005-12-16 /* NJS 2007-01-26 - Check whether this is a bot reference. + Check whether this is a bot reference. */ - $referer = $matches[4]; + $user_agent = $matches[4]; $found_country = FALSE; foreach ($bot_patterns as $id => $patterns) { foreach ($patterns as $pat) { - if (preg_match($pat, $referer, $matches2)) + if (preg_match($pat, $user_agent, $matches2)) { $found_country = TRUE; break;