| |
---|
| | any requests that occur while the script is running. |
---|
| | */ |
---|
| | $start_time = date('Y-m-d H:i:s O'); |
---|
| | |
---|
| | /* NJS 2007-01-30 |
---|
| | A further twist! The original script ignored log lines that had a |
---|
| | date falling before $lastproc, i.e., if log line date < $lastproc |
---|
| | then it's already been dealt with. This is all fine. However, it |
---|
| | didn't bother checking for log lines that were written after the |
---|
| | script started running (i.e. log line date >= $start_time). |
---|
| | |
---|
| | Why is this a problem? We're reading the live Apache log file, so |
---|
| | it's quite likely that new lines will be written to it after the |
---|
| | script has started (i.e., after $start_time). Suppose $start_time is |
---|
| | '2006-06-15 14:03:15 +1200', $lastproc is '2006-06-15 12:03:15 +1200' |
---|
| | (i.e., the script is run every two hours) and the log file contains |
---|
| | lines with the following dates: |
---|
| | |
---|
| | '2006-06-15 10:03:15 +1200' [1] <-- written before $lastproc |
---|
| | '2006-06-15 12:03:14 +1200' [2] <-- written before $lastproc |
---|
| | '2006-06-15 13:03:15 +1200' [3] <-- written before $start_time |
---|
| | '2006-06-15 14:03:14 +1200' [4] <-- written before $start_time |
---|
| | '2006-06-15 14:03:15 +1200' [5] <-- written at $start_time |
---|
| | '2006-06-15 14:03:16 +1200' [6] <-- written after $start_time |
---|
| | |
---|
| | During this run, dates [1] and [2] are both < $lastproc and thus |
---|
| | ignored. The remaining four dates ([3]--[6]) are >= $lastproc and |
---|
| | thus processed. |
---|
| | |
---|
| | Two hours later, the script runs again, this time with $start_time |
---|
| | set to '2006-06-15 16:03:15 +1200' and $lastproc to '2006-06-15 |
---|
| | 14:03:15 +1200'. Dates [1] through [4] are all < $lastproc and |
---|
| | thus ignored. However, dates [5] and [6] are both >= $lastproc |
---|
| | and are processed a second time, resulting in a duplicate entry |
---|
| | in the database. |
---|
| | |
---|
| | The solution is to ignore any log line entries that occur at or after |
---|
| | (>=) $start_time. In the example above, this would mean that in the |
---|
| | first run, dates [1], [2], [5] and [6] would be ignored and dates [3] |
---|
| | and [4] processed. In the second run, dates [1]--[4] would be ignored |
---|
| | and dates [5] and [6] processed. |
---|
| | */ |
---|
| | $test_starttime = strtotime($start_time); |
---|
| | |
---|
| | |
---|
| | // NJS 2005-12-09 Switched to GeoIP from GeoIP:IPfree. |
---|
| | include("geoip.inc"); |
---|
| | |
---|
| |
---|
| | // Language Observatory Project (http://www.language-observatory.org/) |
---|
| | '/UbiCrawler/', |
---|
| | '/http:\/\/gii\.nagaokaut\.ac\.jp\/~ubi\//', |
---|
| | // Unidentified |
---|
| | '/bot@bot\.bot/', // see (http://www.webmasterworld.com/search_engine_spiders/3186855.htm) |
---|
| | '/bot\/1.0/', |
---|
| | '/[Bb]ot/', |
---|
| | '/[Cc]rawler/', |
---|
| | '/[Ss]pider/', |
---|
| | '/larbin/', // also larbinSpider |
---|
| | '/HTTrack/', |
---|
| | '/nicebot/', |
---|
| | '/Snapbot/', |
---|
| | '/sogou spider/', |
---|
| | '/TMCrawler/', |
---|
| | '/voyager/', |
---|
| | '/AcadiaUniversityWebCensusClient/', |
---|
| | '/BeijingCrawler/', |
---|
| | '/FeedChecker/', |
---|
| | '/g2Crawler \(?nobody@airmail\.net\)?/', |
---|
| | '/gsa-crawler/', |
---|
| | '/JobSpider_BA/', |
---|
| | '/KnowItAll\(knowitall@cs\.washington\.edu\)/', |
---|
| | '/Mediapartners-Google/', |
---|
| | '/obeys UserAgent NimbleCrawler For problems contact: crawler@healthline\.com/', |
---|
| | '/psycheclone/', |
---|
| | '/RAMPyBot - www.giveRAMP.com/', |
---|
| | '/Robo Crawler/', |
---|
| | '/ScSpider/', |
---|
| | '/snap\.com beta crawler v0/', |
---|
| | '/spider (tspyyp@tom\.com)/', |
---|
| | '/topicblogs/', |
---|
| | '/Twiceler www\.cuill\.com\/robots\.html/', |
---|
| | '/WebFilter Robot/', |
---|
| | '/WILF \(cybermetrics\.wlv\.ac\.uk\/robots\.htm\)/', |
---|
| | '/Bot,Robot,Spider,Crawler,aromano@cli\.di\.unipi\.it/', |
---|
| | ), |
---|
| | ); |
---|
| | |
---|
| | ########################################### |
---|
| |
---|
| | $connect = mysql_pconnect ($sqlserver,$sqluser,$sqlpass); |
---|
| | $db = mysql_select_db($sqldatabase,$connect) or die("Could not connect"); |
---|
| | |
---|
| | // First get the date of last update |
---|
| | // NJS 2006-04-28 Changed this from order by timeinsert to order by id. |
---|
| | // The ID is always guaranteed to increase temporally, but is otherwise |
---|
| | // time-independent and thus not affected by things like daylight savings. |
---|
| | /* NJS 2006-04-28 |
---|
| | Changed this from order by timeinsert to order by id. The ID is |
---|
| | always guaranteed to increase temporally, but is otherwise |
---|
| | time-independent and thus not affected by things like daylight |
---|
| | savings. |
---|
| | */ |
---|
| | $query = "SELECT lastproc FROM lastproc ORDER BY id DESC LIMIT 1"; |
---|
| | $result = mysql_query($query,$connect); |
---|
| | $num_rows = mysql_num_rows($result); |
---|
| | if ($num_rows > 0) { |
---|
| | $row = mysql_fetch_assoc($result); |
---|
| | $lastproc = $row["lastproc"]; |
---|
| | $datetestA = strtotime($lastproc); |
---|
| | // NJS 2007-01-30 Refactored $datetestA to more meaningful $test_lastproc. |
---|
| | $test_lastproc = strtotime($lastproc); |
---|
| | } |
---|
| | else { |
---|
| | $datetestA = 0; |
---|
| | $test_lastproc = 0; |
---|
| | } |
---|
| | |
---|
| | // NJS 2006-06-14: Generalised connection list for multiple archives. |
---|
| | $eprints_connections = array(); |
---|
| |
---|
| | $handle = fopen($logf, "r"); |
---|
| | while (!feof($handle)) { |
---|
| | $buffer = fgets($handle, 4096); |
---|
| | // NJS 2005-11-25 Added regexp for EPrints short URLs. |
---|
| | // NJS 2007-01-26 Added referer match to all regexps to enable bot detection. |
---|
| | // NJS 2007-01-26 Added user-agent match to all regexps to enable bot detection. |
---|
| | // NJS 2007-01-29 Added missing regexp for EPrints short URLs with domain names rather than IP addresses. |
---|
| | if ((preg_match("/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*(\"[^\"]+\")?$/i",$buffer,$matches)) || |
---|
| | (preg_match("/^(\S{1,}\.\S{1,}\.\S{1,}\.\S{1,}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*(\"[^\"]+\")?$/i",$buffer,$matches)) || |
---|
| | (preg_match("/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/(\d{1,4}).*? HTTP\/1..\" 200 .*(\"[^\"]+\")?$/i",$buffer,$matches)) || |
---|
| |
---|
| | $view_type = ''; |
---|
| | $uniquebits = ''; |
---|
| | |
---|
| | /* NJS 2007-01-29 |
---|
| | Moved date checking to the start of the loop, as there's |
---|
| | no point in doing any of the regexp checks if we've already |
---|
| | processed this log entry and are going to discard it anyway. |
---|
| | Moved date checking to the start of the loop, as there's |
---|
| | no point in doing any of the regexp checks if we've already |
---|
| | processed this log entry and will discard it anyway. |
---|
| | */ |
---|
| | $date = $matches[2]; |
---|
| | /* NJS 2006-04-28 |
---|
| | Switched to timestamp rather than date-based comparison. |
---|
| |
---|
| | dates include time zone info by default. |
---|
| | */ |
---|
| | $date = preg_replace("/:/"," ",$date,1); // Change first ":" to " ". |
---|
| | $date = preg_replace("/\//", " ", $date); // Change all "/" to " ". |
---|
| | $datetestB = strtotime($date); |
---|
| | |
---|
| | if ($datetestB < $datetestA) |
---|
| | // NJS 2007-01-30 Refactored $datetestB to more meaningful |
---|
| | // $test_logdate. |
---|
| | $test_logdate = strtotime($date); |
---|
| | |
---|
| | // NJS 2007-01-30 Added test for log dates >= $start_time. |
---|
| | if ( ( $test_logdate < $test_lastproc ) || |
---|
| | ( $test_logdate >= $test_starttime ) ) |
---|
| | continue; |
---|
| | |
---|
| | // Convert to properly formatted date string. |
---|
| | $request_date = date('Y-m-d H:i:s O', $datetestB); |
---|
| | $request_date = date('Y-m-d H:i:s O', $test_logdate); |
---|
| | |
---|
| | /* NJS 2005-12-16 |
---|
| | Determine country code and name. |
---|
| | Check whether the IP number falls into any of the local |
---|
| | intranet ranges. If so, then use that. |
---|
| | Determine country code and name. |
---|
| | Check whether the IP number falls into any of the local |
---|
| | intranet ranges. If so, then use that. |
---|
| | */ |
---|
| | $ip = $matches[1]; |
---|
| | $ip_long = ip2long($ip); |
---|
| | $found_country = FALSE; |
---|
| |
---|
| | } |
---|
| | // end NJS 2005-12-16 |
---|
| | |
---|
| | /* NJS 2007-01-26 |
---|
| | Check whether this is a bot reference. |
---|
| | Check whether this is a bot reference. |
---|
| | */ |
---|
| | $referer = $matches[4]; |
---|
| | $user_agent = $matches[4]; |
---|
| | $found_country = FALSE; |
---|
| | foreach ($bot_patterns as $id => $patterns) |
---|
| | { |
---|
| | foreach ($patterns as $pat) |
---|
| | { |
---|
| | if (preg_match($pat, $referer, $matches2)) |
---|
| | if (preg_match($pat, $user_agent, $matches2)) |
---|
| | { |
---|
| | $found_country = TRUE; |
---|
| | break; |
---|
| | } |
---|
| |
---|
|