- Rationalised bot patterns and made them all case-insensitive.
- Refactored log line regexps into a single pattern.
- Removed unnecessary third argument from preg_match() call.
1 parent d448b68 commit e713b591e91ae081ccfacd44a6af522200d0e772
nstanger authored on 31 Jan 2007
Showing 1 changed file: Repositories/statistics/scripts/eprints-usage_src.php (415)
Fortunately, most search bots insert a readily identifiable string into
the user-agent part of the HTTP request, which gets recorded in the Apache
log file. We can look for these and re-code log entries as appropriate.
 
The format of this list is similar to that of the $local_IPs variable.
The key is the "country name" (in this case the name of the search
engine) plus a non-standard four-character country code starting with
"X@", separated by a comma. Each key value has an associated list of
corresponding regular expressions that can occur in the user-agent part
of the Apache log entry. If any one of these REs matches the user-agent
part of the log entry, then we should re-code the country appropriately.
 
A four-character code is used because that's what the database allows, and
it avoids having to reserve several of the "X" country codes for search
engines. (An illustrative lookup sketch follows the $bot_patterns table
below.)
*/
$bot_patterns = array(
// Yahoo! (http://www.yahoo.com/)
'Yahoo!,X@YH' => array(
'/yahoo! slurp/i',
'/yahooseeker/i',
),
// Windows Live Search (http://search.msn.com/)
'Windows Live Search,X@MS' => array(
'/msnbot/i',
),
// Google (http://www.google.com/)
'Google,X@GG' => array(
'/googlebot/i',
),
// Ask.com (http://www.ask.com/)
'Ask.com,X@AC' => array(
'/ask jeeves\/teoma/i',
),
// Everything else I could find in our log files :)
'Other search engine,X@OS' => array(
// TAMU Internet Research Lab (http://irl.cs.tamu.edu/)
'/http:\/\/irl\.cs\.tamu\.edu\/crawler/i',
// Alexa web search (http://www.alexa.com/)
'/ia_archiver/i',
// TrueKnowledge for Web (http://www.authoritativeweb.com/)
'/converacrawler/i',
// Majestic 12 distributed search engine (http://www.majestic12.co.uk/)
'/mj12bot/i',
// Picsearch (http://www.picsearch.com/)
'/psbot/i',
// Exalead (http://www.exalead.com/search)
'/exabot/i',
// Cazoodle (note cazoodle.com doesn't exist)
'/cazoodlebot crawler/i',
'/mqbot@cazoodle\.com/i',
// Gigablast (http://www.gigablast.com/)
'/gigabot/i',
// Houxou (http://www.houxou.com/)
'/houxoucrawler/i',
'/crawler at houxou dot com/i',
// IBM Almaden Research Center Computer Science group (http://www.almaden.ibm.com/cs/)
'/http:\/\/www\.almaden\.ibm\.com\/cs\/crawler/i',
// Goo? (http://help.goo.ne.jp/)
'/ichiro/i',
// Daum Communications Corp (Korea)
'/edacious & intelligent web robot/i',
'/daum communications corp/i',
'/daum web robot/i',
'/msie is not me/i',
'/daumoa/i',
// Girafa (http://www.girafa.com/)
'/girafabot/i',
// The Generations Network (http://www.myfamilyinc.com/)
'/myfamilybot/i',
// Naver? (http://www.naver.com/)
'/naverbot/i',
// WiseNut (http://www.wisenutbot.com/)
'/zyborg/i',
'/wn-[0-9]+\.zyborg@looksmart\.net/i',
// Accelobot (http://www.accelobot.com/)
// This one seems particularly busy!
'/heritrix/i',
// Seeqpod (http://www.seeqpod.com/)
'/seeqpod-vertical-crawler/i',
// University of Illinois at Urbana-Champaign, Computer Science (http://www.cs.uiuc.edu/)
'/mqbot crawler/i',
'/mqbot@cs\.uiuc\.edu/i',
// Microsoft Research (http://research.microsoft.com/)
'/msrbot/i',
// Nusearch
'/nusearch spider/i',
// SourceForge (http://www.sf.net/)
'/nutch-agent@lists\.sourceforge\.net/i',
// Lucene (http://lucene.apache.org/)
'/nutch-agent@lucene\.apache\.org/i',
'/raphael@unterreuth.de/i',
// Computer Science, University of Washington (http://cs.washington.edu/)
'/nutch running at uw/i',
'/sycrawl@cs\.washington\.edu/i',
// Chikayama & Taura Laboratory, University of Tokyo (http://www.logos.ic.i.u-tokyo.ac.jp/)
'/shim-crawler/i',
'/crawl@logos\.ic\.i\.u-tokyo\.ac\.jp/i',
// Sproose (http://www.sproose.com/)
'/sproose bot/i',
'/crawler@sproose\.com/i',
// Turnitin (http://www.turnitin.com/)
'/turnitinbot/i',
// WISH Project (http://wish.slis.tsukuba.ac.jp/)
'/wish-project/i',
// WWWster
'/wwwster/i',
'/gue@cis\.uni-muenchen\.de/i',
// Forex Trading Network Organization (http://www.netforex.org/)
'/forex trading network organization/i',
'/info@netforex\.org/i',
// FunnelBack (http://www.funnelback.com/)
'/funnelback/i',
// Baidu (http://www.baidu.com/)
'/baiduspider/i',
// Brandimensions (http://www.brandimensions.com/)
'/bdfetch/i',
// Blaiz Enterprises (http://www.blaiz.net/)
'/blaiz-bee/i',
// Boitho/SearchDaimon (http://www.boitho.com/ or http://www.searchdaimon.com/)
'/boitho\.com-dc/i',
// Celestial (OAI aggregator, see http://oai-perl.sourceforge.net/ for a little info)
'/celestial/i',
// Cipinet (http://www.cipinet.com/)
'/cipinetbot/i',
// iVia (http://ivia.ucr.edu/)
'/crawlertest crawlertest/i',
// Encyclopedia of Keywords (http://keywen.com/)
'/easydl/i',
// Everest-Vulcan Inc. (http://everest.vulcan.com/)
'/everest-vulcan inc/i',
// FactBites (http://www.factbites.com/)
'/factbot/i',
// Scirus (http://www.scirus.com/)
'/scirus scirus-crawler@fast\.no/i',
// UOL (http://www.uol.com.br/)
'/uolcrawler/i',
'/soscrawler@uol\.com\.br/i',
// Always Updated (http://www.updated.com/)
'/updated crawler/i',
'/crawler@updated\.com/i',
// FAST Enterprise Search (http://www.fast.no/)
'/fast metaweb crawler/i',
'/crawler@fast\.no/i',
'/helpdesk at fastsearch dot com/i',
// Deutsche Wortschatz Portal (http://wortschatz.uni-leipzig.de/)
'/findlinks/i',
// Gais (http://gais.cs.ccu.edu.tw/)
'/gaisbot/i',
'/robot[0-9]{2}@gais.cs.ccu.edu.tw/i',
// http://ilse.net/
'/ingrid/i',
// Krugle (http://corp.krugle.com/)
'/krugle\/krugle/i',
'/krugle web crawler/i',
'/webcrawler@krugle\.com/i',
// WebWobot (http://www.webwobot.com/)
'/scollspider/i',
// Omni-Explorer (http://www.omni-explorer.com/)
'/omniexplorer_bot/i',
'/worldindexer/i',
// PageBull (http://www.pagebull.com/)
'/pagebull http:\/\/www\.pagebull\.com\//i',
// dir.com (http://dir.com/)
'/pompos/i',
// Sensis (http://sensis.com.au/)
'/sensis web crawler/i',
'/search_comments\\\\at\\\\sensis\\\\dot\\\\com\\\\dot\\\\au/i',
// Shopwiki (http://www.shopwiki.com/)
'/shopwiki/i',
// Guruji (http://www.terrawiz.com/)
'/terrawizbot/i',
// Language Observatory Project (http://www.language-observatory.org/)
'/ubicrawler/i',
// MSIE offline bookmarks crawler
'/msiecrawler/i',
// Unidentified
'/bot/i',
'/crawler/i',
'/spider/i',
'/larbin/i', // also larbinSpider
'/httrack/i',
'/voyager/i',
'/acadiauniversitywebcensusclient/i',
'/feedchecker/i',
'/knowitall\(knowitall@cs\.washington\.edu\)/i',
'/mediapartners-google/i',
'/psycheclone/i',
'/topicblogs/i',
'/nutch/i',
),
);
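/*
   Illustrative sketch only, not part of the original script: one way the
   $bot_patterns table above can drive the re-coding. The function name is
   hypothetical; the processing loop further down performs a similar match
   inline. The key is split on the comma to recover the "country name" and
   the four-character code.
*/
function classify_user_agent($user_agent, $bot_patterns)
{
	foreach ($bot_patterns as $id => $patterns)
	{
		foreach ($patterns as $pat)
		{
			if (preg_match($pat, $user_agent))
			{
				// e.g. 'Google,X@GG' => array('Google', 'X@GG')
				list($country_name, $country_code) = explode(',', $id);
				return array($country_name, $country_code);
			}
		}
	}
	return FALSE; // not a recognised bot; fall back to the normal GeoIP lookup
}
// Example: classify_user_agent('Mozilla/5.0 (compatible; Googlebot/2.1)', $bot_patterns)
// would return array('Google', 'X@GG').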
 
###########################################
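/*
   Illustrative sketch only, not part of the original script: the single
   log-line regexp documented inside the loop below, wrapped in a
   hypothetical helper so the capture groups are easier to see. Given a
   line such as
     168.192.1.1 - - [31/Jan/2007:09:15:36 +1300] "GET /archive/00000001/ HTTP/1.1" 200 12345 "referer" "user-agent"
   it returns the host/IP, the timestamp, the eprint ID and, when the
   trailing quoted field is present, the user-agent string.
*/
function parse_log_line($line)
{
	if (!preg_match("/^(\S+(?:\.\S+)+) - - \[(.*?)\] \"GET \/(?:archive\/0{1,8})?(\d{1,4}).*? HTTP\/1..\" 200 .*?(\"[^\"]+\")?$/i", $line, $m))
		return FALSE;
	// Mirror the refactored download/abstract check used later in the loop.
	$view_type = preg_match("/GET \/(?:archive\/0{1,8})?\d{1,4}\/\d\d\//i", $line) ? "download" : "abstract";
	return array(
		'host'       => $m[1], // e.g. 168.192.1.1 or foo.bar.com
		'date'       => $m[2], // e.g. 31/Jan/2007:09:15:36 +1300
		'eprint_id'  => $m[3], // e.g. 1
		'user_agent' => isset($m[4]) ? trim($m[4], '"') : '',
		'view_type'  => $view_type, // "download" or "abstract"
	);
}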
$logf = $log_dir . $archive_log;
$handle = fopen($logf, "r");
while (!feof($handle)) {
$buffer = fgets($handle, 4096);
/* NJS 2007-01-26
Added user-agent match to all regexps to enable bot detection.
NJS 2007-01-31
Refactored regexps from four down to one, after realising
that (a) long EPrints URLs are a superset of the short ones,
and (b) a regexp that matches domain names works just as well
for IP addresses (the GeoIP lookup doesn't care which it
gets). Also fixed the pattern so it can handle an arbitrary
number of subdomains. Note that the latter would be the main
argument for keeping a separate IP address pattern, as IP
addresses always comprise exactly four parts. However, it's
not really up to the script to verify IP addresses; Apache
should be recording them correctly in the first place!
The typical kinds of strings we are matching look something
like this:
fetch abstract (short, long):
168.192.1.1 - - [31/Jan/2007:09:15:36 +1300] "GET /1/ HTTP/1.1" 200 12345 "referer" "user-agent"
168.192.1.1 - - [31/Jan/2007:09:15:36 +1300] "GET /archive/00000001/ HTTP/1.1" 200 12345 "referer" "user-agent"
download item (short, long):
168.192.1.1 - - [31/Jan/2007:09:15:37 +1300] "GET /1/01/foo.pdf HTTP/1.1" 200 12345 "referer" "user-agent"
168.192.1.1 - - [31/Jan/2007:09:15:37 +1300] "GET /archive/00000001/01/foo.pdf HTTP/1.1" 200 12345 "referer" "user-agent"
Plus any of the above with a domain name substituted for the IP
address (e.g., foo.bar.com instead of 168.192.1.1).
*/
if (preg_match("/^(\S+(?:\.\S+)+) - - \[(.*?)\] \"GET \/(?:archive\/0{1,8})?(\d{1,4}).*? HTTP\/1..\" 200 .*?(\"[^\"]+\")?$/i",$buffer,$matches))
{
$counter++;
$country_code = '';
$country_name = '';
foreach ($bot_patterns as $id => $patterns)
{
foreach ($patterns as $pat)
{
if (preg_match($pat, $user_agent))
{
$found_country = TRUE;
break;
}
$eprint_id = $matches[3];
$uniquebits = $buffer;
// NJS 2007-01-31 Refactored into one regexp for both styles.
if (preg_match("/GET \/(?:archive\/0{1,8})?\d{1,4}\/\d\d\//i",$buffer)) {
$view_type = "download";
} else {
$view_type = "abstract";
}
}
 
/*
Keep track of where we are. Should avoid duplication of results
if the script is run more than once on the same log file.
*/
 
// NJS 2006-04-28 Switched value inserted to $start_time instead of $request_date.
$query = "INSERT into lastproc (lastproc) values('".$start_time."')";