diff --git a/Repositories/statistics/scripts/eprints-usage_src.php b/Repositories/statistics/scripts/eprints-usage_src.php
index 0ecf965..ccb967a 100755
--- a/Repositories/statistics/scripts/eprints-usage_src.php
+++ b/Repositories/statistics/scripts/eprints-usage_src.php
@@ -147,6 +147,268 @@
 	),
 );
 
+/* NJS 2007-01-26
+Patterns to match various search engine bots. Ideally, we'd use a similar
+mechanism to the $local_IPs variable above, but this isn't feasible because
+we'd need to know the IP ranges for the likes of Google, for example. This
+clearly isn't possible in practice.
+
+Fortunately, most search bots insert a readily identifiable string into
+the user-agent part of the HTTP request, which gets recorded in the Apache
+log file. We can look for these and re-code log entries as appropriate.
+
+The format of this list is similar to that of the $local_IPs variable. The
+key is the "country name" (in this case the name of the search engine) plus
+an "X" ISO 3166-1 country code, separated by a comma. Each key value has an
+associated list of corresponding regular expressions that can occur in the
+user-agent part of the Apache log entry. If any one of these REs matches
+the user-agent part of the log entry, then we should re-code the country
+appropriately.
+
+Note that this means that several of the "X" country codes are now reserved
+and can no longer be used in $local_IPs.
+*/
+$bot_patterns = array(
+	// Google (http://www.google.com/)
+	'Google,XG' => array(
+		'/Googlebot/',
+		'/http:\/\/www\.google\.com\/bot\.html/',
+	),
+	// Windows Live Search (http://search.msn.com/)
+	'Windows Live Search,XM' => array(
+		'/msnbot/',
+		'/http:\/\/search\.msn\.com\/msnbot\.htm/',
+	),
+	// Yahoo! (http://www.yahoo.com/)
+	'Yahoo!,XY' => array(
+		'/Yahoo! Slurp/',
+		'/YahooSeeker/',
+		'/http:\/\/help\.yahoo\.com\/help\/us\/ysearch\/slurp/',
+		'/yahooseeker-jp-mobile AT Yahoo!JAPAN/',
+	),
+	// Ask.com (http://www.ask.com/)
+	'Ask.com,XJ' => array(
+		'/Ask Jeeves\/Teoma/',
+		'/http:\/\/about\.ask\.com\/en\/docs\/about\/webmasters\.shtml/',
+	),
+	// Everything else I could find in our log files :)
+	'Other search engine,XZ' => array(
+		// TAMU Internet Research Lab (http://irl.cs.tamu.edu/)
+		'/http:\/\/irl\.cs\.tamu\.edu\/crawler/',
+		// Alexa web search (http://www.alexa.com/)
+		'/ia_archiver/',
+		// TrueKnowledge for Web (http://www.authoritativeweb.com/)
+		'/ConveraCrawler/',
+		'/http:\/\/www\.authoritativeweb\.com\/crawl/',
+		// Majestic 12 distributed search engine (http://www.majestic12.co.uk/)
+		'/MJ12bot/',
+		'/http:\/\/majestic12\.co\.uk\/bot\.php/',
+		// Picsearch (http://www.picsearch.com/)
+		'/psbot/',
+		'/http:\/\/www\.picsearch\.com\/bot\.html/',
+		// Exalead (http://www.exalead.com/search)
+		'/Exabot/',
+		// Cazoodle (note cazoodle.com doesn't exist)
+		'/CazoodleBot Crawler/',
+		'/http:\/\/www\.cazoodle\.com/',
+		'/mqbot@cazoodle\.com/',
+		// Gigablast (http://www.gigablast.com/)
+		'/Gigabot/',
+		'/http:\/\/www\.gigablast\.com\/spider\.html/',
+		// Houxou (http://www.houxou.com/)
+		'/HouxouCrawler/',
+		'/http:\/\/www\.houxou\.com\/crawler/',
+		'/crawler at houxou dot com/',
+		// IBM Almaden Research Center Computer Science group (http://www.almaden.ibm.com/cs/)
+		'/http:\/\/www\.almaden\.ibm\.com\/cs\/crawler/',
+		// Goo? (http://help.goo.ne.jp/)
+		'/ichiro/',
+		'/http:\/\/help\.goo\.ne\.jp\/door\/crawler\.html/',
+		// Daum Communications Corp (Korea)
+		'/Edacious & Intelligent Web Robot/',
+		'/Daum Communications Corp/',
+		'/DAUM Web Robot/',
+		'/MSIE is not me/',
+		'/DAUMOA/',
+		// Girafa (http://www.girafa.com/)
+		'/[Gg]irafabot/',
+		'/girafabot at girafa dot com/',
+		'/http:\/\/www\.girafa\.com/',
+		// The Generations Network (http://www.myfamilyinc.com/)
+		'/MyFamilyBot/',
+		'/http:\/\/www\.ancestry\.com\/learn\/bot\.aspx/',
+		'/http:\/\/www\.myfamilyinc\.com/',
+		// Naver? (http://www.naver.com/)
+		'/NaverBot/',
+		'/http:\/\/help\.naver\.com\/delete_main\.asp/',
+		// WiseNut (http://www.wisenutbot.com/)
+		'/ZyBorg/',
+		'/wn-[0-9]+\.zyborg@looksmart\.net/',
+		'/http:\/\/www\.WISEnutbot\.com/',
+		// Accelobot (http://www.accelobot.com/)
+		// This one seems particularly busy!
+		'/heritrix/',
+		'/http:\/\/www\.accelobot\.com/',
+		// Seeqpod (http://www.seeqpod.com/)
+		'/seeqpod-vertical-crawler/',
+		'/http:\/\/www\.seeqpod\.com/',
+		// University of Illinois at Urbana-Champaign, Computer Science (http://www.cs.uiuc.edu/)
+		'/MQBOT Crawler/',
+		'/http:\/\/falcon\.cs\.uiuc\.edu/',
+		'/mqbot@cs\.uiuc\.edu/',
+		// Microsoft Research (http://research.microsoft.com/)
+		'/MSRBOT/',
+		'/http:\/\/research\.microsoft\.com\/research\/sv\/msrbot\//',
+		// Nusearch
+		'/Nusearch Spider/',
+		'/www\.nusearch\.com/',
+		// SourceForge (http://www.sf.net/)
+		'/nutch-agent@lists\.sourceforge\.net/',
+		// Lucene (http://lucene.apache.org/)
+		'/nutch-agent@lucene\.apache\.org/',
+		'/raphael@unterreuth\.de/',
+		// Computer Science, University of Washington (http://cs.washington.edu/)
+		'/Nutch running at UW/',
+		'/http:\/\/crawlers\.cs\.washington\.edu\//',
+		'/sycrawl@cs\.washington\.edu/',
+		// Chikayama & Taura Laboratory, University of Tokyo (http://www.logos.ic.i.u-tokyo.ac.jp/)
+		'/Shim-Crawler/',
+		'/http:\/\/www\.logos\.ic\.i\.u-tokyo\.ac\.jp\/crawler\//',
+		'/crawl@logos\.ic\.i\.u-tokyo\.ac\.jp/',
+		// Sproose (http://www.sproose.com/)
+		'/sproose bot/',
+		'/http:\/\/www\.sproose\.com\/bot\.html/',
+		'/crawler@sproose\.com/',
+		// Turnitin (http://www.turnitin.com/)
+		'/TurnitinBot/',
+		'/http:\/\/www\.turnitin\.com\/robot\/crawlerinfo\.html/',
+		// WISH Project (http://wish.slis.tsukuba.ac.jp/)
+		'/wish-project/',
+		'/http:\/\/wish\.slis\.tsukuba\.ac\.jp\//',
+		// WWWster
+		'/wwwster/',
+		'/gue@cis\.uni-muenchen\.de/',
+		// Forex Trading Network Organization (http://www.netforex.org/)
+		'/Forex Trading Network Organization/',
+		'/http:\/\/www\.netforex\.org/',
+		'/info@netforex\.org/',
+		// FunnelBack (http://www.funnelback.com/)
+		'/FunnelBack/',
+		'/http:\/\/www\.funnelback\.com\/robot\.html/',
+		// Baidu (http://www.baidu.com/)
+		'/Baiduspider/',
+		'/http:\/\/www\.baidu\.com\/search\/spider\.htm/',
+		// Brandimensions (http://www.brandimensions.com/)
+		'/BDFetch/',
+		// Blaiz Enterprises (http://www.blaiz.net/)
+		'/Blaiz-Bee/',
+		'/http:\/\/www\.blaiz\.net/',
+		// Boitho/SearchDaimon (http://www.boitho.com/ or http://www.searchdaimon.com/)
+		'/boitho\.com-dc/',
+		'/http:\/\/www\.boitho\.com\/dcbot\.html/',
+		// Celestial (OAI aggregator, see http://oai-perl.sourceforge.net/ for a little info)
+		'/Celestial/',
+		// Cipinet (http://www.cipinet.com/)
+		'/CipinetBot/',
+		'/http:\/\/www\.cipinet\.com\/bot\.html/',
+		// iVia (http://ivia.ucr.edu/)
+		'/CrawlerTest CrawlerTest/',
+		'/http:\/\/ivia\.ucr\.edu\/useragents\.shtml/',
+		// Encyclopedia of Keywords (http://keywen.com/)
+		'/EasyDL/',
+		'/http:\/\/keywen\.com\/Encyclopedia\/Bot/',
+		// Everest-Vulcan Inc. (http://everest.vulcan.com/)
+		'/Everest-Vulcan Inc/',
+		'/http:\/\/everest\.vulcan\.com\/crawlerhelp/',
+		// FactBites (http://www.factbites.com/)
+		'/Factbot/',
+		'/http:\/\/www\.factbites\.com\/webmasters\.php/',
+		// Scirus (http://www.scirus.com/)
+		'/Scirus scirus-crawler@fast\.no/',
+		'/http:\/\/www\.scirus\.com\/srsapp\/contactus\//',
+		// UOL (http://www.uol.com.br/)
+		'/UOLCrawler/',
+		'/soscrawler@uol\.com\.br/',
+		// Always Updated (http://www.updated.com/)
+		'/updated crawler/',
+		'/crawler@updated\.com/',
+		'/http:\/\/www\.updated\.com/',
+		// FAST Enterprise Search (http://www.fast.no/)
+		'/crawler@fast\.no/',
+		'/FAST MetaWeb Crawler/',
+		'/helpdesk at fastsearch dot com/',
+		// Deutsche Wortschatz Portal (http://wortschatz.uni-leipzig.de/)
+		'/findlinks/',
+		'/http:\/\/wortschatz\.uni-leipzig\.de\/findlinks\//',
+		// Gais (http://gais.cs.ccu.edu.tw/)
+		'/Gaisbot/',
+		'/robot[0-9]{2}@gais\.cs\.ccu\.edu\.tw/',
+		'/http:\/\/gais\.cs\.ccu\.edu\.tw\/robot\.php/',
+		// Ilse (http://ilse.nl/)
+		'/INGRID/',
+		'/http:\/\/webmaster\.ilse\.nl\/jsp\/webmaster\.jsp/',
+		// Krugle (http://corp.krugle.com/)
+		'/Krugle\/Krugle/',
+		'/Krugle web crawler/',
+		'/http:\/\/corp\.krugle\.com\/crawler\/info\.html/',
+		'/webcrawler@krugle\.com/',
+		// WebWobot (http://www.webwobot.com/)
+		'/ScollSpider/',
+		'/http:\/\/www\.webwobot\.com/',
+		// Omni-Explorer (http://www.omni-explorer.com/)
+		'/OmniExplorer_Bot/',
+		'/http:\/\/www\.omni-explorer\.com/',
+		'/WorldIndexer/',
+		// PageBull (http://www.pagebull.com/)
+		'/Pagebull http:\/\/www\.pagebull\.com\//',
+		// dir.com (http://dir.com/)
+		'/Pompos/',
+		'/http:\/\/dir\.com\/pompos\.html/',
+		// Sensis (http://sensis.com.au/)
+		'/Sensis Web Crawler/',
+		'/search_comments\\\\at\\\\sensis\\\\dot\\\\com\\\\dot\\\\au/',
+		// Shopwiki (http://www.shopwiki.com/)
+		'/ShopWiki/',
+		'/http:\/\/www\.shopwiki\.com\//',
+		// Guruji (http://www.terrawiz.com/)
+		'/TerrawizBot/',
+		'/http:\/\/www\.terrawiz\.com\/bot\.html/',
+		// Language Observatory Project (http://www.language-observatory.org/)
+		'/UbiCrawler/',
+		'/http:\/\/gii\.nagaokaut\.ac\.jp\/~ubi\//',
+		// Unidentified
+		'/bot@bot\.bot/', // see (http://www.webmasterworld.com/search_engine_spiders/3186855.htm)
+		'/bot\/1\.0/',
+		'/larbin/', // also larbinSpider
+		'/HTTrack/',
+		'/nicebot/',
+		'/Snapbot/',
+		'/sogou spider/',
+		'/TMCrawler/',
+		'/voyager/',
+		'/AcadiaUniversityWebCensusClient/',
+		'/BeijingCrawler/',
+		'/FeedChecker/',
+		'/g2Crawler \(?nobody@airmail\.net\)?/',
+		'/gsa-crawler/',
+		'/JobSpider_BA/',
+		'/KnowItAll\(knowitall@cs\.washington\.edu\)/',
+		'/Mediapartners-Google/',
+		'/obeys UserAgent NimbleCrawler For problems contact: crawler@healthline\.com/',
+		'/psycheclone/',
+		'/RAMPyBot - www\.giveRAMP\.com/',
+		'/Robo Crawler/',
+		'/ScSpider/',
+		'/snap\.com beta crawler v0/',
+		'/spider \(tspyyp@tom\.com\)/',
+		'/topicblogs/',
+		'/Twiceler www\.cuill\.com\/robots\.html/',
+		'/WebFilter Robot/',
+		'/WILF \(cybermetrics\.wlv\.ac\.uk\/robots\.htm\)/',
+		'/Bot,Robot,Spider,Crawler,aromano@cli\.di\.unipi\.it/',
+	),
+);
+
 ###########################################
 ##
 ## No configuration required below here.
@@ -186,9 +448,9 @@
 	while (!feof($handle)) {
 		$buffer = fgets($handle, 4096);
 		// NJS 2005-11-25 Added regexp for EPrints short URLs.
-		if ((preg_match("/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*/i",$buffer,$matches)) ||
-			(preg_match("/^(\S{1,}\.\S{1,}\.\S{1,}\.\S{1,}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*/i",$buffer,$matches)) ||
-			(preg_match("/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/(\d{1,4}).*? HTTP\/1..\" 200 .*/i",$buffer,$matches)))
+		if ((preg_match("/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*(\"[^\"]+\")$/i",$buffer,$matches)) ||
+			(preg_match("/^(\S{1,}\.\S{1,}\.\S{1,}\.\S{1,}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*(\"[^\"]+\")$/i",$buffer,$matches)) ||
+			(preg_match("/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/(\d{1,4}).*? HTTP\/1..\" 200 .*(\"[^\"]+\")$/i",$buffer,$matches)))
 		{
 			$counter++;
 			$country_code = '';
@@ -245,6 +507,30 @@
 			}
 			// end NJS 2005-12-16
 
+			// NJS 2007-01-26
+			// Check whether the user-agent (last quoted field of the log line, quotes included) is a known bot.
+			$user_agent = $matches[4];
+			$found_country = FALSE;
+			foreach ($bot_patterns as $id => $patterns)
+			{
+				foreach ($patterns as $pat)
+				{
+					if (preg_match($pat, $user_agent))
+					{
+						$found_country = TRUE;
+						break;
+					}
+				}
+
+				if ($found_country)
+				{
+					list($country_name, $country_code) = explode(',', $id);
+					break;
+				}
+			}
+			// end NJS 2007-01-26
+
+
 			$date = $matches[2];
 			$eprint_id = $matches[3];
 			$uniquebits = $buffer;