diff --git a/Repositories/statistics/scripts/eprints-usage_src.php b/Repositories/statistics/scripts/eprints-usage_src.php index 4ef8a31..cd46d81 100755 --- a/Repositories/statistics/scripts/eprints-usage_src.php +++ b/Repositories/statistics/scripts/eprints-usage_src.php @@ -197,238 +197,192 @@ the user-agent part of the HTTP response, which gets recorded in the Apache log file. We can look for these and re-code log entries as appropriate. -The format of this list is similar to that of the $local_IPs variable. The -key is the "country name" (in this case the name of the search engine) plus -an "X" ISO 3166-1 country code, separated by a comma. Each key value has an -associated list of corresponding regular expressions that can occur in the -user-agent part of the Apache log entry. If any one of these REs matches -the user-agent part of the log entry, then we should re-code the country -appropriately. +The format of this list is similar to that of the $local_IPs variable. +The key is the "country name" (in this case the name of the search +engine) plus a non-standard four-character country code starting with +"X@", separated by a comma. Each key value has an associated list of +corresponding regular expressions that can occur in the user-agent part +of the Apache log entry. If any one of these REs matches the user-agent +part of the log entry, then we should re-code the country appropriately. -Note that this means that several of the "X" country codes are now reserved -and can no longer be used in $local_IPs. +A four-character code is used because that is what the database allows, and +it avoids having to reserve several of the "X" country codes for search +engines. */ $bot_patterns = array( - // Google (http://www.google.com/) - 'Google,XG' => array( - '/Googlebot/', - '/http:\/\/www\.google\.com\/bot\.html/', + // Yahoo! (http://www.yahoo.com/) + 'Yahoo!,X@YH' => array( + '/yahoo! 
slurp/i', + '/yahooseeker/i', ), // Windows Live Search (http://search.msn.com/) - 'Windows Live Search,XM' => array( - '/msnbot/', - '/http:\/\/search\.msn\.com\/msnbot\.htm/', + 'Windows Live Search,X@MS' => array( + '/msnbot/i', ), - // Yahoo! (http://www.yahoo.com/) - 'Yahoo!,XY' => array( - '/Yahoo! Slurp/', - '/YahooSeeker/', - '/http:\/\/help\.yahoo\.com\/help\/us\/ysearch\/slurp/', - '/yahooseeker-jp-mobile AT Yahoo!JAPAN/', + // Google (http://www.google.com/) + 'Google,X@GG' => array( + '/googlebot/i', ), // Ask.com (http://www.ask.com/) - 'Ask.com,XJ' => array( - '/Ask Jeeves\/Teoma/', - '/http:\/\/about\.ask\.com\/en\/docs\/about\/webmasters\.shtml/', + 'Ask.com,X@AC' => array( + '/ask jeeves\/teoma/i', ), // Everything else I could find in our log files :) - 'Other search engine,XZ' => array( + 'Other search engine,X@OS' => array( // TAMU Internet Research Lab (http://irl.cs.tamu.edu/) - '/http:\/\/irl\.cs\.tamu\.edu\/crawler/', + '/http:\/\/irl\.cs\.tamu\.edu\/crawler/i', // Alexa web search (http://www.alexa.com/) - '/ia_archiver/', + '/ia_archiver/i', // TrueKnowledge for Web (http://www.authoritativeweb.com/) - '/ConveraCrawler/', - '/http:\/\/www\.authoritativeweb\.com\/crawl/', + '/converacrawler/i', // Majestic 12 distributed search engine (http://www.majestic12.co.uk/) - '/MJ12bot/', - '/http:\/\/majestic12\.co\.uk\/bot\.php/', + '/mj12bot/i', // Picsearch (http://www.picsearch.com/) - '/psbot/', - '/http:\/\/www\.picsearch\.com\/bot\.html/', + '/psbot/i', // Exalead (http://www.exalead.com/search) - '/Exabot/', + '/exabot/i', // Cazoodle (note cazoodle.com doesn't exist) - '/CazoodleBot Crawler/', - '/http:\/\/www\.cazoodle\.com/', - '/mqbot@cazoodle\.com/', + '/cazoodlebot crawler/i', + '/mqbot@cazoodle\.com/i', // Gigablast (http://www.gigablast.com/) - '/Gigabot/', - '/http:\/\/www\.gigablast\.com\/spider\.html/', + '/gigabot/i', // Houxou (http://www.houxou.com/) - '/HouxouCrawler/', - '/http:\/\/www\.houxou\.com\/crawler/', - '/crawler at 
houxou dot com/', + '/houxoucrawler/i', + '/crawler at houxou dot com/i', // IBM Almaden Research Center Computer Science group (http://www.almaden.ibm.com/cs/) - '/http:\/\/www\.almaden\.ibm\.com\/cs\/crawler/', + '/http:\/\/www\.almaden\.ibm\.com\/cs\/crawler/i', // Goo? (http://help.goo.ne.jp/) - '/ichiro/', - '/http:\/\/help\.goo\.ne\.jp\/door\/crawler\.html/', + '/ichiro/i', // Daum Communications Corp (Korea) - '/Edacious & Intelligent Web Robot/', - '/Daum Communications Corp/', - '/DAUM Web Robot/', - '/MSIE is not me/', - '/DAUMOA/', + '/edacious & intelligent web robot/i', + '/daum communications corp/i', + '/daum web robot/i', + '/msie is not me/i', + '/daumoa/i', // Girafa (http://www.girafa.com/) - '/[Gg]irafabot/', - '/girafabot at girafa dot com/', - '/http:\/\/www\.girafa\.com/', + '/girafabot/i', // The Generations Network (http://www.myfamilyinc.com/) - '/MyFamilyBot/', - '/http:\/\/www\.ancestry\.com\/learn\/bot\.aspx/', - '/http:\/\/www\.myfamilyinc\.com/', + '/myfamilybot/i', // Naver? (http://www.naver.com/) - '/NaverBot/', - '/http:\/\/help\.naver\.com\/delete_main\.asp/', + '/naverbot/i', // WiseNut (http://www.wisenutbot.com/) - '/ZyBorg/', - '/wn-[0-9]+\.zyborg@looksmart\.net/', - '/http:\/\/www\.WISEnutbot\.com/', + '/zyborg/i', + '/wn-[0-9]+\.zyborg@looksmart\.net/i', // Accelobot (http://www.accelobot.com/) // This one seems particularly busy! 
- '/heritrix/', - '/http:\/\/www\.accelobot\.com/', + '/heritrix/i', // Seeqpod (http://www.seeqpod.com/) - '/seeqpod-vertical-crawler/', - '/http:\/\/www\.seeqpod\.com/', + '/seeqpod-vertical-crawler/i', // University of Illinois at Urbana-Champaign, Computer Science (http://www.cs.uiuc.edu/) - '/MQBOT Crawler/', - '/http:\/\/falcon\.cs\.uiuc\.edu/', - '/mqbot@cs\.uiuc\.edu/', + '/mqbot crawler/i', + '/mqbot@cs\.uiuc\.edu/i', // Microsoft Research (http://research.microsoft.com/) - '/MSRBOT/', - '/http:\/\/research\.microsoft\.com\/research\/sv\/msrbot\//', + '/msrbot/i', // Nusearch - '/Nusearch Spider/', - '/www\.nusearch\.com/', + '/nusearch spider/i', // SourceForge (http://www.sf.net/) - '/nutch-agent@lists\.sourceforge\.net/', + '/nutch-agent@lists\.sourceforge\.net/i', // Lucene (http://lucene.apache.org/) - '/nutch-agent@lucene\.apache\.org/', - '/raphael@unterreuth.de/', + '/nutch-agent@lucene\.apache\.org/i', + '/raphael@unterreuth.de/i', // Computer Science, University of Washington (http://cs.washington.edu/) - '/Nutch running at UW/', - '/http:\/\/crawlers\.cs\.washington\.edu\//', - '/sycrawl@cs\.washington\.edu/', + '/nutch running at uw/i', + '/sycrawl@cs\.washington\.edu/i', // Chikayama & Taura Laboratory, University of Tokyo (http://www.logos.ic.i.u-tokyo.ac.jp/) - '/Shim-Crawler/', - '/http:\/\/www\.logos\.ic\.i\.u-tokyo\.ac\.jp\/crawler\//', - '/crawl@logos\.ic\.i\.u-tokyo\.ac\.jp/', + '/shim-crawler/i', + '/crawl@logos\.ic\.i\.u-tokyo\.ac\.jp/i', // Sproose (http://www.sproose.com/) - '/sproose bot/', - '/http:\/\/www\.sproose\.com\/bot\.html/', - '/crawler@sproose\.com/', + '/sproose bot/i', + '/crawler@sproose\.com/i', // Turnitin (http://www.turnitin.com/) - '/TurnitinBot/', - '/http:\/\/www\.turnitin\.com\/robot\/crawlerinfo\.html/', + '/turnitinbot/i', // WISH Project (http://wish.slis.tsukuba.ac.jp/) - '/wish-project/', - '/http:\/\/wish\.slis\.tsukuba\.ac\.jp\//', + '/wish-project/i', // WWWster - '/wwwster/', - 
'/gue@cis\.uni-muenchen\.de/', + '/wwwster/i', + '/gue@cis\.uni-muenchen\.de/i', // Forex Trading Network Organization (http://www.netforex.org/) - '/Forex Trading Network Organization/', - '/http:\/\/www\.netforex\.org/', - '/info@netforex\.org/', + '/forex trading network organization/i', + '/info@netforex\.org/i', // FunnelBack (http://www.funnelback.com/) - '/FunnelBack/', - '/http:\/\/www\.funnelback\.com\/robot\.html/', + '/funnelback/i', // Baidu (http://www.baidu.com/) - '/Baiduspider/', - '/http:\/\/www\.baidu\.com\/search\/spider\.htm/', + '/baiduspider/i', // Brandimensions (http://www.brandimensions.com/) - '/BDFetch/', + '/bdfetch/i', // Blaiz Enterprises (http://www.blaiz.net/) - '/Blaiz-Bee/', - '/http:\/\/www\.blaiz\.net/', + '/blaiz-bee/i', // Boitho/SearchDaimon (http://www.boitho.com/ or http://www.searchdaimon.com/) - '/boitho\.com-dc/', - '/http:\/\/www\.boitho\.com\/dcbot\.html/', + '/boitho\.com-dc/i', // Celestial (OAI aggregator, see http://oai-perl.sourceforge.net/ for a little info) - '/Celestial/', + '/celestial/i', // Cipinet (http://www.cipinet.com/) - '/CipinetBot/', - '/http:\/\/www\.cipinet\.com\/bot\.html/', + '/cipinetbot/i', // iVia (http://ivia.ucr.edu/) - '/CrawlerTest CrawlerTest/', - '/http:\/\/ivia\.ucr\.edu\/useragents\.shtml/', + '/crawlertest crawlertest/i', // Encyclopedia of Keywords (http://keywen.com/) - '/EasyDL/', - '/http:\/\/keywen\.com\/Encyclopedia\/Bot/', + '/easydl/i', // Everest-Vulcan Inc. 
(http://everest.vulcan.com/) - '/Everest-Vulcan Inc/', - '/http:\/\/everest\.vulcan\.com\/crawlerhelp/', + '/everest-vulcan inc/i', // FactBites (http://www.factbites.com/) - '/Factbot/', - '/http:\/\/www\.factbites\.com\/webmasters\.php/', + '/factbot/i', // Scirus (http://www.scirus.com/) - '/Scirus scirus-crawler@fast\.no/', - '/http:\/\/www\.scirus\.com\/srsapp\/contactus\//', + '/scirus scirus-crawler@fast\.no/i', // UOL (http://www.uol.com.br/) - '/UOLCrawler/', - '/soscrawler@uol\.com\.br/', + '/uolcrawler/i', + '/soscrawler@uol\.com\.br/i', // Always Updated (http://www.updated.com/) - '/updated crawler/', - '/crawler@updated\.com/', - '/http:\/\/www\.updated\.com/', + '/updated crawler/i', + '/crawler@updated\.com/i', // FAST Enterprise Search (http://www.fast.no/) - '/crawler@fast\.no/', - '/FAST MetaWeb Crawler/', - '/helpdesk at fastsearch dot com/', + '/fast metaweb crawler/i', + '/crawler@fast\.no/i', + '/helpdesk at fastsearch dot com/i', // Deutsche Wortschatz Portal (http://wortschatz.uni-leipzig.de/) - '/findlinks/', - '/http:\/\/wortschatz\.uni-leipzig\.de\/findlinks\//', + '/findlinks/i', // Gais (http://gais.cs.ccu.edu.tw/) - '/Gaisbot/', - '/robot[0-9]{2}@gais.cs.ccu.edu.tw/', - '/http:\/\/gais\.cs\.ccu\.edu\.tw\/robot\.php/', + '/gaisbot/i', + '/robot[0-9]{2}@gais.cs.ccu.edu.tw/i', // http://ilse.net/ - '/INGRID/', - '/http:\/\/webmaster\.ilse\.nl\/jsp\/webmaster\.jsp/', + '/ingrid/i', // Krugle (http://corp.krugle.com/) - '/Krugle\/Krugle/', - '/Krugle web crawler/', - '/http:\/\/corp\.krugle\.com\/crawler\/info\.html/', - '/webcrawler@krugle\.com/', + '/krugle\/krugle/i', + '/krugle web crawler/i', + '/webcrawler@krugle\.com/i', // WebWobot (http://www.webwobot.com/) - '/ScollSpider/', - '/http:\/\/www\.webwobot\.com/', + '/scollspider/i', // Omni-Explorer (http://www.omni-explorer.com/) - '/OmniExplorer_Bot/', - '/http:\/\/www\.omni-explorer\.com/', - '/WorldIndexer/', + '/omniexplorer_bot/i', + '/worldindexer/i', // PageBull 
(http://www.pagebull.com/) - '/Pagebull http:\/\/www\.pagebull\.com\//', + '/pagebull http:\/\/www\.pagebull\.com\//i', // dir.com (http://dir.com/) - '/Pompos/', - '/http:\/\/dir\.com\/pompos\.html/', + '/pompos/i', // Sensis (http://sensis.com.au/) - '/Sensis Web Crawler/', - '/search_comments\\\\at\\\\sensis\\\\dot\\\\com\\\\dot\\\\au/', + '/sensis web crawler/i', + '/search_comments\\\\at\\\\sensis\\\\dot\\\\com\\\\dot\\\\au/i', // Shopwiki (http://www.shopwiki.com/) - '/ShopWiki/', - '/http:\/\/www\.shopwiki\.com\//', + '/shopwiki/i', // Guruji (http://www.terrawiz.com/) - '/TerrawizBot/', - '/http:\/\/www\.terrawiz\.com\/bot\.html/', + '/terrawizbot/i', // Language Observatory Project (http://www.language-observatory.org/) - '/UbiCrawler/', - '/http:\/\/gii\.nagaokaut\.ac\.jp\/~ubi\//', + '/ubicrawler/i', + // MSIE offline bookmarks crawler + '/msiecrawler/i', // Unidentified - '/[Bb]ot/', - '/[Cc]rawler/', - '/[Ss]pider/', - '/larbin/', // also larbinSpider - '/HTTrack/', - '/voyager/', - '/AcadiaUniversityWebCensusClient/', - '/FeedChecker/', - '/KnowItAll\(knowitall@cs\.washington\.edu\)/', - '/Mediapartners-Google/', - '/psycheclone/', - '/topicblogs/', + '/bot/i', + '/crawler/i', + '/spider/i', + '/larbin/i', // also larbinSpider + '/httrack/i', + '/voyager/i', + '/acadiauniversitywebcensusclient/i', + '/feedchecker/i', + '/knowitall\(knowitall@cs\.washington\.edu\)/i', + '/mediapartners-google/i', + '/psycheclone/i', + '/topicblogs/i', + '/nutch/i', ), ); @@ -474,13 +428,36 @@ $handle = fopen($logf, "r"); while (!feof($handle)) { $buffer = fgets($handle, 4096); - // NJS 2005-11-25 Added regexp for EPrints short URLs. - // NJS 2007-01-26 Added user-agent match to all regexps to enable bot detection. - // NJS 2007-01-29 Added missing regexp for EPrints short URLs with domain names rather than IP addresses. - if ((preg_match("/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? 
HTTP\/1..\" 200 .*?(\"[^\"]+\")?$/i",$buffer,$matches)) || - (preg_match("/^(\S{1,}\.\S{1,}\.\S{1,}\.\S{1,}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*?(\"[^\"]+\")?$/i",$buffer,$matches)) || - (preg_match("/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/(\d{1,4}).*? HTTP\/1..\" 200 .*?(\"[^\"]+\")?$/i",$buffer,$matches)) || - (preg_match("/^(\S{1,}\.\S{1,}\.\S{1,}\.\S{1,}) - - \[(.*?)\] \"GET \/(\d{1,4}).*? HTTP\/1..\" 200 .*?(\"[^\"]+\")?$/i",$buffer,$matches))) + /* NJS 2007-01-26 + Added user-agent match to all regexps to enable bot detection. + + NJS 2007-01-31 + Refactored regexps from four down to one, after realising + that (a) long EPrints URLs are a superset of the short ones, + and (b) a regexp that matches domain names works just as well + for IP addresses (the GeoIP lookup doesn't care which it + gets). Also fixed the pattern so it can handle an arbitrary + number of subdomains. Note that the latter would be the main + argument for keeping a separate IP address pattern, as IP + addresses always comprise exactly four parts. However, it's + not really up to the script to verify IP addresses; Apache + should be recording them correctly in the first place! + + The typical kinds of strings we are matching look something + like this: + + fetch abstract (short, long): + 168.192.1.1 - - [31/Jan/2007:09:15:36 +1300] "GET /1/ HTTP/1.1" 200 12345 "referer" "user-agent" + 168.192.1.1 - - [31/Jan/2007:09:15:36 +1300] "GET /archive/00000001/ HTTP/1.1" 200 12345 "referer" "user-agent" + + download item (short, long): + 168.192.1.1 - - [31/Jan/2007:09:15:37 +1300] "GET /1/01/foo.pdf HTTP/1.1" 200 12345 "referer" "user-agent" + 168.192.1.1 - - [31/Jan/2007:09:15:37 +1300] "GET /archive/00000001/01/foo.pdf HTTP/1.1" 200 12345 "referer" "user-agent" + + Plus any of the above with a domain name substituted for the IP + address (e.g., foo.bar.com instead of 168.192.1.1). 
+ */ + if (preg_match("/^(\S+(?:\.\S+)+) - - \[(.*?)\] \"GET \/(?:archive\/0{1,8})?(\d{1,4}).*? HTTP\/1..\" 200 .*?(\"[^\"]+\")?$/i",$buffer,$matches)) { $counter++; $country_code = ''; @@ -572,7 +549,7 @@ { foreach ($patterns as $pat) { - if (preg_match($pat, $user_agent, $matches2)) + if (preg_match($pat, $user_agent)) { $found_country = TRUE; break; @@ -592,7 +569,8 @@ $uniquebits = $buffer; // NJS 2005-11-25 Added regexp for EPrints short URLs. - if(preg_match("/GET \/archive\/0{1,8}\d{1,4}\/\d\d\//i",$buffer) || preg_match("/GET \/\d{1,4}\/\d\d\//i",$buffer)) { + // NJS 2007-01-31 Refactored into one regexp for both styles. + if (preg_match("/GET \/(?:archive\/0{1,8})?\d{1,4}\/\d\d\//i",$buffer)) { $view_type = "download"; } else { $view_type = "abstract"; @@ -633,7 +611,7 @@ /* Keep track of where we are. Should avoid duplication of results - if the script is run more than once on the same log file + if the script is run more than once on the same log file. */ // NJS 2006-04-28 Switched value inserted to $start_time instead of $request_date.