| |
---|
| | ), |
---|
| | ), |
---|
| | ); |
---|
| | |
---|
| | /* NJS 2007-01-26 |
---|
| | Patterns to match various search engine bots. Ideally, we'd use a similar |
---|
| | mechanism to the $local_IPs variable above, but this isn't feasible because |
---|
| | we'd need to know the IP ranges for the likes of Google, for example. This |
---|
| | clearly isn't possible in practice. |
---|
| | |
---|
| | Fortunately, most search bots insert a readily identifiable string into |
---|
| | the user-agent part of the HTTP request, which gets recorded in the Apache |
---|
| | log file. We can look for these and re-code log entries as appropriate. |
---|
| | |
---|
| | The format of this list is similar to that of the $local_IPs variable. The |
---|
| | key is the "country name" (in this case the name of the search engine) plus |
---|
| | an "X" ISO 3166-1 country code, separated by a comma. Each key value has an |
---|
| | associated list of corresponding regular expressions that can occur in the |
---|
| | user-agent part of the Apache log entry. If any one of these REs matches |
---|
| | the user-agent part of the log entry, then we should re-code the country |
---|
| | appropriately. |
---|
| | |
---|
| | Note that this means that several of the "X" country codes are now reserved |
---|
| | and can no longer be used in $local_IPs. |
---|
| | */ |
---|
// Search-engine bot signatures.
//
// Each key is "<display name>,<reserved X country code>"; each value is a
// list of PCRE patterns. The log-processing loop tests every pattern against
// the last quoted field of the log entry (the user-agent) and re-codes the
// entry with the name/code of the first group containing a matching pattern.
//
// All patterns are substring matches and are case-sensitive. Characters that
// are special in a regular expression (".", "(", ")", "/") must be escaped
// with a backslash when they are meant literally; an unescaped "." matches
// any character and unescaped parentheses form a group rather than matching
// the parentheses themselves.
$bot_patterns = array(
    // Google (http://www.google.com/)
    'Google,XG' => array(
        '/Googlebot/',
        '/http:\/\/www\.google\.com\/bot\.html/',
    ),
    // Windows Live Search (http://search.msn.com/)
    'Windows Live Search,XM' => array(
        '/msnbot/',
        '/http:\/\/search\.msn\.com\/msnbot\.htm/',
    ),
    // Yahoo! (http://www.yahoo.com/)
    'Yahoo!,XY' => array(
        '/Yahoo! Slurp/',
        '/YahooSeeker/',
        '/http:\/\/help\.yahoo\.com\/help\/us\/ysearch\/slurp/',
        '/yahooseeker-jp-mobile AT Yahoo!JAPAN/',
    ),
    // Ask.com (http://www.ask.com/)
    'Ask.com,XJ' => array(
        '/Ask Jeeves\/Teoma/',
        '/http:\/\/about\.ask\.com\/en\/docs\/about\/webmasters\.shtml/',
    ),
    // Everything else I could find in our log files :)
    'Other search engine,XZ' => array(
        // TAMU Internet Research Lab (http://irl.cs.tamu.edu/)
        '/http:\/\/irl\.cs\.tamu\.edu\/crawler/',
        // Alexa web search (http://www.alexa.com/)
        '/ia_archiver/',
        // TrueKnowledge for Web (http://www.authoritativeweb.com/)
        '/ConveraCrawler/',
        '/http:\/\/www\.authoritativeweb\.com\/crawl/',
        // Majestic 12 distributed search engine (http://www.majestic12.co.uk/)
        '/MJ12bot/',
        '/http:\/\/majestic12\.co\.uk\/bot\.php/',
        // Picsearch (http://www.picsearch.com/)
        '/psbot/',
        '/http:\/\/www\.picsearch\.com\/bot\.html/',
        // Exalead (http://www.exalead.com/search)
        '/Exabot/',
        // Cazoodle (note cazoodle.com doesn't exist)
        '/CazoodleBot Crawler/',
        '/http:\/\/www\.cazoodle\.com/',
        '/mqbot@cazoodle\.com/',
        // Gigablast (http://www.gigablast.com/)
        '/Gigabot/',
        '/http:\/\/www\.gigablast\.com\/spider\.html/',
        // Houxou (http://www.houxou.com/)
        '/HouxouCrawler/',
        '/http:\/\/www\.houxou\.com\/crawler/',
        '/crawler at houxou dot com/',
        // IBM Almaden Research Center Computer Science group (http://www.almaden.ibm.com/cs/)
        '/http:\/\/www\.almaden\.ibm\.com\/cs\/crawler/',
        // Goo? (http://help.goo.ne.jp/)
        '/ichiro/',
        '/http:\/\/help\.goo\.ne\.jp\/door\/crawler\.html/',
        // Daum Communications Corp (Korea)
        '/Edacious & Intelligent Web Robot/',
        '/Daum Communications Corp/',
        '/DAUM Web Robot/',
        '/MSIE is not me/',
        '/DAUMOA/',
        // Girafa (http://www.girafa.com/)
        '/[Gg]irafabot/',
        '/girafabot at girafa dot com/',
        '/http:\/\/www\.girafa\.com/',
        // The Generations Network (http://www.myfamilyinc.com/)
        '/MyFamilyBot/',
        '/http:\/\/www\.ancestry\.com\/learn\/bot\.aspx/',
        '/http:\/\/www\.myfamilyinc\.com/',
        // Naver? (http://www.naver.com/)
        '/NaverBot/',
        '/http:\/\/help\.naver\.com\/delete_main\.asp/',
        // WiseNut (http://www.wisenutbot.com/)
        '/ZyBorg/',
        '/wn-[0-9]+\.zyborg@looksmart\.net/',
        '/http:\/\/www\.WISEnutbot\.com/',
        // Accelobot (http://www.accelobot.com/)
        // This one seems particularly busy!
        '/heritrix/',
        '/http:\/\/www\.accelobot\.com/',
        // Seeqpod (http://www.seeqpod.com/)
        '/seeqpod-vertical-crawler/',
        '/http:\/\/www\.seeqpod\.com/',
        // University of Illinois at Urbana-Champaign, Computer Science (http://www.cs.uiuc.edu/)
        '/MQBOT Crawler/',
        '/http:\/\/falcon\.cs\.uiuc\.edu/',
        '/mqbot@cs\.uiuc\.edu/',
        // Microsoft Research (http://research.microsoft.com/)
        '/MSRBOT/',
        '/http:\/\/research\.microsoft\.com\/research\/sv\/msrbot\//',
        // Nusearch
        '/Nusearch Spider/',
        '/www\.nusearch\.com/',
        // SourceForge (http://www.sf.net/)
        '/nutch-agent@lists\.sourceforge\.net/',
        // Lucene (http://lucene.apache.org/)
        '/nutch-agent@lucene\.apache\.org/',
        '/raphael@unterreuth\.de/',
        // Computer Science, University of Washington (http://cs.washington.edu/)
        '/Nutch running at UW/',
        '/http:\/\/crawlers\.cs\.washington\.edu\//',
        '/sycrawl@cs\.washington\.edu/',
        // Chikayama & Taura Laboratory, University of Tokyo (http://www.logos.ic.i.u-tokyo.ac.jp/)
        '/Shim-Crawler/',
        '/http:\/\/www\.logos\.ic\.i\.u-tokyo\.ac\.jp\/crawler\//',
        '/crawl@logos\.ic\.i\.u-tokyo\.ac\.jp/',
        // Sproose (http://www.sproose.com/)
        '/sproose bot/',
        '/http:\/\/www\.sproose\.com\/bot\.html/',
        '/crawler@sproose\.com/',
        // Turnitin (http://www.turnitin.com/)
        '/TurnitinBot/',
        '/http:\/\/www\.turnitin\.com\/robot\/crawlerinfo\.html/',
        // WISH Project (http://wish.slis.tsukuba.ac.jp/)
        '/wish-project/',
        '/http:\/\/wish\.slis\.tsukuba\.ac\.jp\//',
        // WWWster
        '/wwwster/',
        '/gue@cis\.uni-muenchen\.de/',
        // Forex Trading Network Organization (http://www.netforex.org/)
        '/Forex Trading Network Organization/',
        '/http:\/\/www\.netforex\.org/',
        '/info@netforex\.org/',
        // FunnelBack (http://www.funnelback.com/)
        '/FunnelBack/',
        '/http:\/\/www\.funnelback\.com\/robot\.html/',
        // Baidu (http://www.baidu.com/)
        '/Baiduspider/',
        '/http:\/\/www\.baidu\.com\/search\/spider\.htm/',
        // Brandimensions (http://www.brandimensions.com/)
        '/BDFetch/',
        // Blaiz Enterprises (http://www.blaiz.net/)
        '/Blaiz-Bee/',
        '/http:\/\/www\.blaiz\.net/',
        // Boitho/SearchDaimon (http://www.boitho.com/ or http://www.searchdaimon.com/)
        '/boitho\.com-dc/',
        '/http:\/\/www\.boitho\.com\/dcbot\.html/',
        // Celestial (OAI aggregator, see http://oai-perl.sourceforge.net/ for a little info)
        '/Celestial/',
        // Cipinet (http://www.cipinet.com/)
        '/CipinetBot/',
        '/http:\/\/www\.cipinet\.com\/bot\.html/',
        // iVia (http://ivia.ucr.edu/)
        '/CrawlerTest CrawlerTest/',
        '/http:\/\/ivia\.ucr\.edu\/useragents\.shtml/',
        // Encyclopedia of Keywords (http://keywen.com/)
        '/EasyDL/',
        '/http:\/\/keywen\.com\/Encyclopedia\/Bot/',
        // Everest-Vulcan Inc. (http://everest.vulcan.com/)
        '/Everest-Vulcan Inc/',
        '/http:\/\/everest\.vulcan\.com\/crawlerhelp/',
        // FactBites (http://www.factbites.com/)
        '/Factbot/',
        '/http:\/\/www\.factbites\.com\/webmasters\.php/',
        // Scirus (http://www.scirus.com/)
        '/Scirus scirus-crawler@fast\.no/',
        '/http:\/\/www\.scirus\.com\/srsapp\/contactus\//',
        // UOL (http://www.uol.com.br/)
        '/UOLCrawler/',
        '/soscrawler@uol\.com\.br/',
        // Always Updated (http://www.updated.com/)
        '/updated crawler/',
        '/crawler@updated\.com/',
        '/http:\/\/www\.updated\.com/',
        // FAST Enterprise Search (http://www.fast.no/)
        '/crawler@fast\.no/',
        '/FAST MetaWeb Crawler/',
        '/helpdesk at fastsearch dot com/',
        // Deutsche Wortschatz Portal (http://wortschatz.uni-leipzig.de/)
        '/findlinks/',
        '/http:\/\/wortschatz\.uni-leipzig\.de\/findlinks\//',
        // Gais (http://gais.cs.ccu.edu.tw/)
        '/Gaisbot/',
        '/robot[0-9]{2}@gais\.cs\.ccu\.edu\.tw/',
        '/http:\/\/gais\.cs\.ccu\.edu\.tw\/robot\.php/',
        // Ilse (http://ilse.nl/)
        '/INGRID/',
        '/http:\/\/webmaster\.ilse\.nl\/jsp\/webmaster\.jsp/',
        // Krugle (http://corp.krugle.com/)
        '/Krugle\/Krugle/',
        '/Krugle web crawler/',
        '/http:\/\/corp\.krugle\.com\/crawler\/info\.html/',
        '/webcrawler@krugle\.com/',
        // WebWobot (http://www.webwobot.com/)
        '/ScollSpider/',
        '/http:\/\/www\.webwobot\.com/',
        // Omni-Explorer (http://www.omni-explorer.com/)
        '/OmniExplorer_Bot/',
        '/http:\/\/www\.omni-explorer\.com/',
        '/WorldIndexer/',
        // PageBull (http://www.pagebull.com/)
        '/Pagebull http:\/\/www\.pagebull\.com\//',
        // dir.com (http://dir.com/)
        '/Pompos/',
        '/http:\/\/dir\.com\/pompos\.html/',
        // Sensis (http://sensis.com.au/)
        '/Sensis Web Crawler/',
        // Four backslashes in the PHP source become one literal backslash
        // in the text being matched.
        '/search_comments\\\\at\\\\sensis\\\\dot\\\\com\\\\dot\\\\au/',
        // Shopwiki (http://www.shopwiki.com/)
        '/ShopWiki/',
        '/http:\/\/www\.shopwiki\.com\//',
        // Guruji (http://www.terrawiz.com/)
        '/TerrawizBot/',
        '/http:\/\/www\.terrawiz\.com\/bot\.html/',
        // Language Observatory Project (http://www.language-observatory.org/)
        '/UbiCrawler/',
        '/http:\/\/gii\.nagaokaut\.ac\.jp\/~ubi\//',
        // Unidentified
        '/bot@bot\.bot/', // see (http://www.webmasterworld.com/search_engine_spiders/3186855.htm)
        '/bot\/1\.0/',
        '/larbin/', // also larbinSpider
        '/HTTrack/',
        '/nicebot/',
        '/Snapbot/',
        '/sogou spider/',
        '/TMCrawler/',
        '/voyager/',
        '/AcadiaUniversityWebCensusClient/',
        '/BeijingCrawler/',
        '/FeedChecker/',
        '/g2Crawler \(?nobody@airmail\.net\)?/',
        '/gsa-crawler/',
        '/JobSpider_BA/',
        '/KnowItAll\(knowitall@cs\.washington\.edu\)/',
        '/Mediapartners-Google/',
        '/obeys UserAgent NimbleCrawler For problems contact: crawler@healthline\.com/',
        '/psycheclone/',
        '/RAMPyBot - www\.giveRAMP\.com/',
        '/Robo Crawler/',
        '/ScSpider/',
        '/snap\.com beta crawler v0/',
        // Parentheses are escaped and optional so the address is recognised
        // both with and without the surrounding brackets.
        '/spider \(?tspyyp@tom\.com\)?/',
        '/topicblogs/',
        '/Twiceler www\.cuill\.com\/robots\.html/',
        '/WebFilter Robot/',
        '/WILF \(cybermetrics\.wlv\.ac\.uk\/robots\.htm\)/',
        '/Bot,Robot,Spider,Crawler,aromano@cli\.di\.unipi\.it/',
    ),
);
---|
| | |
---|
| | ########################################### |
---|
| | ## |
---|
| | ## No configuration required below here. |
---|
| | ## |
---|
| |
---|
| | $handle = fopen($logf, "r"); |
---|
| | while (!feof($handle)) { |
---|
| | $buffer = fgets($handle, 4096); |
---|
| | // NJS 2005-11-25 Added regexp for EPrints short URLs. |
---|
| | if ((preg_match("/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*/i",$buffer,$matches)) || |
---|
| | (preg_match("/^(\S{1,}\.\S{1,}\.\S{1,}\.\S{1,}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*/i",$buffer,$matches)) || |
---|
| | (preg_match("/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/(\d{1,4}).*? HTTP\/1..\" 200 .*/i",$buffer,$matches))) |
---|
| | if ((preg_match("/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*(\"[^\"]+\")$/i",$buffer,$matches)) || |
---|
| | (preg_match("/^(\S{1,}\.\S{1,}\.\S{1,}\.\S{1,}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*(\"[^\"]+\")$/i",$buffer,$matches)) || |
---|
| | (preg_match("/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/(\d{1,4}).*? HTTP\/1..\" 200 .*(\"[^\"]+\")$/i",$buffer,$matches))) |
---|
| | { |
---|
| | $counter++; |
---|
| | $country_code = ''; |
---|
| | $country_name = ''; |
---|
| |
---|
| | $country_name = geoip_country_name_by_addr($gi, $ip); |
---|
| | } |
---|
| | // end NJS 2005-12-16 |
---|
| | |
---|
| | // NJS 2007-01-26 |
---|
| | // Check whether this is a bot reference. |
---|
| | $referer = $matches[4]; |
---|
| | $found_country = FALSE; |
---|
| | foreach ($bot_patterns as $id => $patterns) |
---|
| | { |
---|
| | foreach ($patterns as $pat) |
---|
| | { |
---|
| | if (preg_match($pat, $referer, $matches2)) |
---|
| | { |
---|
| | $found_country = TRUE; |
---|
| | break; |
---|
| | } |
---|
| | } |
---|
| | |
---|
| | if ($found_country) |
---|
| | { |
---|
| | list($country_name, $country_code) = explode(',', $id); |
---|
| | break; |
---|
| | } |
---|
| | } |
---|
| | // end NJS 2007-01-26 |
---|
| | |
---|
| | |
---|
| | $date = $matches[2]; |
---|
| | $eprint_id = $matches[3]; |
---|
| | $uniquebits = $buffer; |
---|
| | /* NJS 2006-04-28 |
---|
| |
---|
|