    Fortunately, most search bots insert a readily identifiable string into
    the user-agent part of the HTTP request, which gets recorded in the
    Apache log file. We can look for these and re-code log entries as
    appropriate.

    The format of this list is similar to that of the $local_IPs variable.
    The key is the "country name" (in this case the name of the search
    engine) plus a non-standard four-character country code starting with
    "X@", separated by a comma. Each key value has an associated list of
    corresponding regular expressions that can occur in the user-agent part
    of the Apache log entry. If any one of these REs matches the user-agent
    part of the log entry, then we should re-code the country appropriately.

    A four-character code is used because that is what the database allows,
    and it avoids having to reserve several of the "X" country codes for
    search engines.
*/
---|
// Map of 'search engine name,X@cc' => list of case-insensitive regexps.
// A log entry whose user-agent field matches ANY regexp in a list is
// re-coded to that entry's non-standard four-character "X@" country code.
// NOTE(review): reconstructed from a mangled two-version diff; the newer
// version (four-character X@ codes, case-insensitive patterns) is kept and
// the superseded two-letter-code/case-sensitive duplicates are dropped.
$bot_patterns = array(
    // Yahoo! (http://www.yahoo.com/)
    'Yahoo!,X@YH' => array(
        '/yahoo! slurp/i',
        '/yahooseeker/i',
    ),
    // Windows Live Search (http://search.msn.com/)
    'Windows Live Search,X@MS' => array(
        '/msnbot/i',
    ),
    // Google (http://www.google.com/)
    'Google,X@GG' => array(
        '/googlebot/i',
    ),
    // Ask.com (http://www.ask.com/)
    'Ask.com,X@AC' => array(
        '/ask jeeves\/teoma/i',
    ),
    // Everything else I could find in our log files :)
    'Other search engine,X@OS' => array(
        // TAMU Internet Research Lab (http://irl.cs.tamu.edu/)
        '/http:\/\/irl\.cs\.tamu\.edu\/crawler/i',
        // Alexa web search (http://www.alexa.com/)
        '/ia_archiver/i',
        // TrueKnowledge for Web (http://www.authoritativeweb.com/)
        '/converacrawler/i',
        // Majestic 12 distributed search engine (http://www.majestic12.co.uk/)
        '/mj12bot/i',
        // Picsearch (http://www.picsearch.com/)
        '/psbot/i',
        // Exalead (http://www.exalead.com/search)
        '/exabot/i',
        // Cazoodle (note cazoodle.com doesn't exist)
        '/cazoodlebot crawler/i',
        '/mqbot@cazoodle\.com/i',
        // Gigablast (http://www.gigablast.com/)
        '/gigabot/i',
        // Houxou (http://www.houxou.com/)
        '/houxoucrawler/i',
        '/crawler at houxou dot com/i',
        // IBM Almaden Research Center Computer Science group (http://www.almaden.ibm.com/cs/)
        '/http:\/\/www\.almaden\.ibm\.com\/cs\/crawler/i',
        // Goo? (http://help.goo.ne.jp/)
        '/ichiro/i',
        // Daum Communications Corp (Korea)
        '/edacious & intelligent web robot/i',
        '/daum communications corp/i',
        '/daum web robot/i',
        '/msie is not me/i',
        '/daumoa/i',
        // Girafa (http://www.girafa.com/)
        '/girafabot/i',
        // The Generations Network (http://www.myfamilyinc.com/)
        '/myfamilybot/i',
        // Naver? (http://www.naver.com/)
        '/naverbot/i',
        // WiseNut (http://www.wisenutbot.com/)
        '/zyborg/i',
        '/wn-[0-9]+\.zyborg@looksmart\.net/i',
        // Accelobot (http://www.accelobot.com/)
        // This one seems particularly busy!
        '/heritrix/i',
        // Seeqpod (http://www.seeqpod.com/)
        '/seeqpod-vertical-crawler/i',
        // University of Illinois at Urbana-Champaign, Computer Science (http://www.cs.uiuc.edu/)
        '/mqbot crawler/i',
        '/mqbot@cs\.uiuc\.edu/i',
        // Microsoft Research (http://research.microsoft.com/)
        '/msrbot/i',
        // Nusearch
        '/nusearch spider/i',
        // SourceForge (http://www.sf.net/)
        '/nutch-agent@lists\.sourceforge\.net/i',
        // Lucene (http://lucene.apache.org/)
        '/nutch-agent@lucene\.apache\.org/i',
        '/raphael@unterreuth.de/i',
        // Computer Science, University of Washington (http://cs.washington.edu/)
        '/nutch running at uw/i',
        '/sycrawl@cs\.washington\.edu/i',
        // Chikayama & Taura Laboratory, University of Tokyo (http://www.logos.ic.i.u-tokyo.ac.jp/)
        '/shim-crawler/i',
        '/crawl@logos\.ic\.i\.u-tokyo\.ac\.jp/i',
        // Sproose (http://www.sproose.com/)
        '/sproose bot/i',
        '/crawler@sproose\.com/i',
        // Turnitin (http://www.turnitin.com/)
        '/turnitinbot/i',
        // WISH Project (http://wish.slis.tsukuba.ac.jp/)
        '/wish-project/i',
        // WWWster
        '/wwwster/i',
        '/gue@cis\.uni-muenchen\.de/i',
        // Forex Trading Network Organization (http://www.netforex.org/)
        '/forex trading network organization/i',
        '/info@netforex\.org/i',
        // FunnelBack (http://www.funnelback.com/)
        '/funnelback/i',
        // Baidu (http://www.baidu.com/)
        '/baiduspider/i',
        // Brandimensions (http://www.brandimensions.com/)
        '/bdfetch/i',
        // Blaiz Enterprises (http://www.blaiz.net/)
        '/blaiz-bee/i',
        // Boitho/SearchDaimon (http://www.boitho.com/ or http://www.searchdaimon.com/)
        '/boitho\.com-dc/i',
        // Celestial (OAI aggregator, see http://oai-perl.sourceforge.net/ for a little info)
        '/celestial/i',
        // Cipinet (http://www.cipinet.com/)
        '/cipinetbot/i',
        // iVia (http://ivia.ucr.edu/)
        '/crawlertest crawlertest/i',
        // Encyclopedia of Keywords (http://keywen.com/)
        '/easydl/i',
        // Everest-Vulcan Inc. (http://everest.vulcan.com/)
        '/everest-vulcan inc/i',
        // FactBites (http://www.factbites.com/)
        '/factbot/i',
        // Scirus (http://www.scirus.com/)
        '/scirus scirus-crawler@fast\.no/i',
        // UOL (http://www.uol.com.br/)
        '/uolcrawler/i',
        '/soscrawler@uol\.com\.br/i',
        // Always Updated (http://www.updated.com/)
        '/updated crawler/i',
        '/crawler@updated\.com/i',
        // FAST Enterprise Search (http://www.fast.no/)
        '/fast metaweb crawler/i',
        '/crawler@fast\.no/i',
        '/helpdesk at fastsearch dot com/i',
        // Deutsche Wortschatz Portal (http://wortschatz.uni-leipzig.de/)
        '/findlinks/i',
        // Gais (http://gais.cs.ccu.edu.tw/)
        '/gaisbot/i',
        '/robot[0-9]{2}@gais.cs.ccu.edu.tw/i',
        // http://ilse.net/
        '/ingrid/i',
        // Krugle (http://corp.krugle.com/)
        '/krugle\/krugle/i',
        '/krugle web crawler/i',
        '/webcrawler@krugle\.com/i',
        // WebWobot (http://www.webwobot.com/)
        '/scollspider/i',
        // Omni-Explorer (http://www.omni-explorer.com/)
        '/omniexplorer_bot/i',
        '/worldindexer/i',
        // PageBull (http://www.pagebull.com/)
        '/pagebull http:\/\/www\.pagebull\.com\//i',
        // dir.com (http://dir.com/)
        '/pompos/i',
        // Sensis (http://sensis.com.au/)
        '/sensis web crawler/i',
        '/search_comments\\\\at\\\\sensis\\\\dot\\\\com\\\\dot\\\\au/i',
        // Shopwiki (http://www.shopwiki.com/)
        '/shopwiki/i',
        // Guruji (http://www.terrawiz.com/)
        '/terrawizbot/i',
        // Language Observatory Project (http://www.language-observatory.org/)
        '/ubicrawler/i',
        // MSIE offline bookmarks crawler
        '/msiecrawler/i',
        // Unidentified
        '/bot/i',
        '/crawler/i',
        '/spider/i',
        '/larbin/i', // also larbinSpider
        '/httrack/i',
        '/voyager/i',
        '/acadiauniversitywebcensusclient/i',
        '/feedchecker/i',
        '/knowitall\(knowitall@cs\.washington\.edu\)/i',
        '/mediapartners-google/i',
        '/psycheclone/i',
        '/topicblogs/i',
        '/nutch/i',
    ),
);
---|
| | |
---|
| | ########################################### |
---|
| |
---|
| | $logf = $log_dir . $archive_log; |
---|
| | $handle = fopen($logf, "r"); |
---|
| | while (!feof($handle)) { |
---|
| | $buffer = fgets($handle, 4096); |
---|
| | // NJS 2005-11-25 Added regexp for EPrints short URLs. |
---|
| | // NJS 2007-01-26 Added user-agent match to all regexps to enable bot detection. |
---|
| | // NJS 2007-01-29 Added missing regexp for EPrints short URLs with domain names rather than IP addresses. |
---|
| | if ((preg_match("/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*?(\"[^\"]+\")?$/i",$buffer,$matches)) || |
---|
| | (preg_match("/^(\S{1,}\.\S{1,}\.\S{1,}\.\S{1,}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*?(\"[^\"]+\")?$/i",$buffer,$matches)) || |
---|
| | (preg_match("/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/(\d{1,4}).*? HTTP\/1..\" 200 .*?(\"[^\"]+\")?$/i",$buffer,$matches)) || |
---|
| | (preg_match("/^(\S{1,}\.\S{1,}\.\S{1,}\.\S{1,}) - - \[(.*?)\] \"GET \/(\d{1,4}).*? HTTP\/1..\" 200 .*?(\"[^\"]+\")?$/i",$buffer,$matches))) |
---|
| | /* NJS 2007-01-26 |
---|
| | Added user-agent match to all regexps to enable bot detection. |
---|
| | |
---|
| | NJS 2007-01-31 |
---|
| | Refactored regexps from four down to one, after realising |
---|
| | that (a) long EPrints URLs are a superset of the short ones, |
---|
| | and (b) a regexp that matches domain names works just as well |
---|
| | for IP addresses (the GeoIP lookup doesn't care which it |
---|
| | gets). Also fixed the pattern so it can handle an arbitrary |
---|
| | number of subdomains. Note that the latter would be the main |
---|
| | argument for keeping a separate IP address pattern, as IP |
---|
| | addresses always comprise exactly four parts. However, it's |
---|
| | not really up to the script to verify IP addresses; Apache |
---|
| | should be recording them correctly in the first place! |
---|
| | |
---|
| | The typical kinds of strings we are matching look something |
---|
| | like this: |
---|
| | |
---|
| | fetch abstract (short, long): |
---|
| | 168.192.1.1 - - [31/Jan/2007:09:15:36 +1300] "GET /1/ HTTP/1.1" 200 12345 "referer" "user-agent" |
---|
| | 168.192.1.1 - - [31/Jan/2007:09:15:36 +1300] "GET /archive/00000001/ HTTP/1.1" 200 12345 "referer" "user-agent" |
---|
| | |
---|
| | download item (short, long): |
---|
| | 168.192.1.1 - - [31/Jan/2007:09:15:37 +1300] "GET /1/01/foo.pdf HTTP/1.1" 200 12345 "referer" "user-agent" |
---|
| | 168.192.1.1 - - [31/Jan/2007:09:15:37 +1300] "GET /archive/00000001/01/foo.pdf HTTP/1.1" 200 12345 "referer" "user-agent" |
---|
| | |
---|
| | Plus any of the above with a domain name substituted for the IP |
---|
| | address (e.g., foo.bar.com instead of 168.192.1.1). |
---|
| | */ |
---|
| | if (preg_match("/^(\S+(?:\.\S+)+) - - \[(.*?)\] \"GET \/(?:archive\/0{1,8})?(\d{1,4}).*? HTTP\/1..\" 200 .*?(\"[^\"]+\")?$/i",$buffer,$matches)) |
---|
| | { |
---|
| | $counter++; |
---|
| | $country_code = ''; |
---|
| | $country_name = ''; |
---|
| |
---|
| | foreach ($bot_patterns as $id => $patterns) |
---|
| | { |
---|
| | foreach ($patterns as $pat) |
---|
| | { |
---|
| | if (preg_match($pat, $user_agent, $matches2)) |
---|
| | if (preg_match($pat, $user_agent)) |
---|
| | { |
---|
| | $found_country = TRUE; |
---|
| | break; |
---|
| | } |
---|
| |
---|
| | $eprint_id = $matches[3]; |
---|
| | $uniquebits = $buffer; |
---|
| | |
---|
| | // NJS 2005-11-25 Added regexp for EPrints short URLs. |
---|
| | if(preg_match("/GET \/archive\/0{1,8}\d{1,4}\/\d\d\//i",$buffer) || preg_match("/GET \/\d{1,4}\/\d\d\//i",$buffer)) { |
---|
| | // NJS 2007-01-31 Refactored into one regexp for both styles. |
---|
| | if (preg_match("/GET \/(?:archive\/0{1,8})?\d{1,4}\/\d\d\//i",$buffer)) { |
---|
| | $view_type = "download"; |
---|
| | } else { |
---|
| | $view_type = "abstract"; |
---|
| | } |
---|
| |
---|
| | } |
---|
| | |
---|
/*
    Keep track of where we are. Should avoid duplication of results
    if the script is run more than once on the same log file.
*/
---|
| | |
---|
| | // NJS 2006-04-28 Switched value inserted to $start_time instead of $request_date. |
---|
| | $query = "INSERT into lastproc (lastproc) values('".$start_time."')"; |
---|
| |
---|
|