- Added initial support for separating out search engines.
1 parent 76e0cbf commit 5f877c6c6bd806119b03cea18ba528a9bf2894c2
nstanger authored on 26 Jan 2007
Showing 1 changed file
View
292
Repositories/statistics/scripts/eprints-usage_src.php
),
),
);
 
/* NJS 2007-01-26
Patterns to match various search engine bots. Ideally, we'd use a similar
mechanism to the $local_IPs variable above, but this isn't feasible because
we'd need to know the IP ranges for the likes of Google, for example. This
clearly isn't possible in practice.
 
Fortunately, most search bots insert a readily identifiable string into
the user-agent header of the HTTP request, which gets recorded in the Apache
log file. We can look for these and re-code log entries as appropriate.
 
The format of this list is similar to that of the $local_IPs variable. The
key is the "country name" (in this case the name of the search engine) plus
an "X" ISO 3166-1 country code, separated by a comma. Each key value has an
associated list of corresponding regular expressions that can occur in the
user-agent part of the Apache log entry. If any one of these REs matches
the user-agent part of the log entry, then we should re-code the country
appropriately.
 
Note that this means that several of the "X" country codes are now reserved
and can no longer be used in $local_IPs.
*/
// Map of search-engine identifiers to the PCRE patterns that recognise them.
//
// Each key is "<display name>,<pseudo country code>"; the code that consumes
// this table splits the key on the comma. Each value is a list of complete
// PCRE patterns (delimiters included); a log entry is attributed to the
// engine as soon as any one of its patterns matches.
//
// All patterns are literal substrings unless noted, so regex
// meta-characters (".", "(", ")", "/") must be backslash-escaped.
$bot_patterns = array(
    // Google (http://www.google.com/)
    'Google,XG' => array(
        '/Googlebot/',
        '/http:\/\/www\.google\.com\/bot\.html/',
    ),
    // Windows Live Search (http://search.msn.com/)
    'Windows Live Search,XM' => array(
        '/msnbot/',
        '/http:\/\/search\.msn\.com\/msnbot\.htm/',
    ),
    // Yahoo! (http://www.yahoo.com/)
    'Yahoo!,XY' => array(
        '/Yahoo! Slurp/',
        '/YahooSeeker/',
        '/http:\/\/help\.yahoo\.com\/help\/us\/ysearch\/slurp/',
        '/yahooseeker-jp-mobile AT Yahoo!JAPAN/',
    ),
    // Ask.com (http://www.ask.com/)
    'Ask.com,XJ' => array(
        '/Ask Jeeves\/Teoma/',
        '/http:\/\/about\.ask\.com\/en\/docs\/about\/webmasters\.shtml/',
    ),
    // Everything else I could find in our log files :)
    'Other search engine,XZ' => array(
        // TAMU Internet Research Lab (http://irl.cs.tamu.edu/)
        '/http:\/\/irl\.cs\.tamu\.edu\/crawler/',
        // Alexa web search (http://www.alexa.com/)
        '/ia_archiver/',
        // TrueKnowledge for Web (http://www.authoritativeweb.com/)
        '/ConveraCrawler/',
        '/http:\/\/www\.authoritativeweb\.com\/crawl/',
        // Majestic 12 distributed search engine (http://www.majestic12.co.uk/)
        '/MJ12bot/',
        '/http:\/\/majestic12\.co\.uk\/bot\.php/',
        // Picsearch (http://www.picsearch.com/)
        '/psbot/',
        '/http:\/\/www\.picsearch\.com\/bot\.html/',
        // Exalead (http://www.exalead.com/search)
        '/Exabot/',
        // Cazoodle (note cazoodle.com doesn't exist)
        '/CazoodleBot Crawler/',
        '/http:\/\/www\.cazoodle\.com/',
        '/mqbot@cazoodle\.com/',
        // Gigablast (http://www.gigablast.com/)
        '/Gigabot/',
        '/http:\/\/www\.gigablast\.com\/spider\.html/',
        // Houxou (http://www.houxou.com/)
        '/HouxouCrawler/',
        '/http:\/\/www\.houxou\.com\/crawler/',
        '/crawler at houxou dot com/',
        // IBM Almaden Research Center Computer Science group (http://www.almaden.ibm.com/cs/)
        '/http:\/\/www\.almaden\.ibm\.com\/cs\/crawler/',
        // Goo? (http://help.goo.ne.jp/)
        '/ichiro/',
        '/http:\/\/help\.goo\.ne\.jp\/door\/crawler\.html/',
        // Daum Communications Corp (Korea)
        '/Edacious & Intelligent Web Robot/',
        '/Daum Communications Corp/',
        '/DAUM Web Robot/',
        '/MSIE is not me/',
        '/DAUMOA/',
        // Girafa (http://www.girafa.com/)
        '/[Gg]irafabot/',
        '/girafabot at girafa dot com/',
        '/http:\/\/www\.girafa\.com/',
        // The Generations Network (http://www.myfamilyinc.com/)
        '/MyFamilyBot/',
        '/http:\/\/www\.ancestry\.com\/learn\/bot\.aspx/',
        '/http:\/\/www\.myfamilyinc\.com/',
        // Naver? (http://www.naver.com/)
        '/NaverBot/',
        '/http:\/\/help\.naver\.com\/delete_main\.asp/',
        // WiseNut (http://www.wisenutbot.com/)
        '/ZyBorg/',
        '/wn-[0-9]+\.zyborg@looksmart\.net/',
        '/http:\/\/www\.WISEnutbot\.com/',
        // Accelobot (http://www.accelobot.com/)
        // This one seems particularly busy!
        '/heritrix/',
        '/http:\/\/www\.accelobot\.com/',
        // Seeqpod (http://www.seeqpod.com/)
        '/seeqpod-vertical-crawler/',
        '/http:\/\/www\.seeqpod\.com/',
        // University of Illinois at Urbana-Champaign, Computer Science (http://www.cs.uiuc.edu/)
        '/MQBOT Crawler/',
        '/http:\/\/falcon\.cs\.uiuc\.edu/',
        '/mqbot@cs\.uiuc\.edu/',
        // Microsoft Research (http://research.microsoft.com/)
        '/MSRBOT/',
        '/http:\/\/research\.microsoft\.com\/research\/sv\/msrbot\//',
        // Nusearch
        '/Nusearch Spider/',
        '/www\.nusearch\.com/',
        // SourceForge (http://www.sf.net/)
        '/nutch-agent@lists\.sourceforge\.net/',
        // Lucene (http://lucene.apache.org/)
        '/nutch-agent@lucene\.apache\.org/',
        // Dot escaped so it only matches the literal address.
        '/raphael@unterreuth\.de/',
        // Computer Science, University of Washington (http://cs.washington.edu/)
        '/Nutch running at UW/',
        '/http:\/\/crawlers\.cs\.washington\.edu\//',
        '/sycrawl@cs\.washington\.edu/',
        // Chikayama & Taura Laboratory, University of Tokyo (http://www.logos.ic.i.u-tokyo.ac.jp/)
        '/Shim-Crawler/',
        '/http:\/\/www\.logos\.ic\.i\.u-tokyo\.ac\.jp\/crawler\//',
        '/crawl@logos\.ic\.i\.u-tokyo\.ac\.jp/',
        // Sproose (http://www.sproose.com/)
        '/sproose bot/',
        '/http:\/\/www\.sproose\.com\/bot\.html/',
        '/crawler@sproose\.com/',
        // Turnitin (http://www.turnitin.com/)
        '/TurnitinBot/',
        '/http:\/\/www\.turnitin\.com\/robot\/crawlerinfo\.html/',
        // WISH Project (http://wish.slis.tsukuba.ac.jp/)
        '/wish-project/',
        '/http:\/\/wish\.slis\.tsukuba\.ac\.jp\//',
        // WWWster
        '/wwwster/',
        '/gue@cis\.uni-muenchen\.de/',
        // Forex Trading Network Organization (http://www.netforex.org/)
        '/Forex Trading Network Organization/',
        '/http:\/\/www\.netforex\.org/',
        '/info@netforex\.org/',
        // FunnelBack (http://www.funnelback.com/)
        '/FunnelBack/',
        '/http:\/\/www\.funnelback\.com\/robot\.html/',
        // Baidu (http://www.baidu.com/)
        '/Baiduspider/',
        '/http:\/\/www\.baidu\.com\/search\/spider\.htm/',
        // Brandimensions (http://www.brandimensions.com/)
        '/BDFetch/',
        // Blaiz Enterprises (http://www.blaiz.net/)
        '/Blaiz-Bee/',
        '/http:\/\/www\.blaiz\.net/',
        // Boitho/SearchDaimon (http://www.boitho.com/ or http://www.searchdaimon.com/)
        '/boitho\.com-dc/',
        '/http:\/\/www\.boitho\.com\/dcbot\.html/',
        // Celestial (OAI aggregator, see http://oai-perl.sourceforge.net/ for a little info)
        '/Celestial/',
        // Cipinet (http://www.cipinet.com/)
        '/CipinetBot/',
        '/http:\/\/www\.cipinet\.com\/bot\.html/',
        // iVia (http://ivia.ucr.edu/)
        '/CrawlerTest CrawlerTest/',
        '/http:\/\/ivia\.ucr\.edu\/useragents\.shtml/',
        // Encyclopedia of Keywords (http://keywen.com/)
        '/EasyDL/',
        '/http:\/\/keywen\.com\/Encyclopedia\/Bot/',
        // Everest-Vulcan Inc. (http://everest.vulcan.com/)
        '/Everest-Vulcan Inc/',
        '/http:\/\/everest\.vulcan\.com\/crawlerhelp/',
        // FactBites (http://www.factbites.com/)
        '/Factbot/',
        '/http:\/\/www\.factbites\.com\/webmasters\.php/',
        // Scirus (http://www.scirus.com/)
        '/Scirus scirus-crawler@fast\.no/',
        '/http:\/\/www\.scirus\.com\/srsapp\/contactus\//',
        // UOL (http://www.uol.com.br/)
        '/UOLCrawler/',
        '/soscrawler@uol\.com\.br/',
        // Always Updated (http://www.updated.com/)
        '/updated crawler/',
        '/crawler@updated\.com/',
        '/http:\/\/www\.updated\.com/',
        // FAST Enterprise Search (http://www.fast.no/)
        '/crawler@fast\.no/',
        '/FAST MetaWeb Crawler/',
        '/helpdesk at fastsearch dot com/',
        // Deutsche Wortschatz Portal (http://wortschatz.uni-leipzig.de/)
        '/findlinks/',
        '/http:\/\/wortschatz\.uni-leipzig\.de\/findlinks\//',
        // Gais (http://gais.cs.ccu.edu.tw/)
        '/Gaisbot/',
        // Dots escaped so only the literal host name matches.
        '/robot[0-9]{2}@gais\.cs\.ccu\.edu\.tw/',
        '/http:\/\/gais\.cs\.ccu\.edu\.tw\/robot\.php/',
        // http://ilse.net/
        '/INGRID/',
        '/http:\/\/webmaster\.ilse\.nl\/jsp\/webmaster\.jsp/',
        // Krugle (http://corp.krugle.com/)
        '/Krugle\/Krugle/',
        '/Krugle web crawler/',
        '/http:\/\/corp\.krugle\.com\/crawler\/info\.html/',
        '/webcrawler@krugle\.com/',
        // WebWobot (http://www.webwobot.com/)
        '/ScollSpider/',
        '/http:\/\/www\.webwobot\.com/',
        // Omni-Explorer (http://www.omni-explorer.com/)
        '/OmniExplorer_Bot/',
        '/http:\/\/www\.omni-explorer\.com/',
        '/WorldIndexer/',
        // PageBull (http://www.pagebull.com/)
        '/Pagebull http:\/\/www\.pagebull\.com\//',
        // dir.com (http://dir.com/)
        '/Pompos/',
        '/http:\/\/dir\.com\/pompos\.html/',
        // Sensis (http://sensis.com.au/)
        '/Sensis Web Crawler/',
        // Each "\\\\" below is one literal backslash in the matched text.
        '/search_comments\\\\at\\\\sensis\\\\dot\\\\com\\\\dot\\\\au/',
        // Shopwiki (http://www.shopwiki.com/)
        '/ShopWiki/',
        '/http:\/\/www\.shopwiki\.com\//',
        // Guruji (http://www.terrawiz.com/)
        '/TerrawizBot/',
        '/http:\/\/www\.terrawiz\.com\/bot\.html/',
        // Language Observatory Project (http://www.language-observatory.org/)
        '/UbiCrawler/',
        '/http:\/\/gii\.nagaokaut\.ac\.jp\/~ubi\//',
        // Unidentified
        '/bot@bot\.bot/', // see (http://www.webmasterworld.com/search_engine_spiders/3186855.htm)
        '/bot\/1\.0/', // dot escaped: match version "1.0" only
        '/larbin/', // also larbinSpider
        '/HTTrack/',
        '/nicebot/',
        '/Snapbot/',
        '/sogou spider/',
        '/TMCrawler/',
        '/voyager/',
        '/AcadiaUniversityWebCensusClient/',
        '/BeijingCrawler/',
        '/FeedChecker/',
        '/g2Crawler \(?nobody@airmail\.net\)?/',
        '/gsa-crawler/',
        '/JobSpider_BA/',
        '/KnowItAll\(knowitall@cs\.washington\.edu\)/',
        '/Mediapartners-Google/',
        '/obeys UserAgent NimbleCrawler For problems contact: crawler@healthline\.com/',
        '/psycheclone/',
        '/RAMPyBot - www\.giveRAMP\.com/', // dots escaped: literal host name
        '/Robo Crawler/',
        '/ScSpider/',
        '/snap\.com beta crawler v0/',
        // Parentheses are escaped (and optional) so the pattern matches the
        // literal "spider (tspyyp@tom.com)"; unescaped they would form a
        // capture group and the parenthesised form would never match.
        '/spider \(?tspyyp@tom\.com\)?/',
        '/topicblogs/',
        '/Twiceler www\.cuill\.com\/robots\.html/',
        '/WebFilter Robot/',
        '/WILF \(cybermetrics\.wlv\.ac\.uk\/robots\.htm\)/',
        '/Bot,Robot,Spider,Crawler,aromano@cli\.di\.unipi\.it/',
    ),
);
 
###########################################
##
## No configuration required below here.
##
$handle = fopen($logf, "r");
while (!feof($handle)) {
$buffer = fgets($handle, 4096);
// NJS 2005-11-25 Added regexp for EPrints short URLs.
if ((preg_match("/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*/i",$buffer,$matches)) ||
(preg_match("/^(\S{1,}\.\S{1,}\.\S{1,}\.\S{1,}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*/i",$buffer,$matches)) ||
(preg_match("/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/(\d{1,4}).*? HTTP\/1..\" 200 .*/i",$buffer,$matches)))
if ((preg_match("/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*(\"[^\"]+\")$/i",$buffer,$matches)) ||
(preg_match("/^(\S{1,}\.\S{1,}\.\S{1,}\.\S{1,}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*(\"[^\"]+\")$/i",$buffer,$matches)) ||
(preg_match("/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/(\d{1,4}).*? HTTP\/1..\" 200 .*(\"[^\"]+\")$/i",$buffer,$matches)))
{
$counter++;
$country_code = '';
$country_name = '';
$country_name = geoip_country_name_by_addr($gi, $ip);
}
// end NJS 2005-12-16
// NJS 2007-01-26
// Check whether this is a bot reference.
$referer = $matches[4];
$found_country = FALSE;
foreach ($bot_patterns as $id => $patterns)
{
foreach ($patterns as $pat)
{
if (preg_match($pat, $referer, $matches2))
{
$found_country = TRUE;
break;
}
}
if ($found_country)
{
list($country_name, $country_code) = explode(',', $id);
break;
}
}
// end NJS 2007-01-26
$date = $matches[2];
$eprint_id = $matches[3];
$uniquebits = $buffer;
/* NJS 2006-04-28