GitBucket
4.21.2
Toggle navigation
Snippets
Sign in
Files
Branches
1
Releases
Issues
Pull requests
Labels
Priorities
Milestones
Wiki
Forks
nigel.stanger
/
Digital_Repository
Browse code
- Switched to timestamp based comparison for counting eprints rather than
date based, in order to fix the "hyper-inflated stats" bug.
master
1 parent
b783392
commit
7f571df8b6f147d67ad52e2cc03b95de7a4ba3d5
nstanger
authored
on 28 Apr 2006
Patch
Showing
1 changed file
Repositories/statistics/scripts/eprints-usage_src.php
Ignore Space
Show notes
View
Repositories/statistics/scripts/eprints-usage_src.php
<?php

/* NJS 2006-04-28
   In earlier versions of this script, which eprints to count was
   determined by comparing the request date of the eprint against the
   "lastproc" date of this script (i.e., minimum time unit one day).
   This was fine if you only ran the script once per day, but if you ran
   it more than that, it counted multiple times requests whose
   $request_date == $lastproc. For example, if you ran this script five
   times per day, all the downloads that occurred during that day would
   be counted EVERY TIME this script ran, thus overinflating your stats
   by a factor of up to five :(

   The solution is to use the full time stamp for comparison rather than
   just the date. This timestamp MUST include time zone information so
   that things don't get screwed up by daylight saving time. As long as
   this is done consistently, there's no need to do things like convert
   to GMT, for example.

   The very first thing we need to do is grab the current time stamp
   with time zone, which will later be stored in the database as the
   "lastproc" time. This needs to happen first so that we don't "lose"
   any requests that occur while the script is running.

   NOTE(review): requests logged after $start_time but read by this run
   are counted now and still pass the ">= lastproc" test on the next
   run. Confirm whether an upper bound on the request time is wanted.
*/
$start_time = date('Y-m-d H:i:s O');

// NJS 2005-12-09 Switched to GeoIP from GeoIP:IPfree.
include("geoip.inc");

$gi = geoip_open("##GEOIP_DATABASE##",GEOIP_STANDARD);

/*
   Apache log for ePrints uses this format:
   LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"" combined

   If the log format differs the regular expression matching would need
   to be adjusted.

   Parse:
     ip
     date YYYY MM DD
     archive ID
*/

// Web server log files
$log_dir = '##APACHE_LOG_LOCATION##';
$log_file = array(
    'otago_eprints' => '##APACHE_LOG_NAME##',
);

// eprintstats db
$sqlserver = 'localhost';
$sqluser = 'eprintstatspriv';
$sqlpass = 'AuldGrizzel';
$sqldatabase = 'eprintstats';

// SQL details of your ePrints installation
$sqlserver2 = 'localhost';
$sqluser2 = 'otago_eprints';
$sqlpass2 = 'DrSyntaxRidesAgain';

/* NJS 2005-12-16
   IP address ranges for your local Intranet(s). You can have multiple
   ranges of IP addresses, each with a different "country name", so that
   they will appear as separate entries in the by country stats pages.
   You should use a different country code for each range (ISO 3166-1
   specifies the range XA through XZ as "user-assignable", so you can
   use codes from there as necessary), and create flag icons as
   appropriate.

   Each address range key is the name that will appear in the statistics
   database (the "country name"), followed by a comma, followed by the
   appropriate ISO 3166-1 country code as noted above. Each entry in the
   range is either a single IP address, or an array specifying a lower
   and upper bound for a contiguous IP address range (see example below).

   All IP addresses must be converted to long values using the ip2long()
   function before being stored.

   Note that address ranges may overlap. The script will use the first
   range that matches a given IP, so list the ranges in the correct
   order of precedence for your needs.

   Example:

   $local_IPs = array(
       'Repository Admin,XA' => array(
           ip2long('192.168.1.5'),
           ip2long('192.168.1.22'),
           array(
               lower => ip2long('192.168.1.30'),
               upper => ip2long('192.168.1.35'),
           ),
       ),
       'Our Intranet,XI' => array(
           array(
               lower => ip2long('192.168.1.0'),
               upper => ip2long('192.168.255.255'),
           ),
       ),
   );

   'Repository Admin' covers the IP addresses 192.168.1.5, 192.168.1.22
   and the range 192.168.1.30 to 192.168.1.35, inclusive.

   'Our Intranet' covers the range 192.168.1.0 to 192.168.255.255,
   inclusive. A machine will only match the 'Our Intranet' range if it
   first fails to match the 'Repository Admin' range.
*/
$local_IPs = array(
    'Repository Admin,XA' => array(
        ip2long('139.80.75.110'),  // Nigel @ Uni
        ip2long('60.234.209.74'),  // Nigel @ home
        ip2long('139.80.92.138'),  // Monica & Jeremy
        ip2long('139.80.92.151'),  //   @ Uni
        ip2long('203.89.162.155'), // Monica @ home
        ip2long('139.80.81.50'),   // eprints.otago.ac.nz
    ),
    'Otago Intranet,XI' => array(
        array(
            'lower' => ip2long('139.80.0.0'),
            'upper' => ip2long('139.80.127.255'),
        ),
    ),
);

###########################################
##
## No configuration required below here.
##
###########################################

$connect = mysql_pconnect ($sqlserver,$sqluser,$sqlpass);
$db = mysql_select_db($sqldatabase,$connect) or die("Could not connect");

// First get the date of last update
// NJS 2006-04-28 Changed this from order by timeinsert to order by id.
// The id is always guaranteed to increase temporally, but is otherwise
// time-independent and thus not affected by things like daylight savings.
$query = "select lastproc from lastproc order by id desc limit 1";
$result = mysql_query($query,$connect);
$num_rows = mysql_num_rows($result);
if ($num_rows > 0) {
    $row = mysql_fetch_assoc($result);
    $lastproc = $row["lastproc"];
    $datetestA = strtotime($lastproc);
} else {
    // No previous run recorded: process every request in the log.
    $datetestA = 0;
}

$connect2 = mysql_connect($sqlserver2,$sqluser2,$sqlpass2);

$counter = 0;

// Per-run cache of eprint id => title, so each title is looked up once.
$eprintname = array();

foreach ($log_file as $archivename => $archivelog) {
    $logf = $log_dir . $archivelog;
    $archive_name = $archivename;

    $handle = fopen($logf, "r");
    if ($handle === FALSE) {
        // feof() never reports end-of-file for a handle that failed to
        // open, so skip this log instead of looping forever.
        print "Could not open log file " . $logf . "\n";
        continue;
    }

    while (!feof($handle)) {
        $buffer = fgets($handle, 4096);
        if ($buffer === FALSE) {
            // End of file or read error: nothing more to parse here.
            break;
        }

        // NJS 2005-11-25 Added regexp for EPrints short URLs.
        $is_request =
            preg_match("/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*/i",$buffer,$matches)
            || preg_match("/^(\S{1,}\.\S{1,}\.\S{1,}\.\S{1,}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*/i",$buffer,$matches)
            || preg_match("/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/(\d{1,4}).*? HTTP\/1..\" 200 .*/i",$buffer,$matches);

        if (!$is_request) {
            // print "NO match" . "\n";
            continue;
        }

        $counter++;
        $country_code = '';
        $country_name = '';
        $eprint_name = '';
        $view_type = '';
        $uniquebits = '';

        $ip = $matches[1];

        /* NJS 2005-12-16
           Determine country code and name.
           Check whether the IP number falls into any of the local
           intranet ranges. If so, then use that.
        */
        $ip_long = ip2long($ip);
        $found_country = FALSE;
        foreach ($local_IPs as $id => $addresses) {
            foreach ($addresses as $ip_range) {
                if (is_array($ip_range)) {
                    // check against lower/upper bounds
                    $found_country = (($ip_long >= $ip_range['lower'])
                        && ($ip_long <= $ip_range['upper']));
                } else if (is_long($ip_range)) {
                    // data type sanity check passed: single address
                    $found_country = ($ip_long == $ip_range);
                } else {
                    // something is seriously broken, ignore this entry
                    print "Unsupported data type " . gettype($ip_range) .
                        " (value " . $ip_range .
                        ") in \$local_IPs (expected long).\n";
                    continue;
                }

                // Only stop scanning once an entry actually matches;
                // otherwise keep testing the remaining entries of this
                // range list.
                if ($found_country) {
                    break;
                }
            }

            if ($found_country) {
                list($country_name, $country_code) = explode(',', $id);
                break;
            }
        }

        // Otherwise, fall back to GeoIP.
        if (!$found_country) {
            $country_code = geoip_country_code_by_addr($gi, $ip);
            $country_name = geoip_country_name_by_addr($gi, $ip);
        }
        // end NJS 2005-12-16

        $date = $matches[2];
        $archive = $matches[3];
        $uniquebits = $buffer;

        /* NJS 2006-04-28
           Switched to timestamp rather than date-based comparison.
           First, clean up the Apache request date into something
           that strtotime understands. Note that the Apache log
           dates include time zone info by default.
        */
        $date = preg_replace("/:/"," ",$date,1); // Change first ":" to " ".
        $date = preg_replace("/\//", " ", $date); // Change all "/" to " ".
        $datetestB = strtotime($date);
        // Convert to properly formatted date string.
        $request_date = date('Y-m-d H:i:s O', $datetestB);

        // Already handled by an earlier run.
        if ($datetestB < $datetestA) {
            continue;
        }

        // NJS 2005-11-25 Added regexp for EPrints short URLs.
        if (preg_match("/GET \/archive\/0{1,8}\d{1,4}\/\d\d\//i",$buffer)
            || preg_match("/GET \/\d{1,4}\/\d\d\//i",$buffer)) {
            $view_type = "download";
        } else {
            $view_type = "abstract";
        }

        if (isset($eprintname[$archive])) {
            $eprint_name = $eprintname[$archive];
        } else {
            $eprint_name = getePrintName($archive_name,$archive);
            $eprintname[$archive] = $eprint_name;
        }

        // Requests for which no title could be determined are not recorded.
        if ($eprint_name != '') {
            $eprint_name = mysql_escape_string($eprint_name);
            /* NJS 2006-04-25
               Requests containing apostrophes (') are dumped by
               MySQL unless we escape them. Looking in the GeoIP
               files I also see country names with apostrophes, so
               escape that as well.

               The requester field can be an arbitrary host name
               (second regexp above), so it is escaped too, along with
               the remaining string values, for consistency.
            */
            $uniquebits = mysql_escape_string($uniquebits);
            $country_name = mysql_escape_string($country_name);
            $country_code = mysql_escape_string($country_code);
            $safe_ip = mysql_escape_string($ip);
            $safe_archive_name = mysql_escape_string($archive_name);
            // end NJS 2006-04-25

            // $archive is digits only (captured by \d{1,4}), so it is
            // inserted as an unquoted number.
            $query = " INSERT into view (uniquebits,archive_name,ip,request_date,archiveid,country_code,country_name,view_type,eprint_name) values('".$uniquebits."','".$safe_archive_name."','".$safe_ip."','".$request_date."',".$archive.",'".$country_code."','".$country_name."','".$view_type."','".$eprint_name."')";
            $result = mysql_query($query,$connect);
        }
    }

    fclose($handle);
}

/*
   Keep track of where we are. Should avoid duplication of results
   if the script is run more than once on the same log file
*/
// NJS 2006-04-28 Switched value inserted to $start_time instead of $request_date.
$query = "INSERT into lastproc (lastproc) values('".$start_time."')";
$result = mysql_query($query,$connect);

#print "Records counted: $counter\n";
#print "Last count: $request_date\n";

mysql_close($connect2);
mysql_close($connect);

// Look up the title corresponding to the specified eprint id.
/*
   Returns the title of eprint $eprintid from database $db, using the
   global $connect2 connection.

   The "archive" table is tried first. If it has no row for the id, the
   "deletion" table is tried and ' [deleted]' is appended to the title
   found there. If neither table yields a row (or a query fails), the
   placeholder "Unknown item [<id>]" is returned.

   The returned title has surrounding whitespace trimmed and internal
   runs of whitespace collapsed to a single space.
*/
function getePrintName($db,$eprintid) {
    global $connect2;

    mysql_select_db($db,$connect2);

    // The id is spliced into SQL text, so force it to an integer first.
    $id = (int) $eprintid;

    $suffix = '';

    $result3 = mysql_query("select title from archive where eprintid = $id",$connect2);
    $found = ($result3 !== FALSE) && (mysql_num_rows($result3) > 0);

    // NJS 2006-04-25 Added check for empty result, probably a deleted item.
    // Look in the deletion table for details.
    if (!$found) {
        $result3 = mysql_query("select title from deletion where eprintid = $id",$connect2);
        $found = ($result3 !== FALSE) && (mysql_num_rows($result3) > 0);

        // If it's not in deletion, then we have no clue what it is.
        if (!$found) {
            return "Unknown item [$eprintid]";
        }
        $suffix = ' [deleted]';
    }

    $row = mysql_fetch_assoc($result3);
    $title = preg_replace("/\s+/"," ",trim($row["title"]));

    return $title . $suffix;
}
?>
<?php

// NJS 2005-12-09 Switched to GeoIP from GeoIP:IPfree.
include("geoip.inc");

$gi = geoip_open("##GEOIP_DATABASE##",GEOIP_STANDARD);

/*
   Apache log for ePrints uses this format:
   LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"" combined

   If the log format differs the regular expression matching would need
   to be adjusted.

   Parse:
     ip
     date YYYY MM DD
     archive ID
*/

// Web server log files
$log_dir = '##APACHE_LOG_LOCATION##';
$log_file = array(
    'otago_eprints' => '##APACHE_LOG_NAME##',
);

// eprintstats db
$sqlserver = 'localhost';
$sqluser = 'eprintstatspriv';
$sqlpass = 'AuldGrizzel';
$sqldatabase = 'eprintstats';

// SQL details of your ePrints installation
$sqlserver2 = 'localhost';
$sqluser2 = 'otago_eprints';
$sqlpass2 = 'DrSyntaxRidesAgain';

/* NJS 2005-12-16
   IP address ranges for your local Intranet(s). You can have multiple
   ranges of IP addresses, each with a different "country name", so that
   they will appear as separate entries in the by country stats pages.
   You should use a different country code for each range (ISO 3166-1
   specifies the range XA through XZ as "user-assignable", so you can
   use codes from there as necessary), and create flag icons as
   appropriate.

   Each address range key is the name that will appear in the statistics
   database (the "country name"), followed by a comma, followed by the
   appropriate ISO 3166-1 country code as noted above. Each entry in the
   range is either a single IP address, or an array specifying a lower
   and upper bound for a contiguous IP address range (see example below).

   All IP addresses must be converted to long values using the ip2long()
   function before being stored.

   Note that address ranges may overlap. The script will use the first
   range that matches a given IP, so list the ranges in the correct
   order of precedence for your needs.

   Example:

   $local_IPs = array(
       'Repository Admin,XA' => array(
           ip2long('192.168.1.5'),
           ip2long('192.168.1.22'),
           array(
               lower => ip2long('192.168.1.30'),
               upper => ip2long('192.168.1.35'),
           ),
       ),
       'Our Intranet,XI' => array(
           array(
               lower => ip2long('192.168.1.0'),
               upper => ip2long('192.168.255.255'),
           ),
       ),
   );

   'Repository Admin' covers the IP addresses 192.168.1.5, 192.168.1.22
   and the range 192.168.1.30 to 192.168.1.35, inclusive.

   'Our Intranet' covers the range 192.168.1.0 to 192.168.255.255,
   inclusive. A machine will only match the 'Our Intranet' range if it
   first fails to match the 'Repository Admin' range.
*/
$local_IPs = array(
    'Repository Admin,XA' => array(
        ip2long('139.80.75.110'),  // Nigel @ Uni
        ip2long('60.234.209.74'),  // Nigel @ home
        ip2long('139.80.92.138'),  // Monica & Jeremy
        ip2long('139.80.92.151'),  //   @ Uni
        ip2long('203.89.162.155'), // Monica @ home
        ip2long('139.80.81.50'),   // eprints.otago.ac.nz
    ),
    'Otago Intranet,XI' => array(
        array(
            'lower' => ip2long('139.80.0.0'),
            'upper' => ip2long('139.80.127.255'),
        ),
    ),
);

###########################################
##
## No configuration required below here.
##
###########################################

$connect = mysql_pconnect ($sqlserver,$sqluser,$sqlpass);
$db = mysql_select_db($sqldatabase,$connect) or die("Could not connect");

// First get the date of last update
$query = "select lastproc from lastproc order by timeinsert desc limit 1";
$result = mysql_query($query,$connect);
$num_rows = mysql_num_rows($result);
if ($num_rows > 0) {
    $row = mysql_fetch_assoc($result);
    $lastproc = $row["lastproc"];
    $datetestA = strtotime($lastproc);
} else {
    // No previous run recorded: process every request in the log.
    $datetestA = 0;
}

$connect2 = mysql_connect($sqlserver2,$sqluser2,$sqlpass2);

// Number of matching log lines seen; starts at zero so it is an
// accurate count if the diagnostic print at the end is re-enabled.
$counter = 0;

// Per-run cache of eprint id => title, so each title is looked up once.
$eprintname = array();

// Date of the last matching request seen; stays empty when no log line
// matched, so that no bogus "lastproc" row gets written at the end.
$request_date = '';

foreach ($log_file as $archivename => $archivelog) {
    $logf = $log_dir . $archivelog;
    $archive_name = $archivename;

    $handle = fopen($logf, "r");
    if ($handle === FALSE) {
        // feof() never reports end-of-file for a handle that failed to
        // open, so skip this log instead of looping forever.
        print "Could not open log file " . $logf . "\n";
        continue;
    }

    while (!feof($handle)) {
        $buffer = fgets($handle, 4096);
        if ($buffer === FALSE) {
            // End of file or read error: nothing more to parse here.
            break;
        }

        // NJS 2005-11-25 Added regexp for EPrints short URLs.
        $is_request =
            preg_match("/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*/i",$buffer,$matches)
            || preg_match("/^(\S{1,}\.\S{1,}\.\S{1,}\.\S{1,}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*/i",$buffer,$matches)
            || preg_match("/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/(\d{1,4}).*? HTTP\/1..\" 200 .*/i",$buffer,$matches);

        if (!$is_request) {
            // print "NO match" . "\n";
            continue;
        }

        $counter++;
        $country_code = '';
        $country_name = '';
        $eprint_name = '';
        $view_type = '';
        $uniquebits = '';

        $ip = $matches[1];

        /* NJS 2005-12-16
           Determine country code and name.
           Check whether the IP number falls into any of the local
           intranet ranges. If so, then use that.
        */
        $ip_long = ip2long($ip);
        $found_country = FALSE;
        foreach ($local_IPs as $id => $addresses) {
            foreach ($addresses as $ip_range) {
                if (is_array($ip_range)) {
                    // check against lower/upper bounds
                    $found_country = (($ip_long >= $ip_range['lower'])
                        && ($ip_long <= $ip_range['upper']));
                } else if (is_long($ip_range)) {
                    // data type sanity check passed: single address
                    $found_country = ($ip_long == $ip_range);
                } else {
                    // something is seriously broken, ignore this entry
                    print "Unsupported data type " . gettype($ip_range) .
                        " (value " . $ip_range .
                        ") in \$local_IPs (expected long).\n";
                    continue;
                }

                // Only stop scanning once an entry actually matches;
                // otherwise keep testing the remaining entries of this
                // range list.
                if ($found_country) {
                    break;
                }
            }

            if ($found_country) {
                list($country_name, $country_code) = explode(',', $id);
                break;
            }
        }

        // Otherwise, fall back to GeoIP.
        if (!$found_country) {
            $country_code = geoip_country_code_by_addr($gi, $ip);
            $country_name = geoip_country_name_by_addr($gi, $ip);
        }
        // end NJS 2005-12-16

        $date = $matches[2];
        $archive = $matches[3];
        $uniquebits = $buffer;

        // Reduce the Apache request time to a plain Y-m-d date: drop the
        // time-of-day part, then turn "/" into " " so strtotime can
        // parse it.
        $date = preg_replace("/:.*/","",$date);
        $date = preg_replace("/\//", " ", $date);
        $when = getdate(strtotime($date));
        $request_date = $when["year"]."-".$when["mon"]."-".$when["mday"];
        $datetestB = strtotime($request_date);

        /* NJS 2006-04-25
           IMPORTANT: if you run this script more than once per day,
           it will count multiple times downloads whose
           $request_date == $lastproc. For example, if you ran this
           script five times per day, all the downloads that occurred
           during that day would be counted EVERY TIME this script
           ran, thus overinflating your stats by a factor of up to
           five :(

           This happens because $lastproc has one day as its base
           unit. If finer granularity for stats updates is desired,
           the solution would be to use the full timestamp rather
           than just the date.
        */
        if ($datetestB < $datetestA) {
            continue;
        }

        // NJS 2005-11-25 Added regexp for EPrints short URLs.
        if (preg_match("/GET \/archive\/0{1,8}\d{1,4}\/\d\d\//i",$buffer)
            || preg_match("/GET \/\d{1,4}\/\d\d\//i",$buffer)) {
            $view_type = "download";
        } else {
            $view_type = "abstract";
        }

        if (isset($eprintname[$archive])) {
            $eprint_name = $eprintname[$archive];
        } else {
            $eprint_name = getePrintName($archive_name,$archive);
            $eprintname[$archive] = $eprint_name;
        }

        // Requests for which no title could be determined are not recorded.
        if ($eprint_name != '') {
            $eprint_name = mysql_escape_string($eprint_name);
            /* NJS 2006-04-25
               Requests containing apostrophes (') are dumped by
               MySQL unless we escape them. Looking in the GeoIP
               files I also see country names with apostrophes, so
               escape that as well.

               The requester field can be an arbitrary host name
               (second regexp above), so it is escaped too, along with
               the remaining string values, for consistency.
            */
            $uniquebits = mysql_escape_string($uniquebits);
            $country_name = mysql_escape_string($country_name);
            $country_code = mysql_escape_string($country_code);
            $safe_ip = mysql_escape_string($ip);
            $safe_archive_name = mysql_escape_string($archive_name);
            // end NJS 2006-04-25

            // $archive is digits only (captured by \d{1,4}), so it is
            // inserted as an unquoted number.
            $query = " INSERT into view (uniquebits,archive_name,ip,request_date,archiveid,country_code,country_name,view_type,eprint_name) values('".$uniquebits."','".$safe_archive_name."','".$safe_ip."','".$request_date."',".$archive.",'".$country_code."','".$country_name."','".$view_type."','".$eprint_name."')";
            $result = mysql_query($query,$connect);
        }
    }

    fclose($handle);
}

/*
   Keep track of where we are. Should avoid duplication of results
   if the script is run more than once on the same log file.

   Only record a position when at least one request was parsed;
   otherwise an empty "lastproc" value would be stored and the next run
   would start again from the beginning of the log.
*/
if ($request_date != '') {
    $query = "INSERT into lastproc (lastproc) values('".$request_date."')";
    $result = mysql_query($query,$connect);
}

#print "Records counted: $counter\n";
#print "Last count: $request_date\n";

mysql_close($connect2);
mysql_close($connect);

/*
   Returns the title of eprint $eprintid from the "archive" table of
   database $db, using the global $connect2 connection. The title has
   surrounding whitespace trimmed and internal runs of whitespace
   collapsed to a single space. If no row is found (or the query
   fails), the placeholder "Unknown item (<id>)" is returned.
*/
function getePrintName($db,$eprintid) {
    global $connect2;

    mysql_select_db($db,$connect2);

    // The id is spliced into SQL text, so force it to an integer first.
    $id = (int) $eprintid;
    $result3 = mysql_query("select title from archive where eprintid = $id",$connect2);

    // NJS 2006-04-25 Added check for empty result, probably a deleted item.
    if ($result3 === FALSE || mysql_num_rows($result3) == 0) {
        return "Unknown item ($eprintid)";
    }

    $row = mysql_fetch_assoc($result3);
    return preg_replace("/\s+/"," ",trim($row["title"]));
}
?>
Show line notes below