GitBucket
4.21.2
Toggle navigation
Snippets
Sign in
Files
Branches
1
Releases
Issues
Pull requests
Labels
Priorities
Milestones
Wiki
Forks
nigel.stanger
/
Digital_Repository
Browse code
- Changed <= for date test back to <.
master
1 parent
e460fe0
commit
b78339219f8047e0be5bf1e95af8489d123365d3
nstanger
authored
on 27 Apr 2006
Patch
Showing
1 changed file
Repositories/statistics/scripts/eprints-usage_src.php
Ignore Space
Show notes
View
Repositories/statistics/scripts/eprints-usage_src.php
<?php

/*
  ePrints usage statistics collector.

  Reads each configured Apache access log, picks out successful (HTTP 200)
  GET requests for eprint abstracts and documents, works out the country of
  the requesting address (local intranet ranges first, then GeoIP), and
  stores one row per request in the `view` table of the statistics
  database. The date of the last request seen is then recorded in the
  `lastproc` table so that later runs can skip older log entries.
*/

// NJS 2005-12-09 Switched to GeoIP from GeoIP:IPfree.
include("geoip.inc");

$gi = geoip_open("##GEOIP_DATABASE##",GEOIP_STANDARD);

/*
  Apache log for ePrints uses this format:
  LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"" combined

  If the log format differs the regular expression matching would need
  to be adjusted.

  Parse:
    ip
    date YYYY MM DD
    archive ID
*/

// Web server log files
$log_dir = '##APACHE_LOG_LOCATION##';
$log_file = array(
	'otago_eprints' => '##APACHE_LOG_NAME##',
);

// NOTE(review): database credentials are stored in plain text below;
// consider loading them from a protected configuration file instead.

// eprintstats db
$sqlserver = 'localhost';
$sqluser = 'eprintstatspriv';
$sqlpass = 'AuldGrizzel';
$sqldatabase = 'eprintstats';

// SQL details of your ePrints installation
$sqlserver2 = 'localhost';
$sqluser2 = 'otago_eprints';
$sqlpass2 = 'DrSyntaxRidesAgain';

/* NJS 2005-12-16
   IP address ranges for your local Intranet(s). You can have multiple
   ranges of IP addresses, each with a different "country name", so that
   they will appear as separate entries in the by country stats pages.
   You should use a different country code for each range (ISO 3166-1
   specifies the range XA through XZ as "user-assignable", so you can use
   codes from there as necessary), and create flag icons as appropriate.

   Each address range key is the name that will appear in the statistics
   database (the "country name"), followed by a comma, followed by the
   appropriate ISO 3166-1 country code as noted above. Each entry in the
   range is either a single IP address, or an array specifying a lower and
   upper bound for a contiguous IP address range (see example below).

   All IP addresses must be converted to long values using the ip2long()
   function before being stored.

   Note that address ranges may overlap. The script will use the first
   range that matches a given IP, so list the ranges in the correct order
   of precedence for your needs.

   Example:

	$local_IPs = array(
		'Repository Admin,XA' => array(
			ip2long('192.168.1.5'),
			ip2long('192.168.1.22'),
			array(
				'lower' => ip2long('192.168.1.30'),
				'upper' => ip2long('192.168.1.35'),
			),
		),
		'Our Intranet,XI' => array(
			array(
				'lower' => ip2long('192.168.1.0'),
				'upper' => ip2long('192.168.255.255'),
			),
		),
	);

   'Repository Admin' covers the IP addresses 192.168.1.5, 192.168.1.22
   and the range 192.168.1.30 to 192.168.1.35, inclusive.

   'Our Intranet' covers the range 192.168.1.0 to 192.168.255.255,
   inclusive. A machine will only match the 'Our Intranet' range if it
   first fails to match the 'Repository Admin' range.
*/
$local_IPs = array(
	'Repository Admin,XA' => array(
		ip2long('139.80.75.110'),  // Nigel @ Uni
		ip2long('60.234.209.74'),  // Nigel @ home
		ip2long('139.80.92.138'),  // Monica & Jeremy
		ip2long('139.80.92.151'),  //   @ Uni
		ip2long('203.89.162.155'), // Monica @ home
		ip2long('139.80.81.50'),   // eprints.otago.ac.nz
	),
	'Otago Intranet,XI' => array(
		array(
			'lower' => ip2long('139.80.0.0'),
			'upper' => ip2long('139.80.127.255'),
		),
	),
);

###########################################
##
## No configuration required below here.
##
###########################################

$connect = mysql_pconnect ($sqlserver,$sqluser,$sqlpass);
$db = mysql_select_db($sqldatabase,$connect) or die("Could not connect");

// First get the date of last update
$query = "select lastproc from lastproc order by timeinsert desc limit 1";
$result = mysql_query($query,$connect);
$num_rows = mysql_num_rows($result);
if ($num_rows > 0) {
	$row = mysql_fetch_assoc($result);
	$lastproc = $row["lastproc"];
	$datetestA = strtotime($lastproc);
}
else {
	$datetestA = 0;
}

$connect2 = mysql_connect($sqlserver2,$sqluser2,$sqlpass2);

$counter = 1;

// Date of the most recent matching request; stays null when no log line
// matched, in which case no lastproc row is written at the end.
$request_date = null;

foreach($log_file as $archivename=>$archivelog) {
	$logf = $log_dir . $archivelog;
	$archive_name = $archivename;

	// Titles are cached per archive: eprint IDs are only unique within
	// one archive, so the cache must not be shared between log files.
	$eprintname = array();

	$handle = fopen($logf, "r");
	if ($handle === FALSE) {
		// Without this check feof() on an invalid handle never becomes
		// true and the read loop would spin forever.
		print "Unable to open log file " . $logf . ", skipping.\n";
		continue;
	}

	while (($buffer = fgets($handle, 4096)) !== FALSE) {
		// NJS 2005-11-25 Added regexp for EPrints short URLs.
		if ((preg_match("/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*/i",$buffer,$matches)) ||
		    (preg_match("/^(\S{1,}\.\S{1,}\.\S{1,}\.\S{1,}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*/i",$buffer,$matches)) ||
		    (preg_match("/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/(\d{1,4}).*? HTTP\/1..\" 200 .*/i",$buffer,$matches)))
		{
			$counter++;
			$country_code = '';
			$country_name = '';
			$insertid = '';
			$eprint_name = '';
			$view_type = '';
			$uniquebits = '';

			$ip = $matches[1];

			/* NJS 2005-12-16
			   Determine country code and name.
			   Check whether the IP number falls into any of the local
			   intranet ranges. If so, then use that.
			*/
			$ip_long = ip2long($ip);
			$found_country = FALSE;
			foreach ($local_IPs as $id => $addresses)
			{
				foreach ($addresses as $ip_range)
				{
					if (is_array($ip_range)) // check against lower/upper bounds
					{
						$found_country = (($ip_long >= $ip_range['lower'])
							&& ($ip_long <= $ip_range['upper']));
					}
					else if (is_long($ip_range)) // data type sanity check
					{
						$found_country = ($ip_long == $ip_range);
					}
					else // something is seriously broken, ignore this entry
					{
						print "Unsupported data type " . gettype($ip_range) .
							" (value " . $ip_range .
							") in \$local_IPs (expected long).\n";
						continue;
					}

					// Stop only once an entry has matched; breaking
					// unconditionally would test just the first entry
					// of each range and ignore the rest.
					if ($found_country) break;
				}

				if ($found_country)
				{
					list($country_name, $country_code) = explode(',', $id);
					break;
				}
			}

			// Otherwise, fall back to GeoIP.
			if (!$found_country)
			{
				$country_code = geoip_country_code_by_addr($gi, $ip);
				$country_name = geoip_country_name_by_addr($gi, $ip);
			}
			// end NJS 2005-12-16

			$date = $matches[2];
			$archive = $matches[3];
			$uniquebits = $buffer;

			// Reduce "DD/Mon/YYYY:HH:MM:SS +ZZZZ" to "DD Mon YYYY".
			$date = preg_replace("/:.*/","",$date);
			$date = preg_replace("/\//", " ", $date);
			$when = getdate(strtotime($date));
			$request_date = $when["year"]."-".$when["mon"]."-".$when["mday"];
			$datetestB = strtotime($request_date);

			/* NJS 2006-04-25
			   IMPORTANT: if you run this script more than once per day,
			   it will count multiple times downloads whose
			   $request_date == $lastproc. For example, if you ran this
			   script five times per day, all the downloads that occurred
			   during that day would be counted EVERY TIME this script
			   ran, thus overinflating your stats by a factor of up to
			   five :(

			   This happens because $lastproc has one day as its base
			   unit. If finer granularity for stats updates is desired,
			   the solution would be to use the full timestamp rather
			   than just the date.
			*/
			if ($datetestB < $datetestA)
				continue;

			// NJS 2005-11-25 Added regexp for EPrints short URLs.
			if(preg_match("/GET \/archive\/0{1,8}\d{1,4}\/\d\d\//i",$buffer) || preg_match("/GET \/\d{1,4}\/\d\d\//i",$buffer)) {
				$view_type = "download";
			} else {
				$view_type = "abstract";
			}

			if(isset($eprintname[$archive])) {
				$eprint_name = $eprintname[$archive];
			} else {
				$eprint_name = getePrintName($archive_name,$archive);
				$eprintname[$archive] = $eprint_name;
			}

			if($eprint_name=='') {
				// Do nothing.
			} else {
				$eprint_name = mysql_escape_string($eprint_name);
				/* NJS 2006-04-25
				   Requests containing apostrophes (') are dumped by
				   MySQL unless we escape them. Looking in the GeoIP
				   files I also see country names with apostrophes, so
				   escape that as well. Everything else should be fine.
				*/
				$uniquebits = mysql_escape_string($uniquebits);
				$country_name = mysql_escape_string($country_name);
				// end NJS 2006-04-25

				// The host field may be any non-blank token taken from
				// the log (second regexp above), so escape it too, along
				// with the configured archive name. $archive is digits
				// only by construction of the regexps.
				$ip_sql = mysql_escape_string($ip);
				$archive_name_sql = mysql_escape_string($archive_name);

				$query = "
				INSERT into view (uniquebits,archive_name,ip,request_date,archiveid,country_code,country_name,view_type,eprint_name)
				values('".$uniquebits."','".$archive_name_sql."','".$ip_sql."','".$request_date."',".$archive.",'".$country_code."','".$country_name."','".$view_type."','".$eprint_name."')";
				$result = mysql_query($query,$connect);
				$insertid = mysql_insert_id($connect);
			}

		} else {
			// print "NO match" . "\n";
		}
	}
	fclose($handle);
}

/*
	Keep track of where we are. Should avoid duplication of results
	if the script is run more than once on the same log file.

	Only record a date when at least one request was parsed; writing an
	empty value would make the next run re-count the whole log.
*/
if ($request_date !== null) {
	$query = "INSERT into lastproc (lastproc) values('".$request_date."')";
	$result = mysql_query($query,$connect);
}

#print "Records counted: $counter\n";
#print "Last count: $request_date\n";
mysql_close($connect2);
mysql_close($connect);

/*
	Look up the title of an eprint.

	$db        name of the ePrints database to select on $connect2
	$eprintid  numeric ID of the eprint

	Returns the title with surrounding whitespace trimmed and internal
	runs of whitespace collapsed to single spaces, or the string
	"Unknown item (<id>)" when no matching row can be read.
*/
function getePrintName($db,$eprintid) {
	global $connect2;

	mysql_select_db($db,$connect2);

	// Cast so that only a number can ever reach the SQL text.
	$query3 = "select title from archive where eprintid = " . (int) $eprintid;
	$result3 = mysql_query($query3,$connect2);

	// NJS 2006-04-25 Added check for empty result, probably a deleted item.
	// A failed query is treated the same way rather than passing FALSE
	// on to mysql_num_rows().
	if ($result3 === FALSE || mysql_num_rows($result3) == 0) {
		return "Unknown item ($eprintid)";
	}

	$row = mysql_fetch_assoc($result3);
	$title = trim($row["title"]);
	return preg_replace("/\s+/"," ",$title);
}

?>
<?php

/*
  ePrints usage statistics collector.

  Reads each configured Apache access log, picks out successful (HTTP 200)
  GET requests for eprint abstracts and documents, works out the country of
  the requesting address (local intranet ranges first, then GeoIP), and
  stores one row per request in the `view` table of the statistics
  database. The date of the last request seen is then recorded in the
  `lastproc` table so that later runs can skip already-processed days.
*/

// NJS 2005-12-09 Switched to GeoIP from GeoIP:IPfree.
include("geoip.inc");

$gi = geoip_open("##GEOIP_DATABASE##",GEOIP_STANDARD);

/*
  Apache log for ePrints uses this format:
  LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"" combined

  If the log format differs the regular expression matching would need
  to be adjusted.

  Parse:
    ip
    date YYYY MM DD
    archive ID
*/

// Web server log files
$log_dir = '##APACHE_LOG_LOCATION##';
$log_file = array(
	'otago_eprints' => '##APACHE_LOG_NAME##',
);

// NOTE(review): database credentials are stored in plain text below;
// consider loading them from a protected configuration file instead.

// eprintstats db
$sqlserver = 'localhost';
$sqluser = 'eprintstatspriv';
$sqlpass = 'AuldGrizzel';
$sqldatabase = 'eprintstats';

// SQL details of your ePrints installation
$sqlserver2 = 'localhost';
$sqluser2 = 'otago_eprints';
$sqlpass2 = 'DrSyntaxRidesAgain';

/* NJS 2005-12-16
   IP address ranges for your local Intranet(s). You can have multiple
   ranges of IP addresses, each with a different "country name", so that
   they will appear as separate entries in the by country stats pages.
   You should use a different country code for each range (ISO 3166-1
   specifies the range XA through XZ as "user-assignable", so you can use
   codes from there as necessary), and create flag icons as appropriate.

   Each address range key is the name that will appear in the statistics
   database (the "country name"), followed by a comma, followed by the
   appropriate ISO 3166-1 country code as noted above. Each entry in the
   range is either a single IP address, or an array specifying a lower and
   upper bound for a contiguous IP address range (see example below).

   All IP addresses must be converted to long values using the ip2long()
   function before being stored.

   Note that address ranges may overlap. The script will use the first
   range that matches a given IP, so list the ranges in the correct order
   of precedence for your needs.

   Example:

	$local_IPs = array(
		'Repository Admin,XA' => array(
			ip2long('192.168.1.5'),
			ip2long('192.168.1.22'),
			array(
				'lower' => ip2long('192.168.1.30'),
				'upper' => ip2long('192.168.1.35'),
			),
		),
		'Our Intranet,XI' => array(
			array(
				'lower' => ip2long('192.168.1.0'),
				'upper' => ip2long('192.168.255.255'),
			),
		),
	);

   'Repository Admin' covers the IP addresses 192.168.1.5, 192.168.1.22
   and the range 192.168.1.30 to 192.168.1.35, inclusive.

   'Our Intranet' covers the range 192.168.1.0 to 192.168.255.255,
   inclusive. A machine will only match the 'Our Intranet' range if it
   first fails to match the 'Repository Admin' range.
*/
$local_IPs = array(
	'Repository Admin,XA' => array(
		ip2long('139.80.75.110'),  // Nigel @ Uni
		ip2long('60.234.209.74'),  // Nigel @ home
		ip2long('139.80.92.138'),  // Monica & Jeremy
		ip2long('139.80.92.151'),  //   @ Uni
		ip2long('203.89.162.155'), // Monica @ home
		ip2long('139.80.81.50'),   // eprints.otago.ac.nz
	),
	'Otago Intranet,XI' => array(
		array(
			'lower' => ip2long('139.80.0.0'),
			'upper' => ip2long('139.80.127.255'),
		),
	),
);

###########################################
##
## No configuration required below here.
##
###########################################

$connect = mysql_pconnect ($sqlserver,$sqluser,$sqlpass);
$db = mysql_select_db($sqldatabase,$connect) or die("Could not connect");

// First get the date of last update
$query = "select lastproc from lastproc order by timeinsert desc limit 1";
$result = mysql_query($query,$connect);
$num_rows = mysql_num_rows($result);
if ($num_rows > 0) {
	$row = mysql_fetch_assoc($result);
	$lastproc = $row["lastproc"];
	$datetestA = strtotime($lastproc);
}
else {
	$datetestA = 0;
}

$connect2 = mysql_connect($sqlserver2,$sqluser2,$sqlpass2);

$counter = 1;

// Date of the most recent matching request; stays null when no log line
// matched, in which case no lastproc row is written at the end.
$request_date = null;

foreach($log_file as $archivename=>$archivelog) {
	$logf = $log_dir . $archivelog;
	$archive_name = $archivename;

	// Titles are cached per archive: eprint IDs are only unique within
	// one archive, so the cache must not be shared between log files.
	$eprintname = array();

	$handle = fopen($logf, "r");
	if ($handle === FALSE) {
		// Without this check feof() on an invalid handle never becomes
		// true and the read loop would spin forever.
		print "Unable to open log file " . $logf . ", skipping.\n";
		continue;
	}

	while (($buffer = fgets($handle, 4096)) !== FALSE) {
		// NJS 2005-11-25 Added regexp for EPrints short URLs.
		if ((preg_match("/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*/i",$buffer,$matches)) ||
		    (preg_match("/^(\S{1,}\.\S{1,}\.\S{1,}\.\S{1,}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*/i",$buffer,$matches)) ||
		    (preg_match("/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/(\d{1,4}).*? HTTP\/1..\" 200 .*/i",$buffer,$matches)))
		{
			$counter++;
			$country_code = '';
			$country_name = '';
			$insertid = '';
			$eprint_name = '';
			$view_type = '';
			$uniquebits = '';

			$ip = $matches[1];

			/* NJS 2005-12-16
			   Determine country code and name.
			   Check whether the IP number falls into any of the local
			   intranet ranges. If so, then use that.
			*/
			$ip_long = ip2long($ip);
			$found_country = FALSE;
			foreach ($local_IPs as $id => $addresses)
			{
				foreach ($addresses as $ip_range)
				{
					if (is_array($ip_range)) // check against lower/upper bounds
					{
						$found_country = (($ip_long >= $ip_range['lower'])
							&& ($ip_long <= $ip_range['upper']));
					}
					else if (is_long($ip_range)) // data type sanity check
					{
						$found_country = ($ip_long == $ip_range);
					}
					else // something is seriously broken, ignore this entry
					{
						print "Unsupported data type " . gettype($ip_range) .
							" (value " . $ip_range .
							") in \$local_IPs (expected long).\n";
						continue;
					}

					// Stop only once an entry has matched; breaking
					// unconditionally would test just the first entry
					// of each range and ignore the rest.
					if ($found_country) break;
				}

				if ($found_country)
				{
					list($country_name, $country_code) = explode(',', $id);
					break;
				}
			}

			// Otherwise, fall back to GeoIP.
			if (!$found_country)
			{
				$country_code = geoip_country_code_by_addr($gi, $ip);
				$country_name = geoip_country_name_by_addr($gi, $ip);
			}
			// end NJS 2005-12-16

			$date = $matches[2];
			$archive = $matches[3];
			$uniquebits = $buffer;

			// Reduce "DD/Mon/YYYY:HH:MM:SS +ZZZZ" to "DD Mon YYYY".
			$date = preg_replace("/:.*/","",$date);
			$date = preg_replace("/\//", " ", $date);
			$when = getdate(strtotime($date));
			$request_date = $when["year"]."-".$when["mon"]."-".$when["mday"];
			$datetestB = strtotime($request_date);

			/* NJS 2006-04-25
			   Changed date comparison to <= from < to avoid the problem
			   of counting multiple times downloads whose
			   $request_date == $lastproc. This only occurred if you ran
			   this script several times per day. For example, if you ran
			   this script five times per day, all the downloads that
			   occurred during that day would be counted EVERY TIME this
			   script ran, thus overinflating your stats by a factor of
			   up to five :(

			   If finer granularity for stats updates is desired, the
			   solution would be to use the full timestamp rather than
			   just the date.
			*/
			if ($datetestB <= $datetestA)
				continue;

			// NJS 2005-11-25 Added regexp for EPrints short URLs.
			if(preg_match("/GET \/archive\/0{1,8}\d{1,4}\/\d\d\//i",$buffer) || preg_match("/GET \/\d{1,4}\/\d\d\//i",$buffer)) {
				$view_type = "download";
			} else {
				$view_type = "abstract";
			}

			if(isset($eprintname[$archive])) {
				$eprint_name = $eprintname[$archive];
			} else {
				$eprint_name = getePrintName($archive_name,$archive);
				$eprintname[$archive] = $eprint_name;
			}

			if($eprint_name=='') {
				// Do nothing.
			} else {
				$eprint_name = mysql_escape_string($eprint_name);
				/* NJS 2006-04-25
				   Requests containing apostrophes (') are dumped by
				   MySQL unless we escape them. Looking in the GeoIP
				   files I also see country names with apostrophes, so
				   escape that as well. Everything else should be fine.
				*/
				$uniquebits = mysql_escape_string($uniquebits);
				$country_name = mysql_escape_string($country_name);
				// end NJS 2006-04-25

				// The host field may be any non-blank token taken from
				// the log (second regexp above), so escape it too, along
				// with the configured archive name. $archive is digits
				// only by construction of the regexps.
				$ip_sql = mysql_escape_string($ip);
				$archive_name_sql = mysql_escape_string($archive_name);

				$query = "
				INSERT into view (uniquebits,archive_name,ip,request_date,archiveid,country_code,country_name,view_type,eprint_name)
				values('".$uniquebits."','".$archive_name_sql."','".$ip_sql."','".$request_date."',".$archive.",'".$country_code."','".$country_name."','".$view_type."','".$eprint_name."')";
				$result = mysql_query($query,$connect);
				$insertid = mysql_insert_id($connect);
			}

		} else {
			// print "NO match" . "\n";
		}
	}
	fclose($handle);
}

/*
	Keep track of where we are. Should avoid duplication of results
	if the script is run more than once on the same log file.

	Only record a date when at least one request was parsed; writing an
	empty value would make the next run re-count the whole log.
*/
if ($request_date !== null) {
	$query = "INSERT into lastproc (lastproc) values('".$request_date."')";
	$result = mysql_query($query,$connect);
}

#print "Records counted: $counter\n";
#print "Last count: $request_date\n";
mysql_close($connect2);
mysql_close($connect);

/*
	Look up the title of an eprint.

	$db        name of the ePrints database to select on $connect2
	$eprintid  numeric ID of the eprint

	Returns the title with surrounding whitespace trimmed and internal
	runs of whitespace collapsed to single spaces, or the string
	"Unknown item (<id>)" when no matching row can be read.
*/
function getePrintName($db,$eprintid) {
	global $connect2;

	mysql_select_db($db,$connect2);

	// Cast so that only a number can ever reach the SQL text.
	$query3 = "select title from archive where eprintid = " . (int) $eprintid;
	$result3 = mysql_query($query3,$connect2);

	// NJS 2006-04-25 Added check for empty result, probably a deleted item.
	// A failed query is treated the same way rather than passing FALSE
	// on to mysql_num_rows().
	if ($result3 === FALSE || mysql_num_rows($result3) == 0) {
		return "Unknown item ($eprintid)";
	}

	$row = mysql_fetch_assoc($result3);
	$title = trim($row["title"]);
	return preg_replace("/\s+/"," ",$title);
}

?>
Show line notes below