Newer
Older
Digital_Repository / Repositories / statistics / scripts / eprints-usage_src.php
  1. <?php
  2.  
  3. // NJS 2005-12-09 Switched to GeoIP from GeoIP:IPfree.
  // Pull in the GeoIP helper library, then open the country database in
  // standard mode. The handle in $gi is used later for per-request country
  // lookups. NOTE(review): ##GEOIP_DATABASE## looks like an install-time
  // placeholder for the database path -- confirm against the build scripts.
  include 'geoip.inc';

  $gi = geoip_open('##GEOIP_DATABASE##', GEOIP_STANDARD);
  7.  
  8. /*
  9.  
  10. Apache log for ePrints uses this format:
  11. LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"" combined
  12.  
  13. If the log format differs the regular expression matching would need to be adjusted.
  14. Parse:
  15. ip
  16. date YYYY MM DD
  17. archive ID
  18.  
  19. */
  20.  
  21. // Web server log files
  // Directory prefix for the log files; it is concatenated directly with
  // the file names below, so it must end with a path separator.
  $log_dir = '##APACHE_LOG_LOCATION##';

  // One entry per archive: archive name => log file name inside $log_dir.
  // The archive name is also used as the name of the ePrints database
  // that titles are looked up in.
  $log_file = array();
  $log_file['otago_eprints'] = '##APACHE_LOG_NAME##';


  // Statistics database (eprintstats) that this script writes to.
  // NOTE(review): credentials are stored in plain text in this file --
  // consider moving them out of version control.
  $sqlserver   = 'localhost';
  $sqluser     = 'eprintstatspriv';
  $sqlpass     = 'AuldGrizzel';
  $sqldatabase = 'eprintstats';

  // Connection details for the ePrints installation itself (read to
  // resolve ePrint titles).
  $sqlserver2 = 'localhost';
  $sqluser2   = 'otago_eprints';
  $sqlpass2   = 'DrSyntaxRidesAgain';
  38.  
  39. /* NJS 2005-12-16
  40. IP address ranges for your local Intranet(s). You can have multiple
  41. ranges of IP addresses, each with a different "country name", so that
  42. they will appear as separate entries in the by country stats pages.
  43. You should use a different country code for each range (ISO 3166-1
  44. specifies the range XA through XZ as "user-assignable", so you can use
  45. codes from there as necessary), and create flag icons as appropriate.
  46.  
  47. Each address range key is the name that will appear in the statistics
  48. database (the "country name"), followed by a comma, followed by the
  49. appropriate ISO 3166-1 country code as noted above. Each entry in the
  50. range is either a single IP address, or an array specifying a lower and
  51. upper bound for a contiguous IP address range (see example below).
  52.  
  53. All IP addresses must be converted to long values using the ip2long()
  54. function before being stored.
  55.  
  56. Note that address ranges may overlap. The script will use the first
  57. range that matches a given IP, so list the ranges in the correct order
  58. of precedence for your needs.
  59.  
  60. Example:
  61.  
  62. $local_IPs = array(
  63. 'Repository Admin,XA' => array(
  64. ip2long('192.168.1.5'),
  65. ip2long('192.168.1.22'),
  66. array(
  67. 'lower' => ip2long('192.168.1.30'),
  68. 'upper' => ip2long('192.168.1.35'),
  69. ),
  70. ),
  71. 'Our Intranet,XI' => array(
  72. array(
  73. 'lower' => ip2long('192.168.1.0'),
  74. 'upper' => ip2long('192.168.255.255'),
  75. ),
  76. ),
  77. );
  78.  
  79. 'Repository Admin' covers the IP addresses 192.168.1.5, 192.168.1.22 and
  80. the range 192.168.1.30 to 192.168.1.35, inclusive. 'Our Intranet' covers
  81. the range 192.168.1.0 to 192.168.255.255, inclusive. A machine will only
  82. match the 'Our Intranet' range if it first fails to match the
  83. 'Repository Admin' range.
  84. */
  // Local address groups, listed in order of precedence. Each key is
  // "country name,country code"; each value is a list whose entries are
  // either a single ip2long() value or a 'lower'/'upper' pair of bounds.
  $local_IPs = array();

  // Individual machines belonging to the repository administrators.
  $local_IPs['Repository Admin,XA'] = array_map('ip2long', array(
      '139.80.75.110',  // Nigel @ Uni
      '60.234.209.74',  // Nigel @ home
      '139.80.92.138',  // Monica & Jeremy @ Uni
      '139.80.92.151',  // Monica & Jeremy @ Uni
      '203.89.162.155', // Monica @ home
      '139.80.81.50',   // eprints.otago.ac.nz
  ));

  // One contiguous range covering the rest of the intranet.
  $local_IPs['Otago Intranet,XI'] = array(
      array(
          'lower' => ip2long('139.80.0.0'),
          'upper' => ip2long('139.80.127.255'),
      ),
  );
  101.  
  102. ###########################################
  103. ##
  104. ## No configuration required below here.
  105. ##
  106. ###########################################
  107.  
  // Open a persistent connection to the statistics database. Stop at once
  // if either the connection or the database selection fails.
  $connect = mysql_pconnect($sqlserver, $sqluser, $sqlpass);
  if (!$connect) {
      die("Could not connect");
  }
  $db = mysql_select_db($sqldatabase, $connect) or die("Could not connect");

  // First get the date of last update. $datetestA is the timestamp of the
  // most recently recorded run, or 0 when nothing has been recorded yet.
  $query = "select lastproc from lastproc order by timeinsert desc limit 1";
  $result = mysql_query($query, $connect);
  if ($result === false) {
      // Carrying on here would treat the log as never processed and
      // re-import every request, so abort instead.
      die("Could not read lastproc: " . mysql_error($connect));
  }

  $datetestA = 0;
  if (mysql_num_rows($result) > 0) {
      $row = mysql_fetch_assoc($result);
      $lastproc = $row["lastproc"];
      $parsed = strtotime($lastproc);
      // strtotime() reports failure as false (or -1 on old PHP versions);
      // an unparseable value is treated the same as "no previous run".
      if ($parsed !== false && $parsed > 0) {
          $datetestA = $parsed;
      }
  }

  // Second connection, used only to look up ePrint titles. Without it
  // every title lookup comes back empty and no views are stored, while
  // the last-processed date would still advance.
  $connect2 = mysql_connect($sqlserver2, $sqluser2, $sqlpass2);
  if (!$connect2) {
      die("Could not connect to ePrints database");
  }
  $counter = 1;
  /**
   * Find the local (intranet) group that an IP address belongs to.
   *
   * Groups are tried in array order and every entry of a group is tested,
   * so the first group containing the address wins.
   *
   * @param mixed $ip_long   Result of ip2long() for the client address;
   *                         anything that is not an integer never matches.
   * @param array $local_IPs Map of "country name,country code" => list of
   *                         entries. An entry is a single ip2long() value
   *                         or an array of bounds, keyed 'lower'/'upper'
   *                         or given as a positional pair.
   * @return array|null array(country name, country code), or null when no
   *                    group contains the address.
   */
  function eprintsUsageLocalCountry($ip_long, $local_IPs)
  {
      // ip2long() returns false for host names; they cannot be in a range.
      if (!is_int($ip_long)) {
          return null;
      }
      foreach ($local_IPs as $id => $addresses) {
          foreach ($addresses as $ip_range) {
              if (is_array($ip_range)) { // check against lower/upper bounds
                  $lower = isset($ip_range['lower']) ? $ip_range['lower']
                      : (isset($ip_range[0]) ? $ip_range[0] : null);
                  $upper = isset($ip_range['upper']) ? $ip_range['upper']
                      : (isset($ip_range[1]) ? $ip_range[1] : null);
                  if (!is_int($lower) || !is_int($upper)) {
                      print "Malformed range in \$local_IPs (expected 'lower' and 'upper' bounds).\n";
                      continue;
                  }
                  $matched = ($ip_long >= $lower && $ip_long <= $upper);
              } else if (is_int($ip_range)) { // data type sanity check
                  $matched = ($ip_long == $ip_range);
              } else { // something is seriously broken, ignore this entry
                  print "Unsupported data type " . gettype($ip_range) .
                      " (value " . $ip_range .
                      ") in \$local_IPs (expected long).\n";
                  continue;
              }
              // Stop only on a hit; a miss must fall through to the next
              // entry of the same group.
              if ($matched) {
                  $parts = explode(',', $id);
                  return array($parts[0], isset($parts[1]) ? $parts[1] : '');
              }
          }
      }
      return null;
  }

  // Successful (status 200) GET requests that count as an ePrint view.
  // Capture groups: 1 = client address, 2 = timestamp, 3 = ePrint ID.
  //   - numeric client IP, long /archive/0000NNNN URL
  //   - client given as a dotted host name, long /archive/ URL
  //   - numeric client IP, short /NNNN URL (NJS 2005-11-25)
  $view_patterns = array(
      "/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*/i",
      "/^(\S{1,}\.\S{1,}\.\S{1,}\.\S{1,}) - - \[(.*?)\] \"GET \/archive\/0{1,8}(\d{1,4}).*? HTTP\/1..\" 200 .*/i",
      "/^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(.*?)\] \"GET \/(\d{1,4}).*? HTTP\/1..\" 200 .*/i",
  );

  foreach ($log_file as $archivename => $archivelog) {
      $logf = $log_dir . $archivelog;
      $archive_name = $archivename;
      $handle = fopen($logf, "r");
      if ($handle === false) {
          // feof() on a failed handle never becomes true, so skip this log
          // rather than looping forever.
          print "Could not open log file " . $logf . "\n";
          continue;
      }

      while (($buffer = fgets($handle, 4096)) !== false) {
          // The first pattern that matches supplies $matches.
          $is_view = FALSE;
          foreach ($view_patterns as $pattern) {
              if (preg_match($pattern, $buffer, $matches)) {
                  $is_view = TRUE;
                  break;
              }
          }
          if (!$is_view) {
              continue;
          }

          $counter++;
          $insertid = '';
          $ip = $matches[1];
          $archive = $matches[3];
          // The complete log line is stored with the view record.
          $uniquebits = $buffer;

          // Reduce "DD/Mon/YYYY:HH:MM:SS +ZZZZ" to "DD Mon YYYY", then
          // rebuild it as year-month-day (no zero padding).
          $date = preg_replace("/:.*/", "", $matches[2]);
          $date = preg_replace("/\//", " ", $date);
          $when = getdate(strtotime($date));
          $request_date = $when["year"] . "-" . $when["mon"] . "-" . $when["mday"];
          $datetestB = strtotime($request_date);
          // Requests from before the last recorded run are skipped. This is
          // checked before the country lookup so skipped lines cost no
          // GeoIP work.
          if ($datetestB < $datetestA) {
              continue;
          }

          /* NJS 2005-12-16
             Determine country code and name: a configured local range
             takes precedence, otherwise fall back to GeoIP.
          */
          $local = eprintsUsageLocalCountry(ip2long($ip), $local_IPs);
          if ($local !== null) {
              list($country_name, $country_code) = $local;
          } else {
              $country_code = geoip_country_code_by_addr($gi, $ip);
              $country_name = geoip_country_name_by_addr($gi, $ip);
          }
          // end NJS 2005-12-16

          // NJS 2005-11-25 Added regexp for EPrints short URLs.
          // A two-digit path segment after the ePrint ID marks a document
          // download; anything else is a view of the abstract page.
          if (preg_match("/GET \/archive\/0{1,8}\d{1,4}\/\d\d\//i", $buffer)
              || preg_match("/GET \/\d{1,4}\/\d\d\//i", $buffer)) {
              $view_type = "download";
          } else {
              $view_type = "abstract";
          }

          // Titles are cached per ePrint ID so each is fetched only once.
          if (isset($eprintname[$archive])) {
              $eprint_name = $eprintname[$archive];
          } else {
              $eprint_name = getePrintName($archive_name, $archive);
              $eprintname[$archive] = $eprint_name;
          }
          // No title (unknown ePrint or failed lookup): nothing to record.
          if ($eprint_name == '') {
              continue;
          }

          // Every string comes from the log file or an external lookup, so
          // all of them are escaped; the ID is forced to an integer.
          $query = sprintf(
              "INSERT into view (uniquebits,archive_name,ip,request_date,archiveid,country_code,country_name,view_type,eprint_name) "
              . "values('%s','%s','%s','%s',%d,'%s','%s','%s','%s')",
              mysql_real_escape_string($uniquebits, $connect),
              mysql_real_escape_string($archive_name, $connect),
              mysql_real_escape_string($ip, $connect),
              mysql_real_escape_string($request_date, $connect),
              (int) $archive,
              mysql_real_escape_string($country_code, $connect),
              mysql_real_escape_string($country_name, $connect),
              mysql_real_escape_string($view_type, $connect),
              mysql_real_escape_string($eprint_name, $connect)
          );
          // NOTE(review): the result is deliberately not checked here;
          // presumably a duplicate-key failure is expected when a log is
          // re-read -- confirm against the table definition.
          $result = mysql_query($query, $connect);
          $insertid = mysql_insert_id($connect);
      }
      fclose($handle);
  }
  227.  
  228. /*
  229. Keep track of where we are. Should avoid duplication of results
  230. if the script is run more than once on the same log file
  231. */
  232.  
  // Record the date of the last matching log line that was read.
  // $request_date only exists if at least one line matched; writing an
  // empty value would make the next run parse "no date" and process the
  // whole log again.
  if (isset($request_date)) {
      $query = "INSERT into lastproc (lastproc) values('".$request_date."')";
      $result = mysql_query($query,$connect);
  }

  #print "Records counted: $counter\n";
  #print "Last count: $request_date\n";
  mysql_close($connect2);
  mysql_close($connect);
  240.  
  /**
   * Look up the title of an ePrint in an archive's database.
   *
   * Selects database $db on the shared $connect2 connection and reads the
   * title of the given ePrint from its "archive" table.
   *
   * @param string $db       Name of the database to select.
   * @param mixed  $eprintid Numeric ePrint ID; it is cast to an integer
   *                         before being placed in the query.
   * @return string The title, trimmed and with each run of whitespace
   *                collapsed to a single space. Returns '' when the
   *                database cannot be selected, the query fails, or no
   *                title is found.
   */
  function getePrintName($db,$eprintid) {
      global $connect2;

      if (!mysql_select_db($db, $connect2)) {
          return '';
      }

      // The cast guarantees the ID can never change the query text.
      $query3 = "select title from archive where eprintid = " . (int) $eprintid;
      $result3 = mysql_query($query3, $connect2);
      if (!$result3) {
          return '';
      }

      // mysql_fetch_assoc() returns false when no row matches.
      $row = mysql_fetch_assoc($result3);
      if (!$row || !isset($row["title"])) {
          return '';
      }

      return preg_replace("/\s+/", " ", trim($row["title"]));
  }
  252.  
  253. ?>
  254.