GitBucket
4.21.2
Toggle navigation
Snippets
Sign in
Files
Branches
1
Releases
Issues
Pull requests
Labels
Priorities
Milestones
Wiki
Forks
nigel.stanger
/
Digital_Repository
Browse code
- Updated ADT harvester code to work with EPrints 2.3.13.
master
1 parent
9926031
commit
98c7eb8cb0faec99631f5bf30d2e00353ac6fba1
nstanger
authored
on 16 Dec 2005
Patch
Showing
3 changed files
Repositories/ADT/HarvestToADT.pl
Repositories/ADT/Settings.pm
Repositories/ADT/WriteADTpage.pm
Ignore Space
Show notes
View
Repositories/ADT/HarvestToADT.pl
#!/usr/bin/perl
#
# Export every research-higher-degree (RHD) thesis held in the EPrints
# archive as a static ADT page (one directory per thesis) and write a
# top-level index.html that links to each generated page.

use strict;
use warnings;

use DBI;

require Settings;

# Configuration globals assigned by Settings.pm (it has no package
# statement, so they live in package main).
our ($DB_NAME, $DB_HOST, $DB_USER, $DB_PASS);
our ($UNI_CODE, $BASE_URL, $ADT_URL, $ADT_DIR);

# Open a connection to the EPrints MySQL database.
# RaiseError makes every later DBI failure fatal, so a broken query (for
# example a renamed column) is reported instead of being read as "no rows".
sub _harvest_connect
{
    my $dbh = DBI->connect("DBI:mysql:$DB_NAME:$DB_HOST", $DB_USER, $DB_PASS,
                           { RaiseError => 1, PrintError => 0 })
        or die "Failed to connect to database $DB_NAME: $DBI::errstr\n";
    return $dbh;
}

# Run $sql with the given bind values and return the first result row as a
# list (the empty list when no row matches). The connection is always
# closed before returning.
sub _harvest_select_row
{
    my ($sql, @bind) = @_;

    my $dbh = _harvest_connect();
    my @row = $dbh->selectrow_array($sql, undef, @bind);
    $dbh->disconnect;

    return @row;
}

# Find out the maximum eprintid for the archive.
# Returns 0 when the archive table is empty.
sub LookUpEprintsMaxID
{
    my ($max_id) = _harvest_select_row("SELECT MAX(eprintid) FROM archive");
    return defined $max_id ? $max_id : 0;
}

# Determine if an eprint is a RHD thesis or not.
# Returns 1 when the record's thesis_type contains "phd" or "rmaster"
# (case-insensitively), 0 otherwise or when the eprint does not exist.
sub IsAnRHDThesis
{
    my ($ID) = @_;

    my ($type) = _harvest_select_row(
        "SELECT thesis_type FROM archive WHERE eprintid = ?", $ID);
    return 0 unless defined $type;

    return lc($type) =~ /phd|rmaster/ ? 1 : 0;
}

# Look up the date_effective value for the eprint with an eprintid of $ID.
# Returns 0 when the eprint does not exist; otherwise the stored value
# (which may be undef when the column is NULL).
sub LookUpEprintsDate
{
    my ($ID) = @_;

    my @row = _harvest_select_row(
        "SELECT date_effective FROM archive WHERE eprintid = ?", $ID);
    return @row ? $row[0] : 0;
}

require WriteADTpage;

# ------------------MAIN PROCESS-----------------

# If the ADT directory does not exist, create it
if (!-d $ADT_DIR) {
    mkdir($ADT_DIR, 0755)
        or die "Failed to create adt directory $ADT_DIR: $!\n";
}
chdir($ADT_DIR)
    or die "Failed to change working directory to $ADT_DIR: $!\n";

my $index_path = $ADT_DIR . "/index.html";
open(my $index_fh, '>', $index_path)
    or die "Failed to open/create file: $index_path: $!\n";
print {$index_fh} "<html><head><title>ADT theses</title></head><body><p> </p>\n";

# Look at all documents - if they exist and are a thesis then create a
# directory and an index page
my $max_id = LookUpEprintsMaxID();
for my $n (1 .. $max_id) {
    next unless IsAnRHDThesis($n);

    my $submitted_date = LookUpEprintsDate($n);
    $submitted_date = '' unless defined $submitted_date;
    my $dir_name = 'adt-' . $UNI_CODE . $submitted_date . '.' . sprintf("%04d", $n);

    # If the Eprint ADT directory does not exist, create it
    # (relative to $ADT_DIR, which is the current directory)
    if (!-d $dir_name) {
        mkdir($dir_name, 0755)
            or die "Failed to create adt directory $dir_name: $!\n";
    }

    my $file_name  = $dir_name . '/index.html';
    my $thesis_url = $BASE_URL . $ADT_URL . "/" . $file_name;

    if (WriteADTpage($file_name, $n)) {
        print {$index_fh} "<p><a href=\"$thesis_url\">$dir_name</a></p>\n";
    }
    else {
        print "Missing metadata for eprint ID: $n\n";
    }
}

print {$index_fh} "</body></html>\n";

# Buffered write errors only surface at close, so check it.
close($index_fh)
    or die "Failed to close file: $index_path: $!\n";
#!/usr/bin/perl
#
# Export every research-higher-degree (RHD) thesis held in the EPrints
# archive as a static ADT page (one directory per thesis) and write a
# top-level index.html that links to each generated page.

use strict;
use warnings;

use DBI;

require Settings;

# Configuration globals assigned by Settings.pm (it has no package
# statement, so they live in package main).
our ($DB_NAME, $DB_HOST, $DB_USER, $DB_PASS);
our ($UNI_CODE, $BASE_URL, $ADT_URL, $ADT_DIR);

# Open a connection to the EPrints MySQL database.
# RaiseError makes every later DBI failure fatal, so a broken query is
# reported instead of being read as "no rows".
sub _harvest_connect
{
    my $dbh = DBI->connect("DBI:mysql:$DB_NAME:$DB_HOST", $DB_USER, $DB_PASS,
                           { RaiseError => 1, PrintError => 0 })
        or die "Failed to connect to database $DB_NAME: $DBI::errstr\n";
    return $dbh;
}

# Run $sql with the given bind values and return the first result row as a
# list (the empty list when no row matches). The connection is always
# closed before returning.
sub _harvest_select_row
{
    my ($sql, @bind) = @_;

    my $dbh = _harvest_connect();
    my @row = $dbh->selectrow_array($sql, undef, @bind);
    $dbh->disconnect;

    return @row;
}

# Find out the maximum eprintid for the archive.
# Returns 0 when the archive table is empty.
sub LookUpEprintsMaxID
{
    my ($max_id) = _harvest_select_row("SELECT MAX(eprintid) FROM archive");
    return defined $max_id ? $max_id : 0;
}

# Determine if an eprint is a RHD thesis or not.
# Returns 1 when the record's thesistype contains "phd" or "rmaster"
# (case-insensitively), 0 otherwise or when the eprint does not exist.
sub IsAnRHDThesis
{
    my ($ID) = @_;

    my ($type) = _harvest_select_row(
        "SELECT thesistype FROM archive WHERE eprintid = ?", $ID);
    return 0 unless defined $type;

    return lc($type) =~ /phd|rmaster/ ? 1 : 0;
}

# Look up the year of publication for the eprint with an eprintid of $ID.
# Returns 0 when the eprint does not exist; otherwise the stored value
# (which may be undef when the column is NULL).
sub LookUpEprintsDate
{
    my ($ID) = @_;

    my @row = _harvest_select_row(
        "SELECT year FROM archive WHERE eprintid = ?", $ID);
    return @row ? $row[0] : 0;
}

require WriteADTpage;

# ------------------MAIN PROCESS-----------------

# If the ADT directory does not exist, create it
if (!-d $ADT_DIR) {
    mkdir($ADT_DIR, 0755)
        or die "Failed to create adt directory $ADT_DIR: $!\n";
}
chdir($ADT_DIR)
    or die "Failed to change working directory to $ADT_DIR: $!\n";

my $index_path = $ADT_DIR . "/index.html";
open(my $index_fh, '>', $index_path)
    or die "Failed to open/create file: $index_path: $!\n";
print {$index_fh} "<html><head><title>ADT theses</title></head><body><p> </p>\n";

# Look at all documents - if they exist and are a thesis then create a
# directory and an index page
my $max_id = LookUpEprintsMaxID();
for my $n (1 .. $max_id) {
    next unless IsAnRHDThesis($n);

    my $submitted_date = LookUpEprintsDate($n);
    $submitted_date = '' unless defined $submitted_date;
    my $dir_name = 'adt-' . $UNI_CODE . $submitted_date . '.' . sprintf("%04d", $n);

    # If the Eprint ADT directory does not exist, create it
    # (relative to $ADT_DIR, which is the current directory)
    if (!-d $dir_name) {
        mkdir($dir_name, 0755)
            or die "Failed to create adt directory $dir_name: $!\n";
    }

    my $file_name  = $dir_name . '/index.html';
    my $thesis_url = $BASE_URL . $ADT_URL . "/" . $file_name;

    if (WriteADTpage($file_name, $n)) {
        print {$index_fh} "<p><a href=\"$thesis_url\">$dir_name</a></p>\n";
    }
    else {
        print "Missing metadata for eprint ID: $n\n";
    }
}

print {$index_fh} "</body></html>\n";

# Buffered write errors only surface at close, so check it.
close($index_fh)
    or die "Failed to close file: $index_path: $!\n";
Ignore Space
Show notes
View
Repositories/ADT/Settings.pm
# Site configuration for the ADT export scripts.
# Every variable here is a plain package global read by the scripts that
# require this file.

# --- MySQL connection ---
# NOTE(review): the database password is stored in plain text in this file.
$DB_NAME = 'otago_eprints';
$DB_HOST = 'localhost';
$DB_USER = 'otago_adt';
$DB_PASS = 'ComethTheThesis';

# --- Institution ---
$UNI_CODE          = 'OU';                     # university specific code
$UNI_TEXT          = 'University of Otago';    # university description text
$EPRINT_SHORT_NAME = 'Otago EPrints';          # e-print archive short name

# --- Filesystem locations ---
# Physical location of the eprints archive, and the directory below it
# that receives the exported ADT HTML files.
$ARCHIVE_DIR = '/usr/local/eprints/archives/otago_eprints/html/en';
$ADT_DIR     = "$ARCHIVE_DIR/ADT";

# --- URLs ---
$BASE_URL = 'http://ou075110.otago.ac.nz:8008';             # base EPrints URL
$ADT_URL  = '/ADT';                                         # ADT URL relative to the base
$DOCS_URL = 'http://ou075110.otago.ac.nz:8008/archive/';    # start of attached-document URLs
$COPY_URL = '';                                             # university copyright URL

1;
# Site configuration for the ADT export scripts.
# Every variable here is a plain package global read by the scripts that
# require this file.

# --- MySQL connection ---
# NOTE(review): the database password is stored in plain text in this file.
$DB_NAME = 'UTasER';
$DB_HOST = 'localhost';
$DB_USER = 'eprintsADT';
$DB_PASS = 'genADTdata';

# --- Institution ---
$UNI_CODE          = 'TU';                        # university specific code
$UNI_TEXT          = 'University of Tasmania';    # university description text
$EPRINT_SHORT_NAME = 'UTasER';                    # e-print archive short name

# --- Filesystem locations ---
# Physical location of the eprints archive, and the directory below it
# that receives the exported ADT HTML files.
$ARCHIVE_DIR = '/home/leven/eprints/archives/UTasER/html/en';
$ADT_DIR     = "$ARCHIVE_DIR/ADT";

# --- URLs ---
$BASE_URL = 'http://eprints.comp.utas.edu.au:81';             # base EPrints URL
$ADT_URL  = '/ADT';                                           # ADT URL relative to the base
$DOCS_URL = 'http://eprints.comp.utas.edu.au:81/archive/';    # start of attached-document URLs
$COPY_URL = 'http://www.utas.edu.au/copyright/copyright_disclaimers.html';    # university copyright URL

1;
Ignore Space
Show notes
View
Repositories/ADT/WriteADTpage.pm
# Routines that render one EPrints record as a static ADT HTML page.
# This file has no package statement, so everything lives in the package
# of whichever script requires it.

use strict;
use warnings;

# Configuration globals assigned by Settings.pm.
our ($DB_NAME, $DB_HOST, $DB_USER, $DB_PASS);
our ($UNI_TEXT, $DOCS_URL, $COPY_URL);

# Pad $number with zeros up to a length of $length.
# A value that is already $length characters or longer is returned as is.
sub padZeros
{
    my ($number, $length) = @_;

    my $missing = $length - length($number);
    return $missing > 0 ? ('0' x $missing) . $number : $number;
}

# Return $text made safe for use inside a double-quoted HTML attribute:
# undef becomes the empty string and every double quote becomes &quot;.
sub _adt_attr
{
    my ($text) = @_;

    $text = '' unless defined $text;
    $text =~ s/"/&quot;/g;
    return $text;
}

# Build one <meta> line. $scheme is optional; $content is attribute-escaped.
sub _adt_meta
{
    my ($name, $content, $scheme) = @_;

    my $scheme_attr = defined $scheme ? qq{ scheme="$scheme"} : '';
    return qq{<meta name="$name"$scheme_attr content="} . _adt_attr($content) . qq{"/>\n};
}

# Return a reference to an array of hash references, one per row of $table
# whose $key_column equals $key. The key is bound as a placeholder; the
# table and column names come only from literals in this file.
sub _adt_fetch_rows
{
    my ($dbh, $table, $key_column, $key) = @_;

    return $dbh->selectall_arrayref(
        "SELECT * FROM $table WHERE $key_column = ?", { Slice => {} }, $key);
}

# Collect the metadata needed for the page of eprint $ID.
# Returns a hash reference (undefined columns mapped to ''), or nothing
# when any of the required tables has no matching row.
sub _adt_load_record
{
    my ($dbh, $ID) = @_;

    my $eprint = _adt_fetch_rows($dbh, 'archive', 'eprintid', $ID)->[0]
        or return;

    # Only the first creator row is used.
    my $creator = _adt_fetch_rows($dbh, 'archive_creators', 'eprintid', $ID)->[0]
        or return;

    # No page is produced unless the depositing user, at least one
    # document and at least one subject exist; their contents are not
    # needed for the page itself.
    for my $required (
        [ 'users',            'userid',   $eprint->{userid} ],
        [ 'document',         'eprintid', $ID ],
        [ 'archive_subjects', 'eprintid', $ID ],
    ) {
        return unless @{ _adt_fetch_rows($dbh, @{$required}) };
    }

    my %record = (
        title         => $eprint->{title},
        abstract      => $eprint->{abstract},
        publisheddate => $eprint->{date_effective},
        keywords      => $eprint->{keywords},
        dept          => $eprint->{department},
        thesis_type   => $eprint->{thesis_type},
        lastname      => $creator->{creators_family},
        firstname     => $creator->{creators_given},
        salutation    => $creator->{creators_honourific},
    );
    for my $value (values %record) {
        $value = '' unless defined $value;
    }

    return \%record;
}

# Convert an eprint with ID $ID to HTML and store it as file $FName.
# Returns 1 once the page has been written, or 0 (writing nothing) when the
# eprint or any of its required related rows is missing. Dies when the
# database cannot be queried or the file cannot be written.
sub WriteADTpage
{
    my ($FName, $ID) = @_;

    # --------------------------- GET THE EPRINTS METADATA -----------------------------
    require DBI;    # no-op when the calling script has already loaded it
    my $dbh = DBI->connect("DBI:mysql:$DB_NAME:$DB_HOST", $DB_USER, $DB_PASS,
                           { RaiseError => 1, PrintError => 0 })
        or die "Failed to connect to database $DB_NAME: $DBI::errstr\n";
    my $record = _adt_load_record($dbh, $ID);
    $dbh->disconnect;    # released on the "missing metadata" path as well
    return 0 unless $record;

    my $title         = $record->{title};
    my $publisheddate = $record->{publisheddate};
    my $dept          = $record->{dept};

    # OK, so now generate derived variables, first authors
    my $author       = $record->{firstname} . ' ' . $record->{lastname};
    my $authorrev    = $record->{lastname} . ', ' . $record->{firstname};
    my $authorallrev = $record->{lastname} . ', ' . $record->{salutation} . ' ' . $record->{firstname};

    # Now the URI
    my $URI = $DOCS_URL . padZeros($ID, 8) . "/";

    # Convert thesis_type to actual text; unknown codes are shown unchanged
    my %label_for = (
        phd     => "PhD thesis",
        honours => "Honours thesis",
        cmaster => "Coursework Master thesis",
        rmaster => "Research Master thesis",
        other   => "Other Degree thesis",
    );
    my $thesis_type = $record->{thesis_type};
    $thesis_type = $label_for{$thesis_type} if exists $label_for{$thesis_type};

    # ------------------------ START WRITING THE FILE --------------------------
    open(my $fh, '>', $FName)
        or die "Failed to open/create file: $FName: $!\n";

    print {$fh} "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"DTD/xhtml1-transitional.dtd\">\n<html><head>";

    # --------------------------- WRITE THE TITLE -----------------------------
    print {$fh} "<title>$title</title>\n";

    # --------------------------- WRITE THE ADT METADATA -----------------------------
    print {$fh} _adt_meta('DC.Title', $title);
    print {$fh} _adt_meta('DC.Creator', $authorrev);
    print {$fh} _adt_meta('DC.Creator.personalName', $authorrev);

    # Keywords are comma separated when any comma is present, otherwise
    # space separated; blank entries are skipped.
    my $keywords  = $record->{keywords};
    my $separator = $keywords =~ /,/ ? qr/,/ : qr/ /;
    for my $keyword (split $separator, $keywords) {
        $keyword =~ s/^\s+//;
        $keyword =~ s/\s+$//;
        next if $keyword eq '';
        print {$fh} _adt_meta('DC.subject', $keyword);
    }

    # Line breaks in the abstract are left for ADT to format; only the
    # double quotes are escaped (by _adt_meta) so the attribute stays valid.
    print {$fh} _adt_meta('DC.Description.abstract', $record->{abstract});
    print {$fh} _adt_meta('DC.Date', $publisheddate, 'W3CDTF');
    print {$fh} _adt_meta('DC.Date.valid', $publisheddate, 'W3CDTF');
    print {$fh} _adt_meta('DC.Language', 'en', 'RFC1766');
    print {$fh} _adt_meta('DC.Publisher', "$UNI_TEXT, $dept");
    print {$fh} _adt_meta('DC.Rights', $COPY_URL);
    print {$fh} _adt_meta('DC.Rights', "(c) Copyright $publisheddate $author");
    print {$fh} _adt_meta('DC.Identifier', $URI, 'URI');

    # --------------------------- WRITE THE PAGE -----------------------------
    print {$fh} <<"END_OF_PAGE";
</head>
<body bgcolor="#ffffff" text="#000000">
<!-- Thesis title -->
<h1 class="pagetitle">$title</h1>
<!-- Citation -->
<p> $authorallrev ($publisheddate) <i>$title</i>. $thesis_type, $dept, $UNI_TEXT.</p>
<!-- ID code -->
<p>Eprints ID Code: $ID</p>
</body>
</html>
END_OF_PAGE

    # Buffered write errors only surface at close, so check it.
    close($fh)
        or die "Failed to close file: $FName: $!\n";

    return 1;
}

1;
# Routines that render one EPrints record as a static ADT HTML page.
# This file has no package statement, so everything lives in the package
# of whichever script requires it.

use strict;
use warnings;

# Configuration globals assigned by Settings.pm.
our ($DB_NAME, $DB_HOST, $DB_USER, $DB_PASS);
our ($UNI_TEXT, $DOCS_URL, $COPY_URL);

# Pad $number with zeros up to a length of $length.
# A value that is already $length characters or longer is returned as is.
sub padZeros
{
    my ($number, $length) = @_;

    my $missing = $length - length($number);
    return $missing > 0 ? ('0' x $missing) . $number : $number;
}

# Return $text made safe for use inside a double-quoted HTML attribute:
# undef becomes the empty string and every double quote becomes &quot;.
sub _adt_attr
{
    my ($text) = @_;

    $text = '' unless defined $text;
    $text =~ s/"/&quot;/g;
    return $text;
}

# Build one <meta> line. $scheme is optional; $content is attribute-escaped.
sub _adt_meta
{
    my ($name, $content, $scheme) = @_;

    my $scheme_attr = defined $scheme ? qq{ scheme="$scheme"} : '';
    return qq{<meta name="$name"$scheme_attr content="} . _adt_attr($content) . qq{"/>\n};
}

# Return a reference to an array of hash references, one per row of $table
# whose $key_column equals $key. The key is bound as a placeholder; the
# table and column names come only from literals in this file.
sub _adt_fetch_rows
{
    my ($dbh, $table, $key_column, $key) = @_;

    return $dbh->selectall_arrayref(
        "SELECT * FROM $table WHERE $key_column = ?", { Slice => {} }, $key);
}

# Collect the metadata needed for the page of eprint $ID.
# Returns a hash reference (undefined columns mapped to ''), or nothing
# when any of the required tables has no matching row.
sub _adt_load_record
{
    my ($dbh, $ID) = @_;

    my $eprint = _adt_fetch_rows($dbh, 'archive', 'eprintid', $ID)->[0]
        or return;

    # Only the first author row is used.
    my $creator = _adt_fetch_rows($dbh, 'archive_authors', 'eprintid', $ID)->[0]
        or return;

    # No page is produced unless the depositing user, at least one
    # document and at least one subject exist; their contents are not
    # needed for the page itself.
    for my $required (
        [ 'users',            'userid',   $eprint->{userid} ],
        [ 'document',         'eprintid', $ID ],
        [ 'archive_subjects', 'eprintid', $ID ],
    ) {
        return unless @{ _adt_fetch_rows($dbh, @{$required}) };
    }

    my %record = (
        title         => $eprint->{title},
        abstract      => $eprint->{abstract},
        publisheddate => $eprint->{year},
        keywords      => $eprint->{keywords},
        dept          => $eprint->{department},
        thesistype    => $eprint->{thesistype},
        lastname      => $creator->{authors_family},
        firstname     => $creator->{authors_given},
        salutation    => $creator->{authors_honourific},
    );
    for my $value (values %record) {
        $value = '' unless defined $value;
    }

    return \%record;
}

# Convert an eprint with ID $ID to HTML and store it as file $FName.
# Returns 1 once the page has been written, or 0 (writing nothing) when the
# eprint or any of its required related rows is missing. Dies when the
# database cannot be queried or the file cannot be written.
sub WriteADTpage
{
    my ($FName, $ID) = @_;

    # --------------------------- GET THE EPRINTS METADATA -----------------------------
    require DBI;    # no-op when the calling script has already loaded it
    my $dbh = DBI->connect("DBI:mysql:$DB_NAME:$DB_HOST", $DB_USER, $DB_PASS,
                           { RaiseError => 1, PrintError => 0 })
        or die "Failed to connect to database $DB_NAME: $DBI::errstr\n";
    my $record = _adt_load_record($dbh, $ID);
    $dbh->disconnect;    # released on the "missing metadata" path as well
    return 0 unless $record;

    my $title         = $record->{title};
    my $publisheddate = $record->{publisheddate};
    my $dept          = $record->{dept};

    # OK, so now generate derived variables, first authors
    my $author       = $record->{firstname} . ' ' . $record->{lastname};
    my $authorrev    = $record->{lastname} . ', ' . $record->{firstname};
    my $authorallrev = $record->{lastname} . ', ' . $record->{salutation} . ' ' . $record->{firstname};

    # Now the URI
    my $URI = $DOCS_URL . padZeros($ID, 8) . "/";

    # Convert thesistype to actual text; unknown codes are shown unchanged
    my %label_for = (
        phd     => "PhD thesis",
        honours => "Honours thesis",
        cmaster => "Coursework Master thesis",
        rmaster => "Research Master thesis",
        other   => "Other Degree thesis",
    );
    my $thesistype = $record->{thesistype};
    $thesistype = $label_for{$thesistype} if exists $label_for{$thesistype};

    # ------------------------ START WRITING THE FILE --------------------------
    open(my $fh, '>', $FName)
        or die "Failed to open/create file: $FName: $!\n";

    print {$fh} "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"DTD/xhtml1-transitional.dtd\">\n<html><head>";

    # --------------------------- WRITE THE TITLE -----------------------------
    print {$fh} "<title>$title</title>\n";

    # --------------------------- WRITE THE ADT METADATA -----------------------------
    print {$fh} _adt_meta('DC.Title', $title);
    print {$fh} _adt_meta('DC.Creator', $authorrev);
    print {$fh} _adt_meta('DC.Creator.personalName', $authorrev);

    # Keywords are comma separated when any comma is present, otherwise
    # space separated; blank entries are skipped.
    my $keywords  = $record->{keywords};
    my $separator = $keywords =~ /,/ ? qr/,/ : qr/ /;
    for my $keyword (split $separator, $keywords) {
        $keyword =~ s/^\s+//;
        $keyword =~ s/\s+$//;
        next if $keyword eq '';
        print {$fh} _adt_meta('DC.subject', $keyword);
    }

    # Line breaks in the abstract are left for ADT to format; only the
    # double quotes are escaped (by _adt_meta) so the attribute stays valid.
    print {$fh} _adt_meta('DC.Description.abstract', $record->{abstract});
    print {$fh} _adt_meta('DC.Date', $publisheddate, 'W3CDTF');
    print {$fh} _adt_meta('DC.Date.valid', $publisheddate, 'W3CDTF');
    print {$fh} _adt_meta('DC.Language', 'en', 'RFC1766');
    print {$fh} _adt_meta('DC.Publisher', "$UNI_TEXT, $dept");
    print {$fh} _adt_meta('DC.Rights', $COPY_URL);
    print {$fh} _adt_meta('DC.Rights', "(c) Copyright $publisheddate $author");
    print {$fh} _adt_meta('DC.Identifier', $URI, 'URI');

    # --------------------------- WRITE THE PAGE -----------------------------
    print {$fh} <<"END_OF_PAGE";
</head>
<body bgcolor="#ffffff" text="#000000">
<!-- Thesis title -->
<h1 class="pagetitle">$title</h1>
<!-- Citation -->
<p> $authorallrev ($publisheddate) <i>$title</i>. $thesistype, $dept, $UNI_TEXT.</p>
<!-- ID code -->
<p>Eprints ID Code: $ID</p>
</body>
</html>
END_OF_PAGE

    # Buffered write errors only surface at close, so check it.
    close($fh)
        or die "Failed to close file: $FName: $!\n";

    return 1;
}

1;
Show line notes below