- Added base ADT harvesting source.

nigel.stanger / Digital_Repository

Browse code - Added base ADT harvesting source. master
1 parent f378f70 commit 99260314d60ab1cab4ff40d08cf93e52d9292436 nstanger authored on 16 Dec 2005

Patch

Showing 3 changed files

Ignore Space Show notes View Repositories/ADT/HarvestToADT.pl 0 → 100755
#!/usr/bin/perl use DBI; require Settings; sub LookUpEprintsMaxID # Find out the maximum eprintid for the archive { my $dbh = DBI->connect("DBI:mysql:$DB_NAME:$DB_HOST", $DB_USER, $DB_PASS); my $query = $dbh->prepare("SELECT eprintid FROM archive ORDER BY eprintid DESC"); $query->execute; my $numrows = $query->rows; my $maxID = 0; if ($numrows > 0) { @row = $query->fetchrow_array; $maxID = $row[0]; } $query->finish; $dbh->disconnect; return $maxID; } sub IsAnRHDThesis # Determine if an eprint is a RHD thesis or not { local ($ID, args) = @_; my $dbh = DBI->connect("DBI:mysql:$DB_NAME:$DB_HOST", $DB_USER, $DB_PASS); my $query = $dbh->prepare("SELECT thesistype FROM archive WHERE eprintid ='$ID'"); $query->execute; my $numrows = $query->rows; my $isathesis = 0; if ($numrows > 0) { @row = $query->fetchrow_array; my $type = "\L$row[0]"; if (($type =~ "phd") \|\| ($type =~ "rmaster")) { $isathesis = 1; } } $query->finish; $dbh->disconnect; return $isathesis; } sub LookUpEprintsDate # Lookup the year of publication for an eprint with an eprintID of $ID { local ($ID, args) = @_; my $dbh = DBI->connect("DBI:mysql:$DB_NAME:$DB_HOST", $DB_USER, $DB_PASS); my $query = $dbh->prepare("SELECT year FROM archive WHERE eprintid ='$ID'"); $query->execute; my $numrows = $query->rows; my $year = 0; if ($numrows > 0) { @row = $query->fetchrow_array; $year = $row[0]; } return $year; }; require WriteADTpage; # ------------------MAIN PROCESS----------------- # If the ADT directory does not exist, create it if (!(-e $ADT_DIR)) { mkdir($ADT_DIR, 0755) \|\| die "Failed to create adt directory\n"; } chdir($ADT_DIR) \|\| die "Failed to change working directory\n"; my $indexfile = ">" . $ADT_DIR . 
"/index.html"; open(INDEXFILEHANDLE, $indexfile) or die "Failed to open/create file: $indexfile\n"; print INDEXFILEHANDLE "<html><head><title>ADT theses</title></head><body><p> </p>\n"; # Look at all documents - if they exist and are a thesis then create a directory and an index page $MaxID = LookUpEprintsMaxID(); for ($n = 1; $n <= $MaxID; $n++) { if (IsAnRHDThesis($n)) { $SubmittedDate = LookUpEprintsDate($n); $DirName = 'adt-' . $UNI_CODE . $SubmittedDate . '.' . sprintf("%04d", $n); # If the Eprint ADT directory does not exist, create it if (!(-e $DirName)) { mkdir($DirName, 0755) \|\| die "Failed to create adt directory\n"; } $FileName = $DirName . '/index.html'; $ThesisURL = $BASE_URL . $ADT_URL . "/" . $FileName; if (!(WriteADTpage($FileName, $n))) { print "Missing metadata for eprint ID: $n\n"; } else { print INDEXFILEHANDLE "<p><a href=$ThesisURL>$DirName</a></p>\n"; } } } print INDEXFILEHANDLE "</body></html>\n"; close INDEXFILEHANDLE;

Ignore Space Show notes View

Repositories/ADT/HarvestToADT.pl 0 → 100755

#!/usr/bin/perl

use DBI;

require Settings;

# LookUpEprintsMaxID()
#
# Find out the maximum eprintid for the archive.
#
# Takes no arguments.  Connects to the MySQL database described by the
# $DB_NAME / $DB_HOST / $DB_USER / $DB_PASS globals and returns the largest
# eprintid in the `archive` table, or 0 when the table has no rows.
# Dies if the database connection cannot be opened.
sub LookUpEprintsMaxID
{
	my $dbh = DBI->connect("DBI:mysql:$DB_NAME:$DB_HOST", $DB_USER, $DB_PASS)
		or die "Failed to connect to database $DB_NAME: $DBI::errstr\n";

	# Let the database compute the maximum rather than sorting and fetching
	# every eprintid just to read the first one.
	my ($maxID) = $dbh->selectrow_array("SELECT MAX(eprintid) FROM archive");

	$dbh->disconnect;

	# MAX() over an empty table yields NULL, which DBI hands back as undef.
	return defined $maxID ? $maxID : 0;
}

# IsAnRHDThesis($ID)
#
# Determine if an eprint is a RHD thesis or not.
#
#   $ID - eprintid to look up in the `archive` table.
#
# Returns 1 when the eprint's thesistype, compared case-insensitively,
# contains "phd" or "rmaster"; returns 0 otherwise, including when no such
# eprint exists or its thesistype is NULL.
# Dies if the database connection cannot be opened.
sub IsAnRHDThesis
{
	my ($ID) = @_;

	my $dbh = DBI->connect("DBI:mysql:$DB_NAME:$DB_HOST", $DB_USER, $DB_PASS)
		or die "Failed to connect to database $DB_NAME: $DBI::errstr\n";

	# Bind the id as a placeholder instead of interpolating it into the SQL.
	my ($type) = $dbh->selectrow_array(
		"SELECT thesistype FROM archive WHERE eprintid = ?", undef, $ID);

	$dbh->disconnect;

	return 0 unless defined $type;

	# Substring match on the lower-cased type, as before.
	return (lc($type) =~ /phd|rmaster/) ? 1 : 0;
}

# LookUpEprintsDate($ID)
#
# Lookup the year of publication for an eprint with an eprintID of $ID.
#
#   $ID - eprintid to look up in the `archive` table.
#
# Returns the value of the `year` column for that eprint (which may be
# undef if the column is NULL), or 0 when no row matches.
# Dies if the database connection cannot be opened.
sub LookUpEprintsDate
{
	my ($ID) = @_;

	my $dbh = DBI->connect("DBI:mysql:$DB_NAME:$DB_HOST", $DB_USER, $DB_PASS)
		or die "Failed to connect to database $DB_NAME: $DBI::errstr\n";

	# In list context selectrow_array returns an empty list when there is no
	# matching row, which lets us tell "no row" apart from a NULL year.
	my @row = $dbh->selectrow_array(
		"SELECT year FROM archive WHERE eprintid = ?", undef, $ID);

	# Release the connection; this sub is called once per thesis.
	$dbh->disconnect;

	return @row ? $row[0] : 0;
}

require WriteADTpage;

# ------------------MAIN PROCESS-----------------
#
# Builds the ADT export: one sub-directory (with an index.html written by
# WriteADTpage) per RHD thesis, plus a top-level index.html that links to
# every thesis page that could be generated.

# If the ADT directory does not exist, create it
if (!(-d $ADT_DIR)) {
	mkdir($ADT_DIR, 0755) or die "Failed to create adt directory $ADT_DIR: $!\n";
}
chdir($ADT_DIR) or die "Failed to change working directory to $ADT_DIR: $!\n";

# Three-argument open with a lexical handle: the mode can no longer be
# influenced by the contents of the path.
my $indexfile = $ADT_DIR . "/index.html";
open(my $index_fh, '>', $indexfile)
	or die "Failed to open/create file: $indexfile: $!\n";
print {$index_fh} "<html><head><title>ADT theses</title></head><body>&nbsp;\n";

# Look at all documents - if they exist and are a thesis then create a directory and an index page
my $MaxID = LookUpEprintsMaxID();
for my $n (1 .. $MaxID) {
	next unless IsAnRHDThesis($n);

	# Directory name is adt-<uni code><year>.<zero-padded eprint id>
	my $SubmittedDate = LookUpEprintsDate($n);
	my $DirName = 'adt-' . $UNI_CODE . $SubmittedDate . '.' . sprintf("%04d", $n);

	# If the Eprint ADT directory does not exist, create it
	if (!(-d $DirName)) {
		mkdir($DirName, 0755) or die "Failed to create adt directory $DirName: $!\n";
	}

	my $FileName  = $DirName . '/index.html';
	my $ThesisURL = $BASE_URL . $ADT_URL . "/" . $FileName;

	if (WriteADTpage($FileName, $n)) {
		# Quote the attribute value so the link is well-formed HTML.
		print {$index_fh} "<a href=\"$ThesisURL\">$DirName</a>\n";
	}
	else {
		# WriteADTpage returns false when a required record is missing.
		print "Missing metadata for eprint ID: $n\n";
	}
}

print {$index_fh} "</body></html>\n";

# Buffered write errors only surface at close, so check it.
close($index_fh) or die "Failed to write file: $indexfile: $!\n";

Ignore Space Show notes View Repositories/ADT/Settings.pm 0 → 100755
# Settings.pm - site-specific configuration for the ADT harvesting scripts.
#
# Every value is a package global, so code that does `require Settings;`
# can read it by its bare name.  One statement per line: a `#` comment runs
# to the end of the physical line, so these must never be joined together.

# MySQL database parameters
# NOTE(review): the database password is stored in clear text here;
# consider restricting the file's permissions or loading it from elsewhere.
our $DB_NAME = "UTasER";
our $DB_HOST = "localhost";
our $DB_USER = "eprintsADT";
our $DB_PASS = "genADTdata";

# University Specific Code
our $UNI_CODE = 'TU';

# University Description Text
our $UNI_TEXT = 'University of Tasmania';

# E-print Archive Short Name
our $EPRINT_SHORT_NAME = "UTasER";

# Where the eprints archive is physically located
our $ARCHIVE_DIR = "/home/leven/eprints/archives/UTasER/html/en";

# Base Eprints URL
our $BASE_URL = "http://eprints.comp.utas.edu.au:81";

# ADT relative URL
our $ADT_URL = "/ADT";

# Where to store the ADT exported HTML files
our $ADT_DIR = $ARCHIVE_DIR . "/ADT";

# Starting URL for eprints attached documents
our $DOCS_URL = "http://eprints.comp.utas.edu.au:81/archive/";

# University copyright URL
our $COPY_URL = "http://www.utas.edu.au/copyright/copyright_disclaimers.html";

# A required file must end with a true value.
1;

Ignore Space Show notes View

Repositories/ADT/Settings.pm 0 → 100755

Ignore Space Show notes View Repositories/ADT/WriteADTpage.pm 0 → 100755
sub padZeros # Pad $number with zeros up to a length of $length { local ($number, $length, @args) = @_; my $num_length = length ($number); my $num_zeros = $length - $num_length; my $zeros = ""; for ($i=1;$i<=$num_zeros;$i++) { $zeros = $zeros . "0"; } $number = $zeros . $number; return $number; } sub WriteADTpage # Convert an eprint with ID $ID to HTML and store it as file $FName { local ($FName, $ID, @args) = @_; # --------------------------- GET THE EPRINTS METADATA ----------------------------- # These values would be dragged out from the database, not assigned my $dbh = DBI->connect("DBI:mysql:$DB_NAME:$DB_HOST", $DB_USER, $DB_PASS); my $query = $dbh->prepare("SELECT * FROM archive WHERE eprintid ='$ID'"); $query->execute; my $numrows = $query->rows; if ($numrows > 0) { $row = $query->fetchrow_hashref; $title = $row->{title}; $abstract = $row->{abstract}; $publisheddate = $row->{year}; $keywords = $row->{keywords}; $dept = $row->{department}; $thesistype = $row->{thesistype}; $depositID = $row->{userid}; $depositedon = $row->{datestamp}; } else { # die "No records found in table 'archive' for eprint id $ID\n"; return 0; } $query->finish; my $query = $dbh->prepare("SELECT * FROM archive_authors WHERE eprintid ='$ID'"); $query->execute; my $numrows = $query->rows; if ($numrows > 0) { $row = $query->fetchrow_hashref; $lastname = $row->{authors_family}; $firstname = $row->{authors_given}; $salutation = $row->{authors_honourific}; } else { # die "No records found in table 'archive_authors' for eprint id $ID\n"; return 0; } $query->finish; my $query = $dbh->prepare("SELECT * FROM users WHERE userid ='$depositID'"); $query->execute; my $numrows = $query->rows; if ($numrows > 0) { $row = $query->fetchrow_hashref; $depositedby = $row->{name_family} . ", " . 
$row->{name_given}; } else { # die "No records found in table 'users' for user id $depositID\n"; return 0; } $query->finish; my $query = $dbh->prepare("SELECT * FROM document WHERE eprintid ='$ID'"); $query->execute; my $numrows = $query->rows; if ($numrows > 0) { $i = 0; undef @documentIDs; undef @documentFORMATs; undef @documentFORMATDESCs; undef @documentMAINs; while ($row = $query->fetchrow_hashref) { $documentIDs[$i] = $row->{docid}; $documentFORMATs[$i] = $row->{format}; $documentFORMATDESCs[$i] = $row->{formatdesc}; $documentMAINs[$i] = $row->{main}; $i++; } } else { # die "No records found in table 'document' for eprint id $ID\n"; return 0; } $query->finish; my $query = $dbh->prepare("SELECT * FROM archive_subjects WHERE eprintid ='$ID'"); $query->execute; my $numrows = $query->rows; if ($numrows > 0) { $i = 0; undef @subjectIDs; while ($row = $query->fetchrow_hashref) { $subjectIDs[$i] = $row->{subjects}; $i++; } } else { # die "No records found in table 'archive_subjects' for eprint id $ID\n"; return 0; } $query->finish; $dbh->disconnect; # OK, so now generate derived variables, first authors $author = $firstname . ' ' . $lastname; $authorrev = $lastname . ', ' . $firstname; $authorallrev = $lastname . ', ' . $salutation . ' ' . $firstname; # Now the URI $URI = $DOCS_URL . padZeros ($ID, 8) . "/"; # Convert thesistype to actual text if ($thesistype eq "phd") { $thesistype = "PhD thesis"; } elsif ($thesistype eq "honours") { $thesistype = "Honours thesis"; } elsif ($thesistype eq "cmaster") { $thesistype = "Coursework Master thesis"; } elsif ($thesistype eq "rmaster") { $thesistype = "Research Master thesis"; } elsif ($thesistype eq "other") { $thesistype = "Other Degree thesis"; } # Convert abstract so that blank lines become <BR> $converted_Abstract = $abstract; # $converted_Abstract =~ s/(\n)/<BR>/g; # Perhaps leave that up to ADT to format. 
$converted_Abstract =~ s/"/"/g; $abstract = $converted_Abstract; # ------------------------ START WRITING THE FILE -------------------------- my $filename = "> " . $FName; open(FILEHANDLE, $filename) or die "Failed to open/create file: $FName\n"; # So write the file print FILEHANDLE "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"DTD/xhtml1-transitional.dtd\"> <html><head>"; # --------------------------- WRITE THE TITLE ----------------------------- print FILEHANDLE "<title>$title</title>\n"; # --------------------------- WRITE THE ADT METADATA ----------------------------- print FILEHANDLE "<meta name=\"DC.Title\" content=\"$title\"/>\n"; print FILEHANDLE "<meta name=\"DC.Creator\" content=\"$authorrev\"/>\n"; print FILEHANDLE "<meta name=\"DC.Creator.personalName\" content=\"$authorrev\"/>\n"; # generate sequence of keywords if ($keywords =~ ",") { @keywordslist = split (/,/,$keywords); } else { @keywordslist = split (/\ /,$keywords); } my $nokeywordslist = @keywordslist; for ($i = 0; $i < $nokeywordslist; $i++) { $keywordslist[$i] =~ s/^( +)//; # remove all spaces at start print FILEHANDLE "<meta name=\"DC.subject\" content=\"$keywordslist[$i]\"/>\n"; } print FILEHANDLE "<meta name=\"DC.Description.abstract\" content=\"$abstract\"/>\n"; print FILEHANDLE "<meta name=\"DC.Date\" scheme=\"W3CDTF\" content=\"$publisheddate\"/>\n"; print FILEHANDLE "<meta name=\"DC.Date.valid\" scheme=\"W3CDTF\" content=\"$publisheddate\"/>\n"; print FILEHANDLE "<meta name=\"DC.Language\" scheme=\"RFC1766\" content=\"en\"/>\n"; print FILEHANDLE "<meta name=\"DC.Publisher\" content=\"$UNI_TEXT, $dept\"/>\n"; print FILEHANDLE "<meta name=\"DC.Rights\" content=\"$COPY_URL\"/>\n"; print FILEHANDLE "<meta name=\"DC.Rights\" content=\"(c) Copyright $publisheddate $author\"/>\n"; print FILEHANDLE "<meta name=\"DC.Identifier\" scheme=\"URI\" content=\"$URI\"/>\n"; #// --------------------------- WRITE THE PAGE ----------------------------- print FILEHANDLE "</head> <body 
bgcolor=\"#ffffff\" text=\"#000000\"> <!-- Thesis title --> <h1 class=\"pagetitle\">$title</h1> <!-- Citation --> <p> $authorallrev ($publisheddate) <i>$title</i>. $thesistype, $dept, $UNI_TEXT.</p> <!-- ID code --> <p>Eprints ID Code: $ID</p> </body> </html>\n"; #// END OF FUNCTION } 1;

Ignore Space Show notes View

Repositories/ADT/WriteADTpage.pm 0 → 100755

# padZeros($number, $length)
#
# Pad $number with zeros up to a length of $length.
#
#   $number - value to pad; treated as a string.
#   $length - minimum length of the result.
#
# Returns $number with enough "0" characters prepended to make it $length
# characters long.  A value that is already at least $length characters long
# is returned unchanged (it is never truncated).
sub padZeros
{
	# Lexicals only: nothing here touches package globals such as $i, which
	# other subs in this file use as a loop counter.
	my ($number, $length) = @_;

	my $num_zeros = $length - length($number);
	return $number if $num_zeros <= 0;

	return ('0' x $num_zeros) . $number;
}

# WriteADTpage($FName, $ID)
#
# Convert an eprint with ID $ID to HTML and store it as file $FName.
#
#   $FName - path of the HTML file to create (overwritten if it exists).
#   $ID    - eprintid of the record to export.
#
# Returns 1 once the page has been written.  Returns 0, without touching
# $FName, when any of the required records is missing: the eprint itself,
# its author, the depositing user, at least one document, or at least one
# subject.
# Dies if the database connection cannot be opened or the file cannot be
# created or written.
#
# All working values are lexical; nothing is left behind in package globals.
sub WriteADTpage
{
	my ($FName, $ID) = @_;

# --------------------------- GET THE EPRINTS METADATA -----------------------------

	my $meta = _adt_load_metadata($ID) or return 0;
	my $eprint = $meta->{eprint};
	my $person = $meta->{author};

	# Derived author strings.  NULL name parts are treated as empty.
	my ($lastname, $firstname, $salutation) =
		map { defined $_ ? $_ : '' }
		@{$person}{qw(authors_family authors_given authors_honourific)};
	my $author       = $firstname . ' ' . $lastname;
	my $authorrev    = $lastname . ', ' . $firstname;
	# Skip an empty honorific so it does not leave a doubled space.
	my $authorallrev = $lastname . ', '
		. join(' ', grep { length } $salutation, $firstname);

	# Now the URI
	my $URI = $DOCS_URL . padZeros($ID, 8) . "/";

	# Convert thesistype to actual text.  The lookup is case-insensitive so
	# that it agrees with IsAnRHDThesis, which lower-cases the type before
	# testing it; unknown types are shown as stored.
	my %thesis_label = (
		phd     => "PhD thesis",
		honours => "Honours thesis",
		cmaster => "Coursework Master thesis",
		rmaster => "Research Master thesis",
		other   => "Other Degree thesis",
	);
	my $thesistype = defined $eprint->{thesistype} ? $eprint->{thesistype} : '';
	if (exists $thesis_label{lc $thesistype}) {
		$thesistype = $thesis_label{lc $thesistype};
	}

	# Keywords are comma-separated when a comma is present, otherwise
	# space-separated.  Trim each one and drop empties so that repeated
	# separators do not produce blank DC.subject entries.
	my @keywordslist;
	if (defined $eprint->{keywords}) {
		my $separator = ($eprint->{keywords} =~ /,/) ? qr/,/ : qr/ /;
		my @raw = split($separator, $eprint->{keywords});
		for my $keyword (@raw) {
			$keyword =~ s/^\s+//;
			$keyword =~ s/\s+$//;
			push @keywordslist, $keyword if length $keyword;
		}
	}

	# Escape everything that is interpolated into the markup, not just the
	# double quotes in the abstract.
	# NOTE(review): this assumes the database holds plain text; if values are
	# already stored as HTML entities they will be double-escaped - confirm.
	my $e_title        = _adt_escape_html($eprint->{title});
	my $e_abstract     = _adt_escape_html($eprint->{abstract});
	my $e_year         = _adt_escape_html($eprint->{year});
	my $e_dept         = _adt_escape_html($eprint->{department});
	my $e_thesistype   = _adt_escape_html($thesistype);
	my $e_author       = _adt_escape_html($author);
	my $e_authorrev    = _adt_escape_html($authorrev);
	my $e_authorallrev = _adt_escape_html($authorallrev);
	my $e_uni          = _adt_escape_html($UNI_TEXT);
	my $e_copy_url     = _adt_escape_html($COPY_URL);
	my $e_uri          = _adt_escape_html($URI);

# ------------------------ START WRITING THE FILE --------------------------

	open(my $fh, '>', $FName)
		or die "Failed to open/create file: $FName: $!\n";

	print {$fh} qq{<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "DTD/xhtml1-transitional.dtd">\n<html><head>};

# --------------------------- WRITE THE TITLE -----------------------------

	print {$fh} qq{<title>$e_title</title>\n};

# --------------------------- WRITE THE ADT METADATA -----------------------------

	print {$fh} qq{<meta name="DC.Title" content="$e_title"/>\n};
	print {$fh} qq{<meta name="DC.Creator" content="$e_authorrev"/>\n};
	print {$fh} qq{<meta name="DC.Creator.personalName" content="$e_authorrev"/>\n};

	for my $keyword (@keywordslist) {
		my $e_keyword = _adt_escape_html($keyword);
		print {$fh} qq{<meta name="DC.subject" content="$e_keyword"/>\n};
	}

	print {$fh} qq{<meta name="DC.Description.abstract" content="$e_abstract"/>\n};
	print {$fh} qq{<meta name="DC.Date" scheme="W3CDTF" content="$e_year"/>\n};
	print {$fh} qq{<meta name="DC.Date.valid" scheme="W3CDTF" content="$e_year"/>\n};
	print {$fh} qq{<meta name="DC.Language" scheme="RFC1766" content="en"/>\n};
	print {$fh} qq{<meta name="DC.Publisher" content="$e_uni, $e_dept"/>\n};
	print {$fh} qq{<meta name="DC.Rights" content="$e_copy_url"/>\n};
	print {$fh} qq{<meta name="DC.Rights" content="(c) Copyright $e_year $e_author"/>\n};
	print {$fh} qq{<meta name="DC.Identifier" scheme="URI" content="$e_uri"/>\n};

# --------------------------- WRITE THE PAGE -----------------------------

	print {$fh} <<"END_OF_PAGE";
</head>

<body bgcolor="#ffffff" text="#000000">


		<h1 class="pagetitle">$e_title</h1>


		$e_authorallrev ($e_year) $e_title. $e_thesistype, $e_dept, $e_uni.

		Eprints ID Code: $ID

 </body>
</html>
END_OF_PAGE

	# Buffered write errors only surface at close, so check it.
	close($fh) or die "Failed to write file: $FName: $!\n";

	# Explicit success value; callers test the result for truth.
	return 1;
}

# _adt_load_metadata($ID)
#
# Open a database connection, gather the records WriteADTpage needs for
# eprint $ID, and always disconnect again.  Returns what
# _adt_query_metadata returns.  Dies if the connection cannot be opened.
sub _adt_load_metadata
{
	my ($ID) = @_;

	my $dbh = DBI->connect("DBI:mysql:$DB_NAME:$DB_HOST", $DB_USER, $DB_PASS)
		or die "Failed to connect to database $DB_NAME: $DBI::errstr\n";

	my $meta = _adt_query_metadata($dbh, $ID);

	# Single exit point, so the connection is released even when a record
	# turned out to be missing.
	$dbh->disconnect;

	return $meta;
}

# _adt_query_metadata($dbh, $ID)
#
# Fetch the `archive` row and the first `archive_authors` row for eprint
# $ID over the open handle $dbh, and verify that the depositing user, at
# least one document and at least one subject exist.  Returns a hash ref
# { eprint => <archive row>, author => <author row> }, or undef as soon as
# any required record is missing.  Ids are bound as placeholders.
sub _adt_query_metadata
{
	my ($dbh, $ID) = @_;

	my $eprint = $dbh->selectrow_hashref(
		"SELECT * FROM archive WHERE eprintid = ?", undef, $ID)
		or return;

	my $author = $dbh->selectrow_hashref(
		"SELECT * FROM archive_authors WHERE eprintid = ?", undef, $ID)
		or return;

	# The remaining records are not printed on the page, but their absence
	# still means the eprint's metadata is incomplete.
	$dbh->selectrow_hashref(
		"SELECT * FROM users WHERE userid = ?", undef, $eprint->{userid})
		or return;

	$dbh->selectrow_hashref(
		"SELECT * FROM document WHERE eprintid = ?", undef, $ID)
		or return;

	$dbh->selectrow_hashref(
		"SELECT * FROM archive_subjects WHERE eprintid = ?", undef, $ID)
		or return;

	return { eprint => $eprint, author => $author };
}

# _adt_escape_html($text)
#
# Return $text with &, <, > and " replaced by HTML entities so it can be
# placed in element content or a double-quoted attribute.  undef becomes
# the empty string.
sub _adt_escape_html
{
	my ($text) = @_;
	return '' unless defined $text;

	# Ampersand first, so the entities added below are not escaped again.
	$text =~ s/&/&amp;/g;
	$text =~ s/</&lt;/g;
	$text =~ s/>/&gt;/g;
	$text =~ s/"/&quot;/g;

	return $text;
}

Show line notes below