Newer
Older
Digital_Repository / OARiNZ / DIY / deb_package / eprints-3.0 / perl_lib / ParaTools / Utils.pm
######################################################################
#
# ParaTools::Utils;
#
######################################################################
#
#  This file is part of ParaCite Tools
#
#  Copyright (c) 2002 University of Southampton, UK. SO17 1BJ.
#
#  ParaTools is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  ParaTools is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with ParaTools; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
######################################################################

package ParaTools::Utils;

use 5.006;
use strict;
use warnings;
use LWP::UserAgent;
use File::Temp qw/ tempfile tempdir /;
use URI;

# This hash maps from file extension to a command that can
# convert to plaintext. Replace the source file with _IN_, and
# the destination (plaintext) file with _OUT_.

=pod

=head1 NAME

B<ParaTools::Utils> - handy utility functions

=head1 DESCRIPTION

ParaTools::Utils provides several utility functions for the other
modules.

=head1 METHODS

=over 4

=cut

my %converters =
(
	doc => "wvText _IN_ _OUT_",
	pdf => "pdftotext -raw _IN_ _OUT_",
	ps => "pstotext -output _OUT_ _IN_",
	htm => "links --dump _IN_ > _OUT_",
	html => "links --dump _IN_ > _OUT_",
);

=pod

=item $content = ParaTools::Utils::get_content($location)

This function takes either a filename or a URL as a parameter, and
aims to return a string containing the lines in the file. A hash of
converters is provided in ParaTools/Utils.pm, which should be customised
for your system.

For URLs, the file is first downloaded to a temporary directory, then
converted, whereas local files are copied straight into the temporary
directory. For this reason, some care should be taken when handling very
large files.

=cut

sub get_content
{
	my($location) = @_;

	# Get some temporary files ready.
	my $dir = tempdir( CLEANUP => 1 );
	my (undef, $tofile)  = tempfile(UNLINK => 1, DIR => $dir, SUFFIX => ".txt");

	my $type = "txt";
	my $converter = "";

	# Set up the type. 
	if ($location =~ /\.(\w+?)$/)
	{
		$type = $1;
	}	

	if ($location =~ /^http:\/\//)
	{
		if (!$type)	
		{
			print STDERR "Unknown type - assuming HTML\n";
			$type = "html";
		}
	}
	else
	{
		if (!$type)
		{
			print STDERR "Unknown type - assuming plaintext\n";
			$type = "txt";
		}		
	}

	my (undef, $fromfile) = tempfile(UNLINK => 1, DIR => $dir, SUFFIX => ".$type");

	# Now we know the type, grab the files. 
	if ($location =~ /^http:\/\//)
        {
		# If it's remote, use the LWP mirror function to grab it.
		my $ua = new LWP::UserAgent();
              	$ua->mirror($location, $fromfile);
	}
	else
	{
		# If it's local, mirror it straight to the $fromfile.
		open(FIN, $location);
                open(FOUT, ">$fromfile");
                foreach(<FIN>) { print FOUT $_; }
                close FOUT;
                close FIN;
	}
	
	if ($type ne "txt")
	{
		# Convert from the $fromfile to the $tofile.
		if (!$converters{$type})
		{
			print STDERR "Sorry, no converters available for type $type\n";
			return;
		}
		else
		{
			$converter = $converters{$type};
			$converter =~ s/_IN_/$fromfile/g;
			$converter =~ s/_OUT_/$tofile/g;
		}
		system($converter);
	}
	else
	{
		# If we have text, just use the fromfile.
		$tofile = $fromfile;
	}

	my $content = "";
	open( INPUT, $tofile ) or return;
    	read( INPUT, $content, -s INPUT );
	close INPUT;

	return $content;
}

=pod

=item $escaped_url = ParaTools::Utils::url_escape($string)

Simple function to convert a string into an encoded
URL (i.e. spaces to %20, etc). Takes the unencoded
URL as a parameter, and returns the encoded version.

=cut

sub url_escape
{
        my( $url ) = @_;
	$url =~ s/</%3C/g;
	$url =~ s/>/%3E/g;
	$url =~ s/#/%23/g;
	$url =~ s/;/%3B/g;
	$url =~ s/&/%26/g;
        my $uri = URI->new( $url );
	my $out = $uri->as_string;
        return $out;
}

__END__

=pod

=back

=head1 AUTHOR

Mike Jewell <moj@ecs.soton.ac.uk>

=cut

1;