Newer
Older
Digital_Repository / OARiNZ / DIY / deb_package / eprints-3.0 / perl_lib / EPrints / Plugin / Import / BibTeX.pm
=pod

=head1 FILE FORMAT

=head2 Supported fields

=over 8

=item Entry Key

B<eprintid>

=item Entry Type. Supported types:

=over 8

=item article B<article>

=item book B<book>

=item conference B<conference_item>

=item inbook B<book_section>

=item incollection B<book_section>

=item inproceedings B<conference_item>

=item manual B<monograph>, B<monograph_type>=manual|documentation

=item mastersthesis B<thesis>, B<thesis_type>=masters

=item misc B<other>

=item phdthesis B<thesis>, B<thesis_type>=phd

=item proceedings B<book>

=item techreport B<monograph>, B<monograph_type>=technical_report

=item unpublished B<other>, B<ispublished>=unpub

=back

=item address

Address of publisher or institution B<place_of_pub>

=item author 

Name(s) of authors B<creators_name>

B<FORMAT:> Multiple authors separated by 'and'

=item booktitle

Title of Book: B<book_title> (incollection) or B<event_title> (inproceedings)

=item editor

Name(s) of editors B<editors_name>

B<FORMAT:> Multiple editors separated by 'and'

=item institution

Sponsoring institution (techreport) B<institution>

=item journal

Journal name B<publication>

=item month

=over 8

=item Month written (unpublished) B<date>

=item Month published (Other Types) B<date>

=back

B<FORMAT:> three letter abbreviations

=item note

Additional information B<note>

=item number

=over 8

=item ID Number (techreport, manual) B<id_number>

=item Number (Other Types) B<number>

=back

=item organization

=over 8

=item Organization (manual) B<institution>

=item Sponsor (inproceedings)

=back

=item pages

Page numbers B<pagerange>

B<FORMAT:> A--B

=item publisher

Publisher B<publisher>

=item school

School (mastersthesis, phdthesis) B<institution>

=item series

Series B<series>

=item title

Title B<title>

=item type

=over 8

=item Type of report (techreport) B<monograph_type>

=item Sectional unit (incollection)

=item Different type of thesis (mastersthesis, phdthesis)

=back

=item volume

Volume B<volume>

=item year

=over 8

=item Year written (unpublished) B<date>

=item Year published (Other Types) B<date>

=back

=back

=head2 Not strictly BibTeX but often used

=over 8

=item abstract B<abstract>

Abstract

=item keywords B<keywords>

Keywords

=item url B<official_url>

URL

=back

Abstract

=head2 Unsupported fields

=over 8

=item annote

Annotation

=item chapter

Chapter number

=item crossref

Database key of entry being cross-referenced

=item edition

Edition of a book

=item howpublished

How something strange was published

=item key

Label

=back

=head1 SEE ALSO

L<Text::BibTeX>, L<EPrints::Plugin::Export::BibTeX>

=cut

package EPrints::Plugin::Import::BibTeX;

use Encode;
use strict;

our @ISA = qw/ EPrints::Plugin::Import /;

# Constructor.
#
# Builds the plugin through the parent class and fills in its name,
# visibility and the kinds of object it produces. Two optional modules are
# probed with EPrints::Utils::require_if_exists:
#   Text::BibTeX - if it cannot be loaded the plugin is hidden (visible is
#                  set to "") and an error message is stored in {error}.
#   TeX::Encode  - if it cannot be loaded, {decode_tex} is set to 0 so that
#                  convert_input skips its LaTeX decoding step.
sub new
{
	my( $class, %params ) = @_;

	my $self = $class->SUPER::new( %params );

	$self->{name} = "BibTeX";
	$self->{visible} = "all";
	$self->{produce} = [ 'list/eprint', 'dataobj/eprint' ];

	# The parser itself: without it the plugin is unusable.
	if( !EPrints::Utils::require_if_exists( "Text::BibTeX" ) )
	{
		$self->{visible} = "";
		$self->{error} = "Failed to load required module Text::BibTeX";
	}

	# LaTeX decoding is a nice-to-have; remember whether it is available.
	$self->{decode_tex} = EPrints::Utils::require_if_exists( "TeX::Encode" ) ? 1 : 0;

	return $self;
}

# Importing from an open file handle is not supported by this plugin:
# report an error through the plugin and return undef.
sub input_fh
{
	my $plugin = shift;
	my %opts = @_;

	my $message = "Does not support input from file handles";
	$plugin->error( $message );

	return undef;
}

# Import every entry of a BibTeX file.
#
# Options read from %opts:
#   filename - path of the BibTeX file ('-', i.e. STDIN, is rejected)
#   dataset  - passed to epdata_to_dataobj and to the resulting list
#
# Returns an EPrints::List holding the ids of the objects that were created,
# or undef (after reporting an error through the plugin) when no filename was
# given, STDIN was requested, or the file could not be opened.
sub input_file
{
	my( $plugin, %opts ) = @_;

	my $filename = $opts{filename};

	if( !defined $filename )
	{
		$plugin->error("No input filename given");

		return undef;
	}

	if( $filename eq '-' )
	{
		$plugin->error("Does not support input from STDIN");

		return undef;
	}

	# Text::BibTeX::File->new returns false when the open fails; passing
	# that on to Text::BibTeX::Entry->new would die instead of reporting
	# a usable error.
	my $bibfile = Text::BibTeX::File->new( $filename );
	if( !defined $bibfile )
	{
		$plugin->error("Could not open BibTeX file: $filename");

		return undef;
	}

	my @ids;

	while ( my $entry = Text::BibTeX::Entry->new( $bibfile ) )
	{
		# Entries that failed to parse are skipped.
		next unless $entry->parse_ok;

		# convert_input returns undef for unsupported entry types.
		my $epdata = $plugin->convert_input( $entry );
		next unless( defined $epdata );

		my $dataobj = $plugin->epdata_to_dataobj( $opts{dataset}, $epdata );
		if( defined $dataobj )
		{
			push @ids, $dataobj->get_id;
		}
	}
	
	return EPrints::List->new( 
		dataset => $opts{dataset}, 
		session => $plugin->{session},
		ids=>\@ids );
}

# Import a single BibTeX entry supplied as a string.
#
# $data is parsed with Text::BibTeX::Entry->parse_s. Returns whatever
# epdata_to_dataobj returns for the converted entry, or undef when the entry
# does not parse or convert_input rejects it (unsupported entry type).
sub input_dataobj
{
	my( $plugin, $data ) = @_;

	my $entry = Text::BibTeX::Entry->new;
	$entry->parse_s( $data );
	return undef unless $entry->parse_ok;

	# convert_input returns undef for entry types it cannot map; do not
	# hand that on to epdata_to_dataobj (input_file guards the same way).
	my $epdata = $plugin->convert_input( $entry );
	return undef unless defined $epdata;

	return $plugin->epdata_to_dataobj( $plugin->{dataset}, $epdata );
}

# Convert a parsed BibTeX entry into an EPrints epdata hash reference.
#
# $input_data is the entry object; it is used through its type, fieldlist,
# get, set, exists and names methods. When $plugin->{decode_tex} is true every
# field value is first run through decode( "latex", ... ) and written back to
# the entry, so the entry object is modified in place.
#
# Returns the epdata hash reference, or undef (after a warning through the
# plugin) when the entry type has no EPrints equivalent.
#
# Warnings are also issued for a year that is not exactly four digits and for
# a month that is unrecognised or cannot be attached to a valid year.
sub convert_input 
{
	my ( $plugin, $input_data ) = @_;
	my $epdata = {};

	# Decode latex
	if( $plugin->{decode_tex} )
	{
		for my $field ( $input_data->fieldlist )
		{
			my $value = $input_data->get( $field );
			$value = decode( "latex", $value );
			utf8::encode( $value );
			$input_data->set( $field, $value );
		}
	}

	# Entry Type: BibTeX type => epdata values implied by that type
	my %type_map = (
		article       => { type => "article" },
		book          => { type => "book" },
		proceedings   => { type => "book" },
		inbook        => { type => "book_section" },
		incollection  => { type => "book_section" },
		inproceedings => { type => "conference_item" },
		conference    => { type => "conference_item" },
		misc          => { type => "other" },
		manual        => { type => "monograph", monograph_type => "manual" },
		techreport    => { type => "monograph", monograph_type => "technical_report" },
		mastersthesis => { type => "thesis", thesis_type => "masters" },
		phdthesis     => { type => "thesis", thesis_type => "phd" },
		unpublished   => { type => "other", ispublished => "unpub" },
	);
	my $input_data_type = $input_data->type;
	my $type_values = $type_map{$input_data_type};
	if( !defined $type_values )
	{
		$plugin->warning( $plugin->phrase( "unsupported_cite_type", type => $input_data_type ) );
		return undef;
	}
	%{ $epdata } = %{ $type_values };

	# Copy one BibTeX field to one epdata field, if the entry has it.
	my $copy_field = sub {
		my( $bib_field, $ep_field ) = @_;
		return unless $input_data->exists( $bib_field );
		$epdata->{$ep_field} = $input_data->get( $bib_field );
		return;
	};

	# Tokens of one part ("first", "von", "last", "jr") of a name object,
	# or the empty list when the name has no such part.
	my $part_values = sub {
		my( $name, $part ) = @_;
		return () unless scalar $name->part( $part );
		return $name->part( $part );
	};

	# Turn a name object into an EPrints name (given/family/lineage).
	# The "von" and "last" parts together form the family name and are
	# joined with a space ("van Beethoven", not "vanBeethoven").
	my $to_ep_name = sub {
		my( $name ) = @_;
		my $ep_name;
		my @given = $part_values->( $name, "first" );
		my @family = ( $part_values->( $name, "von" ), $part_values->( $name, "last" ) );
		my @lineage = $part_values->( $name, "jr" );
		$ep_name->{given} = join( " ", @given ) if @given;
		$ep_name->{family} = join( " ", @family ) if @family;
		$ep_name->{lineage} = join( " ", @lineage ) if @lineage;
		return $ep_name;
	};

	# address
	$copy_field->( "address", "place_of_pub" );

	# author
	for my $author ( $input_data->names( "author" ) )
	{
		push @{ $epdata->{creators_name} }, $to_ep_name->( $author );
	}
	
	# booktitle
	if( $input_data_type eq "incollection" )
	{
		$copy_field->( "booktitle", "book_title" );
	}
	elsif( $input_data_type eq "inproceedings" )
	{
		$copy_field->( "booktitle", "event_title" );
	}

	# editor
	for my $editor ( $input_data->names( "editor" ) )
	{
		push @{ $epdata->{editors_name} }, $to_ep_name->( $editor );
	}

	# institution
	if( $input_data_type eq "techreport" )
	{
		$copy_field->( "institution", "institution" );
	}

	# journal
	$copy_field->( "journal", "publication" );

	# note	
	$copy_field->( "note", "note" );

	# number
	if( $input_data_type eq "techreport" || $input_data_type eq "manual" )
	{
		$copy_field->( "number", "id_number" );
	}
	else
	{
		$copy_field->( "number", "number" );
	}

	# organization
	if( $input_data_type eq "manual" )
	{
		$copy_field->( "organization", "institution" );
	}

	# pages
	if( $input_data->exists( "pages" ) )
	{
		my $pages = $input_data->get( "pages" );
		# BibTeX writes ranges as A--B; convert every range, not just
		# the first one ("1--3, 7--9").
		$pages =~ s/--/-/g;
		$epdata->{pagerange} = $pages;
	}

	# publisher
	$copy_field->( "publisher", "publisher" );

	# school
	if( $input_data_type eq "phdthesis" || $input_data_type eq "mastersthesis" )
	{
		$copy_field->( "school", "institution" );
	}

	# series
	$copy_field->( "series", "series" );

	# title
	$copy_field->( "title", "title" );

	# type
	# TODO: regexps - map the "type" field of a techreport onto monograph_type

	# volume
	$copy_field->( "volume", "volume" );

	# year
	if( $input_data->exists( "year" ) )
	{
		my $year = $input_data->get( "year" );
		if( $year =~ /^[0-9]{4}$/ )
		{
			$epdata->{date} = $year;
		}
		else
		{
			$plugin->warning( $plugin->phrase( "skip_year", year => $year ) );
		}
	}
	
	# month
	if( $input_data->exists( "month" ) )
	{
		my %months = (
			jan => "01",
			feb => "02",
			mar => "03",
			apr => "04",
			may => "05",
			jun => "06",
			jul => "07",
			aug => "08",
			sep => "09",
			oct => "10",
			nov => "11",
			dec => "12",
		);
		# Only the first three letters are significant ("March" => "mar").
		my $month = substr( lc( $input_data->get( "month" ) ), 0, 3 );
		if( defined $months{$month} && defined $epdata->{date} )
		{
			$epdata->{date} .= "-" . $months{$month}; 
		}
		else
		{
			# Unknown month, or no valid year to attach it to: a date
			# of the form "-03" would be meaningless, so skip it.
			$plugin->warning( $plugin->phrase( "skip_month", month => $month ) );
		}
	}

	# abstract
	$copy_field->( "abstract", "abstract" );
	# keywords
	$copy_field->( "keywords", "keywords" );
	# url
	$copy_field->( "url", "official_url" );

	return $epdata;
}

1;