Newer
Older
spamdb / Scripts / parse_email
#!/usr/bin/perl

use Mail::Address;
use Mail::Field;
use MIME::Parser;
use Digest::MD5 qw(md5_hex);
use Date::Parse qw(str2time);
use Date::Format qw(time2str);
use Pg;

$mime_parser = new MIME::Parser;
$mime_parser->output_under("./tmp");

# Read the message from the source file and run it through the MIME::Parser,
# giving a MIME:Entity object.
open INFILE, "<$ARGV[0]" or die "ERROR: couldn't open input file '$ARGV[0]'.\n";

# $message = new Mail::Internet(\*INFILE);
$message = $mime_parser->parse(\*INFILE);
# We'll let the headers get re-folded --- the content won't be any
# different, and we'll probably unfold them before we store them in
# the database anyway.

close INFILE;


# Set up some defaults.
%message_values =	(	'priority'		=>	'Normal',
						'is_html'		=>	false,
						'is_multipart'	=>	false,	);

$message_part_boundary = '';


# Extract the various bits of the message.

# Raw headers:
$message_headers = $message->head();
$message_values{'raw_headers'} = fix_quotes($message_headers->as_string()); 

# Raw content (body):
$message_lines = $message->body();
# For some reason printf and sprintf only print the first line if
# given the entire body array, so we have to build the body line by
# line, just to be annoying.
foreach $line (@$message_lines)
{
	$message_content .= sprintf "%s", fix_quotes($line);
}
$message_values{'raw_content'} = $message_content;
$message_values{'content_md5'} = Digest::MD5::md5_hex($message_content);

# Subject:
chomp ($message_subject = $message_headers->get('Subject'));
$message_values{'subject'} = fix_quotes($message_subject);

# Sender:
$message_sender = new Mail::Field('From', $message_headers->get('From'));
# There can be only one...
chomp ($message_values{'from_name'} = sprintf "%s", fix_quotes($message_sender->names()));
chomp ($message_values{'from_addr'} = sprintf "%s", $message_sender->addresses());


# Date:
# Note we don't worry about parsing this just yet. See below.
chomp ($message_raw_date = $message_headers->get('Date'));
$message_values{'date'} = $message_raw_date;

# Priority:
$message_priority = new Mail::Field('Priority', find_priority());
chomp ($message_values{'priority'} = $message_priority->stringify());


# $message_content_type = new Mail::Field('Content-type', $message_headers->get('Content-type'));
# ($type, $subtype) = split('/', $message_content_type->type());
# $message_is_html = ($subtype eq 'html');

# Get the message recipients. There shouldn't be any BCC headers on incoming
# messages, so we won't bother looking.
# $message_to_recipients = new Mail::Field('To', $message_headers->get('To'));
# $message_cc_recipients = new Mail::Field('Cc', $message_headers->get('CC'));

# if ($message->is_multipart())
# {
# 	$message_parts = $message->parts();
# 	printf "This is a multipart message with %d parts.\n", $#message_parts + 1;
# }

# print "This is an HTML message.\n" if ($message_is_html);

print "Subject: [$message_values{'subject'}]\n";

print "From: [$message_values{'from_name'} <$message_values{'from_addr'}>]\n";

print "Date: [$message_values{'date'}]\n";

# chomp ($to = $message_to_recipients->stringify());
# print "To: [$to]\n";
# 
# chomp ($cc = $message_to_recipients->stringify());
# print "CC: [$cc]\n";

# chomp ($content_type = $message_content_type->stringify());
# print "Content type: [$content_type]\n";

print "Priority: [$message_values{'priority'}]\n";

# printf "Body has %d lines:\n", scalar(@$message_body);
print "MD5: $message_values{'content_md5'}\n";
# print @$message_body;

# $message->dump_skeleton();

# Try to connect to the database.
print "Connecting to database...";

$conn = Pg::connectdb('dbname=spam');
($conn->status == Pg::PGRES_CONNECTION_OK) or die "failed\n" . $conn->errorMessage;
$message_values{'connection'} = $conn;

print "OK\n";

# Try to insert the message.
print 'Inserting...';
$not_parsed_yet = true;

# Loop until we either get a successful insert, a fatal error occurs,
# or we've tried reinserting the date after parsing and that failed too.
while (true)
{
	eval { try_insert(%message_values); };
	
	last unless ($@ =~ /ERROR/);

	# If we get a "Bad timestamp" error, parse the date
	# with Date::Parse then try again.
	if ($@ =~ /Bad timestamp/)
	{
		$not_parsed_yet or die $@;
		print 'parsing date...';
		$message_values{'date'} = parse_date($message_raw_date);
		$not_parsed_yet = false;
	}
	else
	{
		die $@;
	}
}

print "OK\n";


################################################################################
# find_priority()
#     Arguments: none
#     Returns: string
#
# Get the message priority (if one exists). We check for all the likely headers:
#   1. First look for a Priority: header.
#   2. If that fails, look for an X-Priority header.
#   3. If that fails, look for an X-MSMail-Priority header (although it's
#      probably unlikely that this will exist alone without some other priority
#      header also).
#   4. If all else fails, default to "Normal".
#
sub find_priority
{
	$message_headers->get('Priority') || $message_headers->get('X-Priority') ||
		$message_headers->get('X-MSMail-Priority') || 'Normal';
}

################################################################################
# try_insert()
#     Arguments: hash containing all the values pertinent to a message
#     Returns: string
#
# Take a string containing containing an SQL INSERT statement and execute it
# against the database. If an error occurs, die with an appropriate exception,
# otherwise return the empty string.
#
sub try_insert
{
	my $message_values = @_;
	
	my $insert_string = 'insert into message (subject, sender_name, sender_address, ' .
		'time_sent, priority, raw_headers, raw_content, content_hash, raw_source, ' .
		'is_html, is_multipart) values (' .
		"'$message_values{'subject'}', " .
		"'$message_values{'from_name'}', '$message_values{'from_addr'}', " .
		"'$message_values{'date'}', " .
		"'$message_values{'priority'}', " .
		"'$message_values{'raw_headers'}', " .
		"'$message_values{'raw_content'}', " .
		"'$message_values{'content_md5'}', " .
		"'$message_values{'raw_headers'}" . "\n" . "$message_values{'raw_content'}', " .
		"'$message_values{'is_html'}', " .
		"'$message_values{'is_multipart'}');";
	
	my $conn = $message_values{'connection'};
	my $result = $conn->exec($insert_string);
	my $result_status = $result->resultStatus;
	
	return '' if ($result_status == Pg::PGRES_COMMAND_OK);
	
	die $conn->errorMessage;
}

################################################################################
# parse_date()
#     Arguments: string containing a date value
#     Returns: string containing a (hopefully) better-formed date value
#
# Take a string containing a date value that the DBMS doesn't like, and
# attempt to parse it into something that the DBMS will accept. Particularly
# misguided dates are simply converted into the epoch. I may look at doing
# something a bit more sophisticated with this later.
#
# Here are some encountered examples of broken dates. The ones marked * can't
# be parsed by str2time and will be need to be dealt with separately, if at
# all.
#
#   Invalid time zones
#   Fri, 02 Aug 2002 21:27:39 -0580
#   Fri, 06 Sep 2002 12:09:39 -1900
#   Mon, 02 Sep 2002 16:34:36 -1600
#   Mon, 14 Oct 2002 07:12:46 -1800
#   Mon, 25 Nov 2002 04:10:48 -2000
#   Mon, 28 Oct 2002 19:31:48 +1300	 (although quite feasible with DST in NZ)
#   Sun, 08 Sep 2002 14:25:51 -1700
#
#   Broken time zones
#   Mon, 16 Sep 2002 02:08:20 00000
#   Fri, 30 Aug 2002 9:56:13 +-0700  (seems to be read as +0700)
#  *Fri, 06 Dec 2002 12:38:52 +0-900
#  *Sat, 09 Feb 2002 11:15:40 +0-200
#  *Fri, 27 Sep 02 04:35:46 中国标准时间  (foreign characters)
#  *Thu, 28 Nov 02 05:47:07 Eastern Standard Time
#  *Wed, 04 Sep 02 17:26:37 Srednja Evropa - poletni 鑑s
#  *Wed, 13 Nov 2002 坷饶 12:35:24 +0900
#
#
#   Just plain weird
#  *Sat, 12 55 2002 04:55:53 -0800  (month = 55)
#  *Sat, 26 13 2002 07:13:37 -0800  (month = 13 --- only in the Islamic calendar :)
#  *sam., 28 sept. 2002 16:47:03    (German)
#  *ma, 28 okt 2002 15:56:31        (who knows?)
#  *Sat, 30121899 0:0:0 -000 (MST)  (what can I say?)
#
sub parse_date
{
	my $input_date = str2time(shift);
	my $timezone = time2str('%z', $input_date);
	$timezone = sprintf " %s:%s", substr($timezone, 0, 3), substr($timezone, 3, 2);
	return time2str('%a, %e %b %T %Y', $input_date) . $timezone;
}


################################################################################
# fix_quotes()
#     Arguments: string
#     Returns: string with all ' replaced with ''
#
# Take a string and replace all single apostrophes with double apostrophes so
# that SQL will insert them correctly.
#
sub fix_quotes
{
	my $input_string = shift;
	$input_string =~ s/'/''/g;
	return $input_string;
}