#!/usr/bin/perl
use Mail::Address;
use Mail::Field;
use MIME::Parser;
use Digest::MD5 qw(md5_hex);
use Date::Parse qw(str2time);
use Date::Format qw(time2str);
use Pg;
$mime_parser = new MIME::Parser;
$mime_parser->output_under("./tmp");
# Read the message from the source file and run it through the MIME::Parser,
# giving a MIME:Entity object.
open INFILE, "<$ARGV[0]" or die "ERROR: couldn't open input file '$ARGV[0]'.\n";
# $message = new Mail::Internet(\*INFILE);
$message = $mime_parser->parse(\*INFILE);
# We'll let the headers get re-folded --- the content won't be any
# different, and we'll probably unfold them before we store them in
# the database anyway.
close INFILE;
# Set up some defaults.
%message_values = ( 'priority' => 'Normal',
'is_html' => false,
'is_multipart' => false, );
$message_part_boundary = '';
# Extract the various bits of the message.
# Raw headers:
$message_headers = $message->head();
$message_values{'raw_headers'} = fix_quotes($message_headers->as_string());
# Raw content (body):
$message_lines = $message->body();
# For some reason printf and sprintf only print the first line if
# given the entire body array, so we have to build the body line by
# line, just to be annoying.
foreach $line (@$message_lines)
{
$message_content .= sprintf "%s", fix_quotes($line);
}
$message_values{'raw_content'} = $message_content;
$message_values{'content_md5'} = Digest::MD5::md5_hex($message_content);
# Subject:
chomp ($message_subject = $message_headers->get('Subject'));
$message_values{'subject'} = fix_quotes($message_subject);
# Sender:
$message_sender = new Mail::Field('From', $message_headers->get('From'));
# There can be only one...
chomp ($message_values{'from_name'} = sprintf "%s", fix_quotes($message_sender->names()));
chomp ($message_values{'from_addr'} = sprintf "%s", $message_sender->addresses());
# Date:
# Note we don't worry about parsing this just yet. See below.
chomp ($message_raw_date = $message_headers->get('Date'));
$message_values{'date'} = $message_raw_date;
# Priority:
$message_priority = new Mail::Field('Priority', find_priority());
chomp ($message_values{'priority'} = $message_priority->stringify());
# $message_content_type = new Mail::Field('Content-type', $message_headers->get('Content-type'));
# ($type, $subtype) = split('/', $message_content_type->type());
# $message_is_html = ($subtype eq 'html');
# Get the message recipients. There shouldn't be any BCC headers on incoming
# messages, so we won't bother looking.
# $message_to_recipients = new Mail::Field('To', $message_headers->get('To'));
# $message_cc_recipients = new Mail::Field('Cc', $message_headers->get('CC'));
# if ($message->is_multipart())
# {
# $message_parts = $message->parts();
# printf "This is a multipart message with %d parts.\n", $#message_parts + 1;
# }
# print "This is an HTML message.\n" if ($message_is_html);
print "Subject: [$message_values{'subject'}]\n";
print "From: [$message_values{'from_name'} <$message_values{'from_addr'}>]\n";
print "Date: [$message_values{'date'}]\n";
# chomp ($to = $message_to_recipients->stringify());
# print "To: [$to]\n";
#
# chomp ($cc = $message_to_recipients->stringify());
# print "CC: [$cc]\n";
# chomp ($content_type = $message_content_type->stringify());
# print "Content type: [$content_type]\n";
print "Priority: [$message_values{'priority'}]\n";
# printf "Body has %d lines:\n", scalar(@$message_body);
print "MD5: $message_values{'content_md5'}\n";
# print @$message_body;
# $message->dump_skeleton();
# Try to connect to the database.
print "Connecting to database...";
$conn = Pg::connectdb('dbname=spam');
($conn->status == Pg::PGRES_CONNECTION_OK) or die "failed\n" . $conn->errorMessage;
$message_values{'connection'} = $conn;
print "OK\n";
# Try to insert the message.
print 'Inserting...';
$not_parsed_yet = true;
# Loop until we either get a successful insert, a fatal error occurs,
# or we've tried reinserting the date after parsing and that failed too.
while (true)
{
eval { try_insert(%message_values); };
last unless ($@ =~ /ERROR/);
# If we get a "Bad timestamp" error, parse the date
# with Date::Parse then try again.
if ($@ =~ /Bad timestamp/)
{
$not_parsed_yet or die $@;
print 'parsing date...';
$message_values{'date'} = parse_date($message_raw_date);
$not_parsed_yet = false;
}
else
{
die $@;
}
}
print "OK\n";
################################################################################
# find_priority()
# Arguments: none
# Returns: string
#
# Get the message priority (if one exists). We check for all the likely headers:
# 1. First look for a Priority: header.
# 2. If that fails, look for an X-Priority header.
# 3. If that fails, look for an X-MSMail-Priority header (although it's
# probably unlikely that this will exist alone without some other priority
# header also).
# 4. If all else fails, default to "Normal".
#
sub find_priority
{
$message_headers->get('Priority') || $message_headers->get('X-Priority') ||
$message_headers->get('X-MSMail-Priority') || 'Normal';
}
################################################################################
# try_insert()
# Arguments: hash containing all the values pertinent to a message
# Returns: string
#
# Take a string containing containing an SQL INSERT statement and execute it
# against the database. If an error occurs, die with an appropriate exception,
# otherwise return the empty string.
#
sub try_insert
{
my $message_values = @_;
my $insert_string = 'insert into message (subject, sender_name, sender_address, ' .
'time_sent, priority, raw_headers, raw_content, content_hash, raw_source, ' .
'is_html, is_multipart) values (' .
"'$message_values{'subject'}', " .
"'$message_values{'from_name'}', '$message_values{'from_addr'}', " .
"'$message_values{'date'}', " .
"'$message_values{'priority'}', " .
"'$message_values{'raw_headers'}', " .
"'$message_values{'raw_content'}', " .
"'$message_values{'content_md5'}', " .
"'$message_values{'raw_headers'}" . "\n" . "$message_values{'raw_content'}', " .
"'$message_values{'is_html'}', " .
"'$message_values{'is_multipart'}');";
my $conn = $message_values{'connection'};
my $result = $conn->exec($insert_string);
my $result_status = $result->resultStatus;
return '' if ($result_status == Pg::PGRES_COMMAND_OK);
die $conn->errorMessage;
}
################################################################################
# parse_date()
# Arguments: string containing a date value
# Returns: string containing a (hopefully) better-formed date value
#
# Take a string containing a date value that the DBMS doesn't like, and
# attempt to parse it into something that the DBMS will accept. Particularly
# misguided dates are simply converted into the epoch. I may look at doing
# something a bit more sophisticated with this later.
#
# Here are some encountered examples of broken dates. The ones marked * can't
# be parsed by str2time and will be need to be dealt with separately, if at
# all.
#
# Invalid time zones
# Fri, 02 Aug 2002 21:27:39 -0580
# Fri, 06 Sep 2002 12:09:39 -1900
# Mon, 02 Sep 2002 16:34:36 -1600
# Mon, 14 Oct 2002 07:12:46 -1800
# Mon, 25 Nov 2002 04:10:48 -2000
# Mon, 28 Oct 2002 19:31:48 +1300 (although quite feasible with DST in NZ)
# Sun, 08 Sep 2002 14:25:51 -1700
#
# Broken time zones
# Mon, 16 Sep 2002 02:08:20 00000
# Fri, 30 Aug 2002 9:56:13 +-0700 (seems to be read as +0700)
# *Fri, 06 Dec 2002 12:38:52 +0-900
# *Sat, 09 Feb 2002 11:15:40 +0-200
# *Fri, 27 Sep 02 04:35:46 中国标准时间 (foreign characters)
# *Thu, 28 Nov 02 05:47:07 Eastern Standard Time
# *Wed, 04 Sep 02 17:26:37 Srednja Evropa - poletni 鑑s
# *Wed, 13 Nov 2002 坷饶 12:35:24 +0900
#
#
# Just plain weird
# *Sat, 12 55 2002 04:55:53 -0800 (month = 55)
# *Sat, 26 13 2002 07:13:37 -0800 (month = 13 --- only in the Islamic calendar :)
# *sam., 28 sept. 2002 16:47:03 (German)
# *ma, 28 okt 2002 15:56:31 (who knows?)
# *Sat, 30121899 0:0:0 -000 (MST) (what can I say?)
#
sub parse_date
{
my $input_date = str2time(shift);
my $timezone = time2str('%z', $input_date);
$timezone = sprintf " %s:%s", substr($timezone, 0, 3), substr($timezone, 3, 2);
return time2str('%a, %e %b %T %Y', $input_date) . $timezone;
}
################################################################################
# fix_quotes()
# Arguments: string
# Returns: string with all ' replaced with ''
#
# Take a string and replace all single apostrophes with double apostrophes so
# that SQL will insert them correctly.
#
sub fix_quotes
{
my $input_string = shift;
$input_string =~ s/'/''/g;
return $input_string;
}