diff --git a/Scripts/parse_email b/Scripts/parse_email index 9ce3912..32bc86f 100755 --- a/Scripts/parse_email +++ b/Scripts/parse_email @@ -2,139 +2,271 @@ use Mail::Address; use Mail::Field; +use MIME::Parser; +use Digest::MD5 qw(md5_hex); +use Date::Parse qw(str2time); +use Date::Format qw(time2str); +use Pg; -open INFILE, "<$ARGV[0]" or die "Argh!\n"; +$mime_parser = new MIME::Parser; +$mime_parser->output_under("./tmp"); -# TRUE if we're currently processing the headers. -$process_headers = 1; +# Read the message from the source file and run it through the MIME::Parser, +# giving a MIME:Entity object. +open INFILE, "<$ARGV[0]" or die "ERROR: couldn't open input file '$ARGV[0]'.\n"; -# Variable for holding header information. -$current_header_name = ''; -$current_header_value = ''; -$header_count = 0; -@header_names = (); -@header_values = (); +# $message = new Mail::Internet(\*INFILE); +$message = $mime_parser->parse(\*INFILE); +# We'll let the headers get re-folded --- the content won't be any +# different, and we'll probably unfold them before we store them in +# the database anyway. -# Variables for holding body information. -$body_line_count = 0; +close INFILE; + # Set up some defaults. -$message_priority = 'Normal'; -$message_charset = 'iso-8859-1'; -$message_raw_headers = ''; -$message_raw_body = ''; -$message_raw_source = ''; -$message_has_html = 0; +%message_values = ( 'priority' => 'Normal', + 'is_html' => false, + 'is_multipart' => false, ); -$message_is_multipart = 0; $message_part_boundary = ''; -%priorities_lookup = ( 1 => 'Highest', - 2 => 'High', - 3 => 'Normal', - 4 => 'Low', - 5 => 'Lowest', - ); -while () +# Extract the various bits of the message. + +# Raw headers: +$message_headers = $message->head(); +$message_values{'raw_headers'} = fix_quotes($message_headers->as_string()); + +# Raw content (body): +$message_lines = $message->body(); +# For some reason printf and sprintf only print the first line if +# given the entire body array, so we have to build the body line by +# line, just to be annoying. +foreach $line (@$message_lines) { - # Read lines until we hit the first blank line, which separates the - # headers from the body. - if ($process_headers) + $message_content .= sprintf "%s", fix_quotes($line); +} +$message_values{'raw_content'} = $message_content; +$message_values{'content_md5'} = Digest::MD5::md5_hex($message_content); + +# Subject: +chomp ($message_subject = $message_headers->get('Subject')); +$message_values{'subject'} = fix_quotes($message_subject); + +# Sender: +$message_sender = new Mail::Field('From', $message_headers->get('From')); +# There can be only one... +chomp ($message_values{'from_name'} = sprintf "%s", fix_quotes($message_sender->names())); +chomp ($message_values{'from_addr'} = sprintf "%s", $message_sender->addresses()); + + +# Date: +# Note we don't worry about parsing this just yet. See below. +chomp ($message_raw_date = $message_headers->get('Date')); +$message_values{'date'} = $message_raw_date; + +# Priority: +$message_priority = new Mail::Field('Priority', find_priority()); +chomp ($message_values{'priority'} = $message_priority->stringify()); + + +# $message_content_type = new Mail::Field('Content-type', $message_headers->get('Content-type')); +# ($type, $subtype) = split('/', $message_content_type->type()); +# $message_is_html = ($subtype eq 'html'); + +# Get the message recipients. There shouldn't be any BCC headers on incoming +# messages, so we won't bother looking. +# $message_to_recipients = new Mail::Field('To', $message_headers->get('To')); +# $message_cc_recipients = new Mail::Field('Cc', $message_headers->get('CC')); + +# if ($message->is_multipart()) +# { +# $message_parts = $message->parts(); +# printf "This is a multipart message with %d parts.\n", $#message_parts + 1; +# } + +# print "This is an HTML message.\n" if ($message_is_html); + +print "Subject: [$message_values{'subject'}]\n"; + +print "From: [$message_values{'from_name'} <$message_values{'from_addr'}>]\n"; + +print "Date: [$message_values{'date'}]\n"; + +# chomp ($to = $message_to_recipients->stringify()); +# print "To: [$to]\n"; +# +# chomp ($cc = $message_to_recipients->stringify()); +# print "CC: [$cc]\n"; + +# chomp ($content_type = $message_content_type->stringify()); +# print "Content type: [$content_type]\n"; + +print "Priority: [$message_values{'priority'}]\n"; + +# printf "Body has %d lines:\n", scalar(@$message_body); +print "MD5: $message_values{'content_md5'}\n"; +# print @$message_body; + +# $message->dump_skeleton(); + +# Try to connect to the database. +print "Connecting to database..."; + +$conn = Pg::connectdb('dbname=spam'); +($conn->status == Pg::PGRES_CONNECTION_OK) or die "failed\n" . $conn->errorMessage; +$message_values{'connection'} = $conn; + +print "OK\n"; + +# Try to insert the message. +print 'Inserting...'; +$not_parsed_yet = true; + +# Loop until we either get a successful insert, a fatal error occurs, +# or we've tried reinserting the date after parsing and that failed too. +while (true) +{ + eval { try_insert(%message_values); }; + + last unless ($@ =~ /ERROR/); + + # If we get a "Bad timestamp" error, parse the date + # with Date::Parse then try again. + if ($@ =~ /Bad timestamp/) { - if (/^$/) - { - # End of headers, switch to body processing mode. - $process_headers = 0; - - # But don't forget to store the last header that we found! - @header_names[$header_count] = $current_header_name; - @header_values[$header_count++] = $current_header_value; - - # Drop out now so that we don't append the blank line - # to $message_raw_headers. - next; - } - - elsif (/^([-\w]+): (.*)$/) - { - # We've found the beginning of a new header. File away the current - # header for later reference. Also store the "special" headers (like - # date, sender, etc.) in the appropriate variables. - if ($current_header_name) - { - @header_names[$header_count] = $current_header_name; - @header_values[$header_count++] = $current_header_value; - } - - $current_header_name = $1; - $current_header_value = $2; - - if ($current_header_name eq 'Subject') - { - $message_subject = $current_header_value; - } - elsif ($current_header_name eq 'From') - { - $message_sender = $current_header_value; - } - elsif ($current_header_name eq 'Date') - { - $message_time_sent = $current_header_value; - } - elsif (/Priority: ([1-5])/) - { - $message_priority = $priorities_lookup{$1}; - } - elsif (/Priority:/) - { - $message_priority = $current_header_value; - } - - # this needs work - elsif (/Content-Type: (.*); [Cc]harset ?= ?"(.*)"/) - { - $message_has_html = ($1 eq 'text/html'); - - $message_is_multipart - $message_charset = $2; - } - } - - elsif (/^((\s)+.*)$/) - { - # This line's still part of the current header (i.e., the header's - # been folded). Just add this line to the current header. - $current_header_value .= "\n$1"; - - if (/[Cc]harset ?= ?"?(.*)"?/) - { - $message_charset = $1; - } - } - $message_raw_headers .= $_; + $not_parsed_yet or die $@; + print 'parsing date...'; + $message_values{'date'} = parse_date($message_raw_date); + $not_parsed_yet = false; } else { - $message_raw_body .= $_; - $body_line_count++; + die $@; } } -$message_raw_source = "$message_raw_headers\n$message_raw_body"; +print "OK\n"; -print "Found $header_count headers:\n"; -for ($i = 0; $i < $header_count; $i++) +################################################################################ +# find_priority() +# Arguments: none +# Returns: string +# +# Get the message priority (if one exists). We check for all the likely headers: +# 1. First look for a Priority: header. +# 2. If that fails, look for an X-Priority header. +# 3. If that fails, look for an X-MSMail-Priority header (although it's +# probably unlikely that this will exist alone without some other priority +# header also). +# 4. If all else fails, default to "Normal". +# +sub find_priority { - print "[$header_names[$i]] = [$header_values[$i]]\n"; + $message_headers->get('Priority') || $message_headers->get('X-Priority') || + $message_headers->get('X-MSMail-Priority') || 'Normal'; } -print "\nSubject: $message_subject\n"; -print "Sender: $message_sender\n"; -print "Time sent: $message_time_sent\n"; -print "Priority: $message_priority\n"; -print "Character set: $message_charset\n"; -print ($message_has_html ? "Message contains HTML\n" : "Message doesn't contain HTML\n"); +################################################################################ +# try_insert() +# Arguments: hash containing all the values pertinent to a message +# Returns: string +# +# Take a string containing containing an SQL INSERT statement and execute it +# against the database. If an error occurs, die with an appropriate exception, +# otherwise return the empty string. +# +sub try_insert +{ + my $message_values = @_; + + my $insert_string = 'insert into message (subject, sender_name, sender_address, ' . + 'time_sent, priority, raw_headers, raw_content, content_hash, raw_source, ' . + 'is_html, is_multipart) values (' . + "'$message_values{'subject'}', " . + "'$message_values{'from_name'}', '$message_values{'from_addr'}', " . + "'$message_values{'date'}', " . + "'$message_values{'priority'}', " . + "'$message_values{'raw_headers'}', " . + "'$message_values{'raw_content'}', " . + "'$message_values{'content_md5'}', " . + "'$message_values{'raw_headers'}" . "\n" . "$message_values{'raw_content'}', " . + "'$message_values{'is_html'}', " . + "'$message_values{'is_multipart'}');"; + + my $conn = $message_values{'connection'}; + my $result = $conn->exec($insert_string); + my $result_status = $result->resultStatus; + + return '' if ($result_status == Pg::PGRES_COMMAND_OK); + + die $conn->errorMessage; +} -print "\nBody has $body_line_count lines.\n"; +################################################################################ +# parse_date() +# Arguments: string containing a date value +# Returns: string containing a (hopefully) better-formed date value +# +# Take a string containing a date value that the DBMS doesn't like, and +# attempt to parse it into something that the DBMS will accept. Particularly +# misguided dates are simply converted into the epoch. I may look at doing +# something a bit more sophisticated with this later. +# +# Here are some encountered examples of broken dates. The ones marked * can't +# be parsed by str2time and will be need to be dealt with separately, if at +# all. +# +# Invalid time zones +# Fri, 02 Aug 2002 21:27:39 -0580 +# Fri, 06 Sep 2002 12:09:39 -1900 +# Mon, 02 Sep 2002 16:34:36 -1600 +# Mon, 14 Oct 2002 07:12:46 -1800 +# Mon, 25 Nov 2002 04:10:48 -2000 +# Mon, 28 Oct 2002 19:31:48 +1300 (although quite feasible with DST in NZ) +# Sun, 08 Sep 2002 14:25:51 -1700 +# +# Broken time zones +# Mon, 16 Sep 2002 02:08:20 00000 +# Fri, 30 Aug 2002 9:56:13 +-0700 (seems to be read as +0700) +# *Fri, 06 Dec 2002 12:38:52 +0-900 +# *Sat, 09 Feb 2002 11:15:40 +0-200 +# *Fri, 27 Sep 02 04:35:46 �й���׼ʱ�� (foreign characters) +# *Thu, 28 Nov 02 05:47:07 Eastern Standard Time +# *Wed, 04 Sep 02 17:26:37 Srednja Evropa - poletni �as +# *Wed, 13 Nov 2002 ���� 12:35:24 +0900 +# +# +# Just plain weird +# *Sat, 12 55 2002 04:55:53 -0800 (month = 55) +# *Sat, 26 13 2002 07:13:37 -0800 (month = 13 --- only in the Islamic calendar :) +# *sam., 28 sept. 2002 16:47:03 (German) +# *ma, 28 okt 2002 15:56:31 (who knows?) +# *Sat, 30121899 0:0:0 -000 (MST) (what can I say?) +# +sub parse_date +{ + my $input_date = str2time(shift); + my $timezone = time2str('%z', $input_date); + $timezone = sprintf " %s:%s", substr($timezone, 0, 3), substr($timezone, 3, 2); + return time2str('%a, %e %b %T %Y', $input_date) . $timezone; +} + + +################################################################################ +# fix_quotes() +# Arguments: string +# Returns: string with all ' replaced with '' +# +# Take a string and replace all single apostrophes with double apostrophes so +# that SQL will insert them correctly. +# +sub fix_quotes +{ + my $input_string = shift; + $input_string =~ s/'/''/g; + return $input_string; +} +