#!/usr/bin/perl use Mail::Address; use Mail::Field; use MIME::Parser; use Digest::MD5 qw(md5_hex); use Date::Parse qw(str2time); use Date::Format qw(time2str); use Pg; $mime_parser = new MIME::Parser; $mime_parser->output_under("./tmp"); # Read the message from the source file and run it through the MIME::Parser, # giving a MIME:Entity object. open INFILE, "<$ARGV[0]" or die "ERROR: couldn't open input file '$ARGV[0]'.\n"; # $message = new Mail::Internet(\*INFILE); $message = $mime_parser->parse(\*INFILE); # We'll let the headers get re-folded --- the content won't be any # different, and we'll probably unfold them before we store them in # the database anyway. close INFILE; # Set up some defaults. %message_values = ( 'priority' => 'Normal', 'is_html' => false, 'is_multipart' => false, ); $message_part_boundary = ''; # Extract the various bits of the message. # Raw headers: $message_headers = $message->head(); $message_values{'raw_headers'} = fix_quotes($message_headers->as_string()); # Raw content (body): $message_lines = $message->body(); # For some reason printf and sprintf only print the first line if # given the entire body array, so we have to build the body line by # line, just to be annoying. foreach $line (@$message_lines) { $message_content .= sprintf "%s", fix_quotes($line); } $message_values{'raw_content'} = $message_content; $message_values{'content_md5'} = Digest::MD5::md5_hex($message_content); # Subject: chomp ($message_subject = $message_headers->get('Subject')); $message_values{'subject'} = fix_quotes($message_subject); # Sender: $message_sender = new Mail::Field('From', $message_headers->get('From')); # There can be only one... chomp ($message_values{'from_name'} = sprintf "%s", fix_quotes($message_sender->names())); chomp ($message_values{'from_addr'} = sprintf "%s", $message_sender->addresses()); # Date: # Note we don't worry about parsing this just yet. See below. chomp ($message_raw_date = $message_headers->get('Date')); $message_values{'date'} = $message_raw_date; # Priority: $message_priority = new Mail::Field('Priority', find_priority()); chomp ($message_values{'priority'} = $message_priority->stringify()); # $message_content_type = new Mail::Field('Content-type', $message_headers->get('Content-type')); # ($type, $subtype) = split('/', $message_content_type->type()); # $message_is_html = ($subtype eq 'html'); # Get the message recipients. There shouldn't be any BCC headers on incoming # messages, so we won't bother looking. # $message_to_recipients = new Mail::Field('To', $message_headers->get('To')); # $message_cc_recipients = new Mail::Field('Cc', $message_headers->get('CC')); # if ($message->is_multipart()) # { # $message_parts = $message->parts(); # printf "This is a multipart message with %d parts.\n", $#message_parts + 1; # } # print "This is an HTML message.\n" if ($message_is_html); print "Subject: [$message_values{'subject'}]\n"; print "From: [$message_values{'from_name'} <$message_values{'from_addr'}>]\n"; print "Date: [$message_values{'date'}]\n"; # chomp ($to = $message_to_recipients->stringify()); # print "To: [$to]\n"; # # chomp ($cc = $message_to_recipients->stringify()); # print "CC: [$cc]\n"; # chomp ($content_type = $message_content_type->stringify()); # print "Content type: [$content_type]\n"; print "Priority: [$message_values{'priority'}]\n"; # printf "Body has %d lines:\n", scalar(@$message_body); print "MD5: $message_values{'content_md5'}\n"; # print @$message_body; # $message->dump_skeleton(); # Try to connect to the database. print "Connecting to database..."; $conn = Pg::connectdb('dbname=spam'); ($conn->status == Pg::PGRES_CONNECTION_OK) or die "failed\n" . $conn->errorMessage; $message_values{'connection'} = $conn; print "OK\n"; # Try to insert the message. print 'Inserting...'; $not_parsed_yet = true; # Loop until we either get a successful insert, a fatal error occurs, # or we've tried reinserting the date after parsing and that failed too. while (true) { eval { try_insert(%message_values); }; last unless ($@ =~ /ERROR/); # If we get a "Bad timestamp" error, parse the date # with Date::Parse then try again. if ($@ =~ /Bad timestamp/) { $not_parsed_yet or die $@; print 'parsing date...'; $message_values{'date'} = parse_date($message_raw_date); $not_parsed_yet = false; } else { die $@; } } print "OK\n"; ################################################################################ # find_priority() # Arguments: none # Returns: string # # Get the message priority (if one exists). We check for all the likely headers: # 1. First look for a Priority: header. # 2. If that fails, look for an X-Priority header. # 3. If that fails, look for an X-MSMail-Priority header (although it's # probably unlikely that this will exist alone without some other priority # header also). # 4. If all else fails, default to "Normal". # sub find_priority { $message_headers->get('Priority') || $message_headers->get('X-Priority') || $message_headers->get('X-MSMail-Priority') || 'Normal'; } ################################################################################ # try_insert() # Arguments: hash containing all the values pertinent to a message # Returns: string # # Take a string containing containing an SQL INSERT statement and execute it # against the database. If an error occurs, die with an appropriate exception, # otherwise return the empty string. # sub try_insert { my $message_values = @_; my $insert_string = 'insert into message (subject, sender_name, sender_address, ' . 'time_sent, priority, raw_headers, raw_content, content_hash, raw_source, ' . 'is_html, is_multipart) values (' . "'$message_values{'subject'}', " . "'$message_values{'from_name'}', '$message_values{'from_addr'}', " . "'$message_values{'date'}', " . "'$message_values{'priority'}', " . "'$message_values{'raw_headers'}', " . "'$message_values{'raw_content'}', " . "'$message_values{'content_md5'}', " . "'$message_values{'raw_headers'}" . "\n" . "$message_values{'raw_content'}', " . "'$message_values{'is_html'}', " . "'$message_values{'is_multipart'}');"; my $conn = $message_values{'connection'}; my $result = $conn->exec($insert_string); my $result_status = $result->resultStatus; return '' if ($result_status == Pg::PGRES_COMMAND_OK); die $conn->errorMessage; } ################################################################################ # parse_date() # Arguments: string containing a date value # Returns: string containing a (hopefully) better-formed date value # # Take a string containing a date value that the DBMS doesn't like, and # attempt to parse it into something that the DBMS will accept. Particularly # misguided dates are simply converted into the epoch. I may look at doing # something a bit more sophisticated with this later. # # Here are some encountered examples of broken dates. The ones marked * can't # be parsed by str2time and will be need to be dealt with separately, if at # all. # # Invalid time zones # Fri, 02 Aug 2002 21:27:39 -0580 # Fri, 06 Sep 2002 12:09:39 -1900 # Mon, 02 Sep 2002 16:34:36 -1600 # Mon, 14 Oct 2002 07:12:46 -1800 # Mon, 25 Nov 2002 04:10:48 -2000 # Mon, 28 Oct 2002 19:31:48 +1300 (although quite feasible with DST in NZ) # Sun, 08 Sep 2002 14:25:51 -1700 # # Broken time zones # Mon, 16 Sep 2002 02:08:20 00000 # Fri, 30 Aug 2002 9:56:13 +-0700 (seems to be read as +0700) # *Fri, 06 Dec 2002 12:38:52 +0-900 # *Sat, 09 Feb 2002 11:15:40 +0-200 # *Fri, 27 Sep 02 04:35:46 中国标准时间 (foreign characters) # *Thu, 28 Nov 02 05:47:07 Eastern Standard Time # *Wed, 04 Sep 02 17:26:37 Srednja Evropa - poletni 鑑s # *Wed, 13 Nov 2002 坷饶 12:35:24 +0900 # # # Just plain weird # *Sat, 12 55 2002 04:55:53 -0800 (month = 55) # *Sat, 26 13 2002 07:13:37 -0800 (month = 13 --- only in the Islamic calendar :) # *sam., 28 sept. 2002 16:47:03 (German) # *ma, 28 okt 2002 15:56:31 (who knows?) # *Sat, 30121899 0:0:0 -000 (MST) (what can I say?) # sub parse_date { my $input_date = str2time(shift); my $timezone = time2str('%z', $input_date); $timezone = sprintf " %s:%s", substr($timezone, 0, 3), substr($timezone, 3, 2); return time2str('%a, %e %b %T %Y', $input_date) . $timezone; } ################################################################################ # fix_quotes() # Arguments: string # Returns: string with all ' replaced with '' # # Take a string and replace all single apostrophes with double apostrophes so # that SQL will insert them correctly. # sub fix_quotes { my $input_string = shift; $input_string =~ s/'/''/g; return $input_string; }