Newer
Older
spamdb / Scripts / parse_email
nstanger on 6 Dec 2002 3 KB Initial import of spam DB stuff.
#!/usr/bin/perl

use Mail::Address;
use Mail::Field;

use strict;
use warnings;

# parse_email: read one e-mail message from the file named on the command
# line, split it into headers and body, pick out a few "special" headers
# (subject, sender, date, priority, content type) and print a summary.
#
# Usage: parse_email <message-file>
#
# Dies if no file name is given or if the file cannot be opened.

my $input_path = $ARGV[0];
die "Usage: $0 <message-file>\n" unless defined $input_path;

# Three-argument open with a lexical handle, so that characters in the file
# name can never be interpreted as part of the open mode.
open(my $infile, '<', $input_path)
	or die "Cannot open '$input_path' for reading: $!\n";

# TRUE if we're currently processing the headers.
my $process_headers = 1;

# Variables for holding header information. The "current" header is the one
# still being read (it may continue over several folded lines).
my $current_header_name = '';
my $current_header_value = '';
my $header_count = 0;
my @header_names = ();
my @header_values = ();

# Variables for holding body information.
my $body_line_count = 0;

# Set up some defaults. The subject/sender/date default to the empty string
# so that a message lacking them still prints cleanly under "use warnings".
my $message_subject = '';
my $message_sender = '';
my $message_time_sent = '';
my $message_priority = 'Normal';
my $message_charset = 'iso-8859-1';
my $message_raw_headers = '';
my $message_raw_body = '';
my $message_raw_source = '';
my $message_has_html = 0;

my $message_is_multipart = 0;
my $message_part_boundary = '';

# Names for the numeric priority values 1 to 5.
my %priorities_lookup =	(	1	=>	'Highest',
							2	=>	'High',
							3	=>	'Normal',
							4	=>	'Low',
							5	=>	'Lowest',
						);

# File away the header currently being read (if there is one) and reset the
# "current header" state. Calling this when no header is pending does nothing,
# so it is safe to call at the end of the headers and again at end of file.
sub store_current_header
{
	return if $current_header_name eq '';

	push @header_names, $current_header_name;
	push @header_values, $current_header_value;
	$header_count = scalar @header_names;

	$current_header_name = '';
	$current_header_value = '';
}

# Look for "charset" and "boundary" parameters in a piece of Content-Type
# header text (either the first line or a folded continuation line) and
# record whatever is found.
sub scan_content_type_params
{
	my ($text) = @_;

	# The value stops at a quote, semicolon or whitespace, so a closing
	# quote is never captured as part of the character set name.
	if ($text =~ /\bcharset\s*=\s*"?([^";\s]+)/i)
	{
		$message_charset = $1;
	}

	if ($text =~ /\bboundary\s*=\s*(?:"([^"]+)"|([^";\s]+))/i)
	{
		$message_part_boundary = defined $1 ? $1 : $2;
	}
}

while (my $line = <$infile>)
{
	# Everything after the header/body separator is body text.
	unless ($process_headers)
	{
		$message_raw_body .= $line;
		$body_line_count++;
		next;
	}

	# Read lines until we hit the first blank line, which separates the
	# headers from the body. Allow for CRLF line endings.
	if ($line =~ /^\r?$/)
	{
		# End of headers, switch to body processing mode.
		$process_headers = 0;

		# But don't forget to store the last header that we found!
		store_current_header();

		# Drop out now so that we don't append the blank line
		# to $message_raw_headers.
		next;
	}
	elsif ($line =~ /^([-\w]+): ?(.*?)\r?$/)
	{
		my ($name, $value) = ($1, $2);

		# We've found the beginning of a new header. File away the previous
		# header for later reference.
		store_current_header();

		$current_header_name = $name;
		$current_header_value = $value;

		# Store the "special" headers (like date, sender, etc.) in the
		# appropriate variables. Header names are compared without regard
		# to case (e.g. "Content-type" and "Content-Type" are the same).
		my $lc_name = lc $name;

		if ($lc_name eq 'subject')
		{
			$message_subject = $value;
		}
		elsif ($lc_name eq 'from')
		{
			$message_sender = $value;
		}
		elsif ($lc_name eq 'date')
		{
			$message_time_sent = $value;
		}
		elsif ($lc_name =~ /priority$/)
		{
			# Covers Priority, X-Priority, X-MSMail-Priority, etc. A leading
			# digit 1-5 is translated to its name; anything else is kept
			# as given.
			if ($value =~ /^\s*([1-5])\b/)
			{
				$message_priority = $priorities_lookup{$1};
			}
			else
			{
				$message_priority = $value;
			}
		}
		elsif ($lc_name eq 'content-type')
		{
			# The media type is everything up to the first semicolon or
			# whitespace; parameters such as charset and boundary follow.
			my $media_type = '';
			if ($value =~ m{^\s*([^;\s]+)})
			{
				$media_type = lc $1;
			}

			$message_has_html = ($media_type eq 'text/html') ? 1 : 0;
			$message_is_multipart = ($media_type =~ m{^multipart/}) ? 1 : 0;

			scan_content_type_params($value);
		}
	}
	elsif ($line =~ /^([ \t]+.*?)\r?$/)
	{
		my $continuation = $1;

		# This line's still part of the current header (i.e., the header's
		# been folded). Just add this line to the current header.
		$current_header_value .= "\n$continuation";

		# Content-Type parameters are often placed on a folded line. Only
		# look at continuations of Content-Type itself, so that other folded
		# headers cannot change the character set by accident.
		if (lc($current_header_name) eq 'content-type')
		{
			scan_content_type_params($continuation);
		}
	}

	$message_raw_headers .= $line;
}

# A message that ends without a blank line (headers only) still has a
# pending header at this point; make sure it is counted.
store_current_header();

close $infile;

$message_raw_source = "$message_raw_headers\n$message_raw_body";

print "Found $header_count headers:\n";

for my $i (0 .. $header_count - 1)
{
	print "[$header_names[$i]] = [$header_values[$i]]\n";
}

my $html_note = $message_has_html
	? "Message contains HTML\n"
	: "Message doesn't contain HTML\n";

print "\nSubject: $message_subject\n";
print "Sender: $message_sender\n";
print "Time sent: $message_time_sent\n";
print "Priority: $message_priority\n";
print "Character set: $message_charset\n";
print $html_note;

print "\nBody has $body_line_count lines.\n";