GitBucket
4.21.2
Toggle navigation
Snippets
Sign in
Files
Branches
1
Releases
Issues
Pull requests
Labels
Priorities
Milestones
Wiki
Forks
nigel.stanger
/
spamdb
Browse code
Second wave of updates.
master
1 parent
020b792
commit
e606761e271c0356d0e88a3770b1a3d678ba29fd
nstanger
authored
on 12 Dec 2002
Patch
Showing
2 changed files
Scripts/Dump messages.applescript
Scripts/Extract messages.applescript
Ignore Space
Show notes
View
Scripts/Dump messages.applescript
0 → 100644
set destPath to "Waiareka HD:Users:nstanger:Documents:Research:Spam filtering:Corpus:" tell application "Microsoft Entourage" to set theMessages to (get current messages) repeat with thisMsg in theMessages tell application "Microsoft Entourage" set theFilename to ID of thisMsg as string set theSource to source of thisMsg as string end tell set outFile to open for access file (destPath & theFilename & ".txt") with write permission write theSource to outFile close access outFile end repeat beep
Show notes
View
Scripts/Extract messages.applescript
100644 → 0
property WorkingDirectory : "Waiareka HD:Users:nstanger:tmp:" property WorkingPOSIX : POSIX path of "Waiareka HD:Users:nstanger:tmp:" global c, cQ, Qc, QcQ on run set c to ", " set cQ to ", '" set Qc to "', " set QcQ to "', '" try set msgFile to open for access file (WorkingDirectory & "message.sql") with write permission on error e my doError(e) return end try tell application "Microsoft Entourage" to  set theMessages to get current messages repeat with thisMsg in theMessages -- Most of the headers we can just grab from the standard message properties. tell application "Microsoft Entourage" set theSubject to change "'" into "''" in ((subject of thisMsg) as string) set theSender to change "'" into "''" in ((address of sender of thisMsg) as string) set thePriority to (priority of thisMsg) as string set theHeaders to change "'" into "''" in ((headers of thisMsg) as string) set theContent to change "'" into "''" in ((content of thisMsg) as text) set theSource to change "'" into "''" in ((source of thisMsg) as string) set theCharacterSet to character set of thisMsg set theHTML to has html of thisMsg end tell -- application "Microsoft Entourage" -- Grab the headers as a list so we can extract individual headers. set headerList to my parseHeaders(paragraphs of theHeaders) -- We can't just use the time sent property because if the date is weird, Entourage -- invariably mangles into something else. We therefore manually extract the sent date -- from the headers; at least this way we have a fighting chance of getting something -- resembling the actual message date (as long as it's not _too_ misguided). set theTimeSent to my extractSentDate(headerList) set insertMessage to "insert into message (subject, sender, time_sent, " &  "priority, headers, content, source, character_set, has_html) " &  "values ('" & theSubject & QcQ & theSender & QcQ & theTimeSent & QcQ &  thePriority & QcQ & theHeaders & QcQ & theContent & QcQ & theSource &  QcQ & theCharacterSet & Qc & theHTML & ");" try set eof msgFile to 0 write insertMessage to msgFile on error e my doError(e) close access msgFile return end try do shell script "cat " & WorkingPOSIX & "message.sql | tr '\\r' '\\n' > " & WorkingPOSIX &  "message_lf.sql; rm " & WorkingPOSIX & "message.sql" set theResult to (do shell script "psql -d spam -f " & WorkingPOSIX & "message_lf.sql") if (theResult "") then set theMessageID to my retrieveMessageID(theResult) my processRecipients(thisMsg, theMessageID) my processHeaders(headerList, theMessageID) else beep end if end repeat -- with thisMsg in theMessages close access msgFile end run -- Get the recipients of a message. Return as a list in the form -- {{"blah@foo.com", "to"}, "bar@quux.net", "cc"} on getRecipients(thisMsg) local recipientList set recipientList to {} tell application "Microsoft Entourage" repeat with thisRecipient in recipients of thisMsg set recipientList to recipientList &  {{address of address of thisRecipient, first word of (recipient type of thisRecipient as string)}} end repeat end tell return recipientList end getRecipients -- Build and return a list of message headers in the format -- {{"header name", "header value"}, {"header name", "header value"}, {...}}. on parseHeaders(headerList) local parsedHeaders, headerName, headerValue local thisHeader, oldDelim set parsedHeaders to {} set headerName to "" set headerValue to "" set oldDelim to AppleScript's text item delimiters set AppleScript's text item delimiters to {":"} -- Skip the last item of headerList because it's the blank line terminating the headers. repeat with thisHeader in items 1 thru -2 of headerList -- If a line starts with white space, the header has been folded, so add this line to -- the current header instead of creating a new header. if (character 1 of thisHeader is in {" ", " "}) then set headerValue to headerValue & return & " " & contents of thisHeader else -- It's a new header. -- If we're already working on a header, store it before starting the new one. if (headerName "") then  set parsedHeaders to parsedHeaders & {{headerName, headerValue}} -- The header name is the text before the first colon (text item 1). set headerName to text item 1 of (contents of thisHeader) -- Everything else is the header value, but we need to strip off the first space; -- hence text 2 thru -1. set headerValue to text 2 thru -1 of ((text items 2 thru -1 of (contents of thisHeader)) as string) end if -- character 1 of thisHeader is in {" ", " "} end repeat -- with thisHeader in items 1 thru -2 of headerList set AppleScript's text item delimiters to oldDelim -- Add the last header to the list before we finish. set parsedHeaders to parsedHeaders & {{headerName, headerValue}} return parsedHeaders end parseHeaders -- Grabs the date header from the header list. Returns "" if it isn't found. on extractSentDate(headerList) repeat with thisHeader in headerList if (item 1 of thisHeader = "Date") then return item 2 of thisHeader end repeat -- with thisHeader in headerList return "" end extractSentDate -- Retrieve the message ID for a message given the result string from inserting it -- (which contains the PostgreSQL OID of the row). on retrieveMessageID(theOIDString) set theOID to second word of theOIDString -- Use the OID to retrieve the message ID of the message row we just inserted. return (do shell script "psql -d spam -c \"select message_id from message where oid = " &  theOID & ";\" -P tuples_only -P format=unaligned") end retrieveMessageID -- Process and store the recipients of a message. on processRecipients(thisMsg, theMessageID) tell application "Microsoft Entourage" repeat with thisRecipient in recipients of thisMsg set recipientAddress to address of address of thisRecipient set recipientName to change "'" into "''" in (display name of address of thisRecipient as string) set recipientType to first word of (recipient type of thisRecipient as string) do shell script "psql -d spam -c \"insert into recipient (address, recipient_name, recipient_type, message_id) " &  "values ('" & recipientAddress & QcQ & recipientName & QcQ & recipientType & Qc & theMessageID & ");\"" end repeat end tell end processRecipients -- Process and store the headers of a message. on processHeaders(headerList, theMessageID) repeat with thisHeader in headerList set theResult to do shell script "psql -d spam -c \"insert into header (header_name, header_value, message_id) " &  "values ('" & item 1 of thisHeader & QcQ & item 2 of thisHeader & Qc & theMessageID & ");\"" tell me to display dialog theResult end repeat end processHeaders -- Error handler. on doError(e) beep display dialog e end doError
Show line notes below