• Added initial files.

nigel.stanger / Publications

Browse code • Added initial files. DP_2017
1 parent eddf6f3 commit d188ab844cee6537f5aa14391fb6d0873fe46d83 Nigel Stanger authored on 30 May 2017

Patch

Showing 4 changed files

Ignore Space Show notes View DP_2017/DP.bib 0 → 100644
@techreport{Brown.A-2006a-Automatic, Address = {London, UK}, Author = {Adrian Brown}, Institution = {The National Archives}, Month = {7~} # mar, Number = {DPTP-01}, Title = {Automatic Format Identification Using PRONOM and DROID}, Type = {Digital Preservation Technical Paper}, Url = {http://www.nationalarchives.gov.uk/aboutapps/fileformat/pdf/automatic\_format\_identification.pdf}, Year = {2006}} @techreport{Brown.A-2006a-PRONOM, Address = {London, UK}, Author = {Adrian Brown}, Institution = {The National Archives}, Month = {27~} # jul, Number = {DPTP-02}, Title = {The PRONOM Unique Identifier Scheme}, Type = {Digital Preservation Technical Paper}, Url = {http://www.nationalarchives.gov.uk/aboutapps/pronom/pdf/pronom\_unique\_identifier\_scheme.pdf}, Year = {2006}} @misc{Cerf.V-2015a-Digital, Author = {Vinton Cerf}, Howpublished = {Presentation at the American Association for the Advancement of Science 2015 Annual Meeting, San Jose, California, USA}, Month = {13~} # feb, Title = {Digital vellum}, Url = {https://aaas.confex.com/aaas/2015/webprogram/Paper14064.html}, Year = {2015}} @techreport{Gillespie.J-2004a-Coping, Address = {Sydney, Australia}, Author = {Julian Gillespie and Patrick Fair and Adrian Lawrence and David Vaile}, Institution = {Cyberspace Law {\&} Policy Centre, University of New South Wales}, Note = {Accessed on 12 May 2015}, Title = {Coping when everything is digital? 
{D}igital documents and issues in document retention}, Type = {White paper}, Url = {http://www.cyberlawcentre.org/ddr/}, Year = {2004}} @misc{JHOVE-2009a, Author = {{JHOVE}}, Note = {Accessed on 12 May 2015}, Title = {{JSTOR}/{H}arvard {O}bject {V}alidation {E}nvironment}, Url = {http://hul.harvard.edu/jhove/}, Year = {2009}} @article{Kolowich.S-2009a-Archiving, Author = {Steve Kolowich}, Journal = {The Chronicle of Higher Education}, Month = apr # {~10}, Note = {Accessed on 12 May 2015}, Title = {Archiving writers' work in the age of e-mail}, Url = {http://chronicle.com/article/Archiving-Writers-Work-in/22770}, Year = {2009}} @inproceedings{Li.Q-2010a-SVM, Author = {Qiming Li and Alvin Y. Ong and Ponnuthurai N. Suganthan and Vrizlynn L. L. Thing}, Crossref = {WDFIA-2010a-Proceedings}, Title = {A novel support vector machine approach to high entropy data fragment classification}, Url = {http://isis.poly.edu/~qiming/publications/wdfia10.pdf}} @misc{Microsoft-2014a-Word2013, Author = {{Microsoft Corporation}}, Note = {Accessed on 12 May 2015}, Title = {File formats that are supported in {W}ord 2013}, Url = {https://technet.microsoft.com/en-us/library/dd797428(v=office.15).aspx\#section1}, Year = {2014}} @inproceedings{Mokhov.S-2008a-File, Author = {Serguei A. 
Mokhov and Mourad Debbabi}, Bibsource = {DBLP, http://dblp.uni-trier.de}, Crossref = {Gobel.O-2008a-IMF}, Pages = {73--86}, Title = {File type analysis using signal processing techniques and machine learning vs.\ file {U}nix utility for forensic analysis}, Url = {http://subs.emis.de/LNI/Proceedings/Proceedings140/gi-proc-140-007.pdf}} @article{Mount.R-2009a-Data, Author = {Richard Mount}, Journal = {Ariadne}, Month = jan, Note = {Accessed on 12 May 2015}, Number = {58}, Title = {Data preservation and long-term analysis in high-energy physics}, Url = {http://www.ariadne.ac.uk/issue58/dplta-hep-rpt/}, Year = {2009}} @misc{NatLib.NZ-2007a-Metadata, Author = {{National Library of New Zealand}}, Date-Added = {2017-05-08 09:45:26 +0000}, Date-Modified = {2017-05-08 09:46:27 +0000}, Note = {Accessed on 12 May 2015}, Title = {Metadata {E}xtraction {T}ool}, Url = {http://meta-extractor.sourceforge.net/}, Year = {2007}} @article{Pringle.H-2010a-NASA, Author = {Heather Pringle}, Doi = {10.1126/science.327.5971.1322}, Journal = {Science}, Month = mar, Number = {5971}, Pages = {1322--1323}, Title = {{NASA} dives into its past to retrieve vintage satellite data}, Volume = {327}, Year = {2010}} @inproceedings{Roussev.V-2009a-File, Author = {Vassil Roussev and Simson L. 
Garfinkel}, Bibsource = {DBLP, http://dblp.uni-trier.de}, Crossref = {SADFE-2009a-Proceedings}, Doi = {10.1109/SADFE.2009.21}, Pages = {3--14}, Title = {File fragment classification---{T}he case for specialized approaches}, Year = {2009}} # Crossrefs @proceedings{Gobel.O-2008a-IMF, Address = {Mannheim, Germany}, Bibsource = {DBLP, http://dblp.uni-trier.de}, Booktitle = {Proceedings of the 2008 Conference on IT-Incidents Management and IT-Forensics (IMF 2008)}, Editor = {Oliver G{\"o}bel and Sandra Frings and Detlef G{\"u}nther and Jens Nedon and Dirk Schadt}, Isbn = {978-3-88579-234-5}, Month = sep # {~23-25}, Publisher = {Gesellschaft f{\"u}r Informatik---GI (German Computer Society)}, Series = {Lecture Notes in Informatics}, Title = {Proceedings of the 2008 Conference on IT-Incidents Management and IT-Forensics (IMF 2008)}, Volume = {140}, Year = {2008}} @proceedings{SADFE-2009a-Proceedings, Address = {Berkeley, California, USA}, Bibsource = {DBLP, http://dblp.uni-trier.de}, Booktitle = {Proceedings of the Fourth International IEEE Workshop on Systematic Approaches to Digital Forensic Engineering (SADFE 2009)}, Isbn = {978-0-7695-3792-4}, Month = may # {~21}, Publisher = {IEEE Computer Society}, Title = {Proceedings of the Fourth International IEEE Workshop on Systematic Approaches to Digital Forensic Engineering (SADFE 2009)}, Year = {2009}} @proceedings{WDFIA-2010a-Proceedings, Address = {Port Elizabeth, South Africa}, Booktitle = {Proceedings of the Fifth Annual Workshop on Digital Forensics and Incident Analysis (WDFIA)}, Date-Added = {2011-05-13 15:54:22 +1200}, Date-Modified = {2011-05-13 15:55:34 +1200}, Month = {17--18~} # may, Title = {Proceedings of the Fifth Annual Workshop on Digital Forensics and Incident Analysis (WDFIA)}, Year = {2010}}

Ignore Space Show notes View

DP_2017/DP.bib 0 → 100644

% The National Archives technical paper DPTP-01 on automatic format identification.
% PRONOM and DROID are braced so that sentence-casing styles keep them in capitals.
@techreport{Brown.A-2006a-Automatic,
  author      = {Brown, Adrian},
  title       = {Automatic Format Identification Using {PRONOM} and {DROID}},
  type        = {Digital Preservation Technical Paper},
  number      = {DPTP-01},
  institution = {The National Archives},
  address     = {London, UK},
  month       = {7~} # mar,
  year        = {2006},
  url         = {http://www.nationalarchives.gov.uk/aboutapps/fileformat/pdf/automatic\_format\_identification.pdf},
}

% The National Archives technical paper DPTP-02 on the PRONOM identifier scheme.
% PRONOM is braced so that sentence-casing styles keep the acronym in capitals.
@techreport{Brown.A-2006a-PRONOM,
  author      = {Brown, Adrian},
  title       = {The {PRONOM} Unique Identifier Scheme},
  type        = {Digital Preservation Technical Paper},
  number      = {DPTP-02},
  institution = {The National Archives},
  address     = {London, UK},
  month       = {27~} # jul,
  year        = {2006},
  url         = {http://www.nationalarchives.gov.uk/aboutapps/pronom/pdf/pronom\_unique\_identifier\_scheme.pdf},
}

% Conference presentation; the venue is described in howpublished.
@misc{Cerf.V-2015a-Digital,
  author       = {Cerf, Vinton},
  title        = {Digital vellum},
  howpublished = {Presentation at the American Association for the Advancement of Science 2015 Annual Meeting, San Jose, California, USA},
  month        = {13~} # feb,
  year         = {2015},
  url          = {https://aaas.confex.com/aaas/2015/webprogram/Paper14064.html},
}

% White paper on document retention; the capital after the question mark is
% brace-protected so that sentence-casing styles do not lower it.
@techreport{Gillespie.J-2004a-Coping,
  author      = {Gillespie, Julian and Fair, Patrick and Lawrence, Adrian and Vaile, David},
  title       = {Coping when everything is digital? {Digital} documents and issues in document retention},
  type        = {White paper},
  institution = {Cyberspace Law {\&} Policy Centre, University of New South Wales},
  address     = {Sydney, Australia},
  year        = {2004},
  note        = {Accessed on 12 May 2015},
  url         = {http://www.cyberlawcentre.org/ddr/},
}

% Tool web site; the author is double-braced so it is treated as a single
% (corporate) name rather than parsed into first/last parts.
@misc{JHOVE-2009a,
  author = {{JHOVE}},
  title  = {{JSTOR}/{Harvard} {Object} {Validation} {Environment}},
  year   = {2009},
  note   = {Accessed on 12 May 2015},
  url    = {http://hul.harvard.edu/jhove/},
}

% Article in The Chronicle of Higher Education.
% The day is placed before the month macro, matching the day-first form used by
% the other day-qualified Month fields in this file (e.g. {7~} # mar).
@article{Kolowich.S-2009a-Archiving,
  author  = {Kolowich, Steve},
  title   = {Archiving writers' work in the age of e-mail},
  journal = {The Chronicle of Higher Education},
  month   = {10~} # apr,
  year    = {2009},
  note    = {Accessed on 12 May 2015},
  url     = {http://chronicle.com/article/Archiving-Writers-Work-in/22770},
}

% Workshop paper; no year, month or booktitle is set here because the entry
% points at WDFIA-2010a-Proceedings through crossref.
% The tilde in the URL is written as \textasciitilde: a bare ~ is a non-breaking
% space when the URL is typeset as ordinary text.
% NOTE(review): assumes the bibliography style prints Url in text mode (the other
% URLs in this file escape _ and # for the same reason) -- confirm.
@inproceedings{Li.Q-2010a-SVM,
  author   = {Li, Qiming and Ong, Alvin Y. and Suganthan, Ponnuthurai N. and Thing, Vrizlynn L. L.},
  title    = {A novel support vector machine approach to high entropy data fragment classification},
  crossref = {WDFIA-2010a-Proceedings},
  url      = {http://isis.poly.edu/{\textasciitilde}qiming/publications/wdfia10.pdf},
}

% Vendor web page; the corporate author is double-braced so it is kept as one name.
@misc{Microsoft-2014a-Word2013,
  author = {{Microsoft Corporation}},
  title  = {File formats that are supported in {Word} 2013},
  year   = {2014},
  note   = {Accessed on 12 May 2015},
  url    = {https://technet.microsoft.com/en-us/library/dd797428(v=office.15).aspx\#section1},
}

% Conference paper; proceedings details come from Gobel.O-2008a-IMF through crossref.
@inproceedings{Mokhov.S-2008a-File,
  author    = {Mokhov, Serguei A. and Debbabi, Mourad},
  title     = {File type analysis using signal processing techniques and machine learning vs.\ file {Unix} utility for forensic analysis},
  pages     = {73--86},
  crossref  = {Gobel.O-2008a-IMF},
  url       = {http://subs.emis.de/LNI/Proceedings/Proceedings140/gi-proc-140-007.pdf},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

% Article in Ariadne, issue 58 (no volume is recorded for this entry).
@article{Mount.R-2009a-Data,
  author  = {Mount, Richard},
  title   = {Data preservation and long-term analysis in high-energy physics},
  journal = {Ariadne},
  number  = {58},
  month   = jan,
  year    = {2009},
  note    = {Accessed on 12 May 2015},
  url     = {http://www.ariadne.ac.uk/issue58/dplta-hep-rpt/},
}
	
% Tool web site; the corporate author is double-braced so it is kept as one name.
% Date-Added / Date-Modified are bookkeeping fields carried over unchanged.
@misc{NatLib.NZ-2007a-Metadata,
  author        = {{National Library of New Zealand}},
  title         = {Metadata {Extraction} {Tool}},
  year          = {2007},
  note          = {Accessed on 12 May 2015},
  url           = {http://meta-extractor.sourceforge.net/},
  date-added    = {2017-05-08 09:45:26 +0000},
  date-modified = {2017-05-08 09:46:27 +0000},
}

% Journal article in Science; identified by DOI rather than URL.
@article{Pringle.H-2010a-NASA,
  author  = {Pringle, Heather},
  title   = {{NASA} dives into its past to retrieve vintage satellite data},
  journal = {Science},
  volume  = {327},
  number  = {5971},
  pages   = {1322--1323},
  month   = mar,
  year    = {2010},
  doi     = {10.1126/science.327.5971.1322},
}

% Workshop paper; proceedings details come from SADFE-2009a-Proceedings through crossref.
% The capital after the em-dash is brace-protected against sentence-casing.
@inproceedings{Roussev.V-2009a-File,
  author    = {Roussev, Vassil and Garfinkel, Simson L.},
  title     = {File fragment classification---{The} case for specialized approaches},
  pages     = {3--14},
  year      = {2009},
  crossref  = {SADFE-2009a-Proceedings},
  doi       = {10.1109/SADFE.2009.21},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

# Crossrefs

% Crossref parent for Mokhov.S-2008a-File; it stays after that entry because
% classic BibTeX requires a cross-referenced parent to follow its children.
% Booktitle repeats Title so that child entries inherit a booktitle.
% The day range uses an en-dash (--) and precedes the month macro, matching the
% day-first Month fields elsewhere in this file; acronyms are brace-protected.
@proceedings{Gobel.O-2008a-IMF,
  editor    = {G{\"o}bel, Oliver and Frings, Sandra and G{\"u}nther, Detlef and Nedon, Jens and Schadt, Dirk},
  title     = {Proceedings of the 2008 Conference on {IT}-Incidents Management and {IT}-Forensics ({IMF} 2008)},
  booktitle = {Proceedings of the 2008 Conference on {IT}-Incidents Management and {IT}-Forensics ({IMF} 2008)},
  series    = {Lecture Notes in Informatics},
  volume    = {140},
  publisher = {Gesellschaft f{\"u}r Informatik---GI (German Computer Society)},
  address   = {Mannheim, Germany},
  month     = {23--25~} # sep,
  year      = {2008},
  isbn      = {978-3-88579-234-5},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

% Crossref parent for Roussev.V-2009a-File; it stays after that entry because
% classic BibTeX requires a cross-referenced parent to follow its children.
% Booktitle repeats Title so that child entries inherit a booktitle.
% The day precedes the month macro, matching the day-first Month fields
% elsewhere in this file; acronyms are brace-protected.
@proceedings{SADFE-2009a-Proceedings,
  title     = {Proceedings of the Fourth International {IEEE} Workshop on Systematic Approaches to Digital Forensic Engineering ({SADFE} 2009)},
  booktitle = {Proceedings of the Fourth International {IEEE} Workshop on Systematic Approaches to Digital Forensic Engineering ({SADFE} 2009)},
  publisher = {IEEE Computer Society},
  address   = {Berkeley, California, USA},
  month     = {21~} # may,
  year      = {2009},
  isbn      = {978-0-7695-3792-4},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

% Crossref parent for Li.Q-2010a-SVM; it stays after that entry because
% classic BibTeX requires a cross-referenced parent to follow its children.
% Booktitle repeats Title so that child entries inherit a booktitle.
@proceedings{WDFIA-2010a-Proceedings,
  title         = {Proceedings of the Fifth Annual Workshop on Digital Forensics and Incident Analysis (WDFIA)},
  booktitle     = {Proceedings of the Fifth Annual Workshop on Digital Forensics and Incident Analysis (WDFIA)},
  address       = {Port Elizabeth, South Africa},
  month         = {17--18~} # may,
  year          = {2010},
  date-added    = {2011-05-13 15:54:22 +1200},
  date-modified = {2011-05-13 15:55:34 +1200},
}

Ignore Space Show notes View DP_2017/DP.tex 0 → 100644
\documentclass[12pt]{article} \usepackage[margin=1in]{geometry} \usepackage{fontspec} \usepackage{harvard} \setmainfont{Minion Pro} \setmonofont{Letter Gothic 12 Pitch} \title{title} \author{Nigel Stanger \and Brendon Woodford \and William Sanson} \pagestyle{empty} \begin{document} \maketitle \thispagestyle{empty} \begin{abstract} It’s now common to be unable to open older digital documents because either the creating software has been discontinued, or it no longer supports that format. Worse, modern versions of software may open old documents but lose elements of the original (e.g., formatting). More precise identification of the software version that created a file would enable better recovery or migration of the file. This paper describes preliminary work on attempts to extract such information from old Microsoft Word documents. \end{abstract} \section{Introduction} The world is awash with digital documents going back several decades, with many of significant historical, cultural, scientific, or legal importance. Most people will probably access only very recent documents during normal activities, but there are many reasons to access older digital documents, such as studying the works of important writers \cite{Kolowich.S-2009a-Archiving}, re-analysing old research data using new methods \cite{Mount.R-2009a-Data,Pringle.H-2010a-NASA}, and finding new evidence for a “cold case” through forensic examination. However, digital documents tend to become progressively more difficult to open over time as the software that created them evolves, or is even discontinued. Even documents younger than 20 years are not safe: e.g., Microsoft Word 2013 cannot open documents created by Word 95 or earlier \cite{Microsoft-2014a-Word2013}, and it can be hard to even find a computer that can run such “antique” software. 
Vinton \citeasnoun{Cerf.V-2015a-Digital}, one of the creators of the Internet, recently warned that we are in danger of a “forgotten generation, or even a forgotten century” due to this “bit rot”. A less obvious issue is that even when modern software \emph{can} open an old document, it may not accurately reproduce the document’s original form due to changes in functionality. This is analogous to human languages, where grammar and meaning change significantly over time. A person who knows only modern English will struggle to accurately comprehend Chaucer. Similarly, Word 2013 may struggle to accurately interpret a document created by Word 98. This issue arises much sooner with digital documents due to the rapid pace of software evolution, and may manifest as anything from subtle layout changes through to entire elements (e.g., graphics) being omitted. Thus, when we open an old document with modern software \textbf{we cannot guarantee that it truly represents the original in both appearance and content}. This is a significant issue with historical or cultural material \cite{Kolowich.S-2009a-Archiving}, and is extremely dangerous in a forensic or legal context, where the ability to accurately reproduce a document in its original form may be crucial \cite{Gillespie.J-2004a-Coping}. Imagine, for example, if the Treaty of Waitangi was originally created in digital form, but when opened 20 years later, important parts were either not displayed at all or were differently formatted. This could completely change the meaning of the document. It is therefore essential from a preservation perspective to open a digital document with the same—or the nearest possible—version of the software that was used to create it. Unfortunately, identifying the correct version is not always a simple task for older digital documents. 
There are several resources for identifying document formats and extracting useful metadata from them, including the Unix \texttt{file} tool, \citeasnoun{JHOVE-2009a}, DROID \cite{Brown.A-2006a-Automatic}, the UK National Archives’ PRONOM database \cite{Brown.A-2006a-PRONOM}, and the \possessivecite{NatLib.NZ-2007a-Metadata} Metadata Extraction Tool. Most of these use well-known patterns or “signatures” specific to particular document formats. Signature-based methods can typically identify at least the broad class of document format (e.g., Microsoft Word), and can sometimes be more specific (e.g., Word 6/95 vs.\ Word 97–2003). They cannot however identify the specific software version used (e.g., Word 95 version 1.1, or even Word 6 vs.\ Word 95), except in very limited cases. This is because the key differences across software versions are more likely related to the functionality offered (e.g., a new version of a word processor might add “tables”) than to the document format, which may be the same across several different software versions. \textbf{Features or characteristics indicating specific functionality} may thus help identify the range of possible software versions. \textbf{This is a classification problem that is amenable to automated machine learning}. Machine learning has already been used in digital forensics to identify document formats, but only in the contexts of more reliably identifying the \emph{general type} of a document \cite{Mokhov.S-2008a-File} rather than which specific software version created it, and identifying the format of file \emph{fragments} rather than complete documents \cite{Li.Q-2010a-SVM,Roussev.V-2009a-File}. This research will therefore extend prior work in this area, provide important document classification tools for the digital preservation and digital forensics communities, and open a new application area for machine learning researchers. \bibliographystyle{dcu} \bibliography{DP} \end{document}

Ignore Space Show notes View

DP_2017/DP.tex 0 → 100644

\documentclass[12pt]{article}

\usepackage[margin=1in]{geometry}
\usepackage{fontspec}
\usepackage{harvard}

\setmainfont{Minion Pro}
\setmonofont{Letter Gothic 12 Pitch}

\title{title}
\author{Nigel Stanger \and Brendon Woodford \and William Sanson}

\pagestyle{empty}

\begin{document}

\maketitle
\thispagestyle{empty}

\begin{abstract}
It’s now common to be unable to open older digital documents because either the creating software has been discontinued, or it no longer supports that format. Worse, modern versions of software may open old documents but lose elements of the original (e.g., formatting). More precise identification of the software version that created a file would enable better recovery or migration of the file. This paper describes preliminary work on attempts to extract such information from old Microsoft Word documents.
\end{abstract}

\section{Introduction}

The world is awash with digital documents going back several decades, with many of significant historical, cultural, scientific, or legal importance. Most people will probably access only very recent documents during normal activities, but there are many reasons to access older digital documents, such as studying the works of important writers \cite{Kolowich.S-2009a-Archiving}, re-analysing old research data using new methods \cite{Mount.R-2009a-Data,Pringle.H-2010a-NASA}, and finding new evidence for a “cold case” through forensic examination. However, digital documents tend to become progressively more difficult to open over time as the software that created them evolves, or is even discontinued. Even documents younger than 20 years are not safe: e.g., Microsoft Word 2013 cannot open documents created by Word 95 or earlier \cite{Microsoft-2014a-Word2013}, and it can be hard to even find a computer that can run such “antique” software. Vinton \citeasnoun{Cerf.V-2015a-Digital}, one of the creators of the Internet, recently warned that we are in danger of a “forgotten generation, or even a forgotten century” due to this “bit rot”.

A less obvious issue is that even when modern software \emph{can} open an old document, it may not accurately reproduce the document’s original form due to changes in functionality. This is analogous to human languages, where grammar and meaning change significantly over time. A person who knows only modern English will struggle to accurately comprehend Chaucer. Similarly, Word 2013 may struggle to accurately interpret a document created by Word 98. This issue arises much sooner with digital documents due to the rapid pace of software evolution, and may manifest as anything from subtle layout changes through to entire elements (e.g., graphics) being omitted. Thus, when we open an old document with modern software, \textbf{we cannot guarantee that it truly represents the original in both appearance and content}. This is a significant issue with historical or cultural material \cite{Kolowich.S-2009a-Archiving}, and is extremely dangerous in a forensic or legal context, where the ability to accurately reproduce a document in its original form may be crucial \cite{Gillespie.J-2004a-Coping}. Imagine, for example, that the Treaty of Waitangi had originally been created in digital form, but that when it was opened 20 years later, important parts were either not displayed at all or were formatted differently. This could completely change the meaning of the document.

It is therefore essential from a preservation perspective to open a digital document with the same—or the nearest possible—version of the software that was used to create it. Unfortunately, identifying the correct version is not always a simple task for older digital documents. There are several resources for identifying document formats and extracting useful metadata from them, including the Unix \texttt{file} tool, \citeasnoun{JHOVE-2009a}, DROID \cite{Brown.A-2006a-Automatic}, the UK National Archives’ PRONOM database \cite{Brown.A-2006a-PRONOM}, and the \possessivecite{NatLib.NZ-2007a-Metadata} Metadata Extraction Tool. Most of these use well-known patterns or “signatures” specific to particular document formats. Signature-based methods can typically identify at least the broad class of document format (e.g., Microsoft Word), and can sometimes be more specific (e.g., Word 6/95 vs.\ Word 97–2003). They cannot, however, identify the specific software version used (e.g., Word 95 version 1.1, or even Word 6 vs.\ Word 95), except in very limited cases. This is because the key differences across software versions are more likely to be related to the functionality offered (e.g., a new version of a word processor might add “tables”) than to the document format, which may be the same across several different software versions.

\textbf{Features or characteristics indicating specific functionality} may thus help identify the range of possible software versions. \textbf{This is a classification problem that is amenable to automated machine learning}. Machine learning has already been used in digital forensics to identify document formats, but only in the contexts of more reliably identifying the \emph{general type} of a document \cite{Mokhov.S-2008a-File} rather than which specific software version created it, and identifying the format of file \emph{fragments} rather than complete documents \cite{Li.Q-2010a-SVM,Roussev.V-2009a-File}. This research will therefore extend prior work in this area, provide important document classification tools for the digital preservation and digital forensics communities, and open a new application area for machine learning researchers.

\bibliographystyle{dcu}
\bibliography{DP}

\end{document}

Ignore Space Show notes View DP_2017/Project_Summary.docx 0 → 100644
Not supported

Ignore Space Show notes View DP_2017/Sanson_William_Supervisors_Completion_Report_2016_17.docx 0 → 100644
Not supported

Show line notes below