from __future__ import absolute_import
import os
import re
import time
import string
import glob
import httplib
import mimetypes
import tempfile
import hashlib
import subprocess
import lz4
try:
import cStringIO as StringIO
except ImportError:
import StringIO
try:
import cPickle as pickle
except ImportError:
import pickle
from xml.sax.saxutils import escape
from warnings import warn
from lxml import etree
from lxml import html
from lxml.builder import ElementMaker
from base64 import b64encode, b64decode
from zipfile import ZipFile
from docutils.core import publish_string
# Intra-package imports
from cheshire3.baseObjects import PreParser
from cheshire3.document import StringDocument
from cheshire3.internal import CONFIG_NS
from cheshire3.marc_utils import MARC
from cheshire3.utils import getShellResult, gen_uuid
from cheshire3.exceptions import ConfigFileException, ExternalSystemException,\
MissingDependencyException
# TODO: All PreParsers should set mimetype, and record in/out mimetype
class TypedPreParser(PreParser):
    """PreParser with configurable incoming/outgoing MIME types."""

    _possibleSettings = {
        "inMimeType": {
            'docs': "The mimetype expected for incoming documents"
        },
        "outMimeType": {
            'docs': "The mimetype set on outgoing documents"
        }
    }

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        # Cache configured MIME types; empty string when not configured
        for attr in ('inMimeType', 'outMimeType'):
            setattr(self, attr, self.get_setting(session, attr, ''))
class NormalizerPreParser(PreParser):
    """ Calls a named Normalizer to do the conversion."""

    _possiblePaths = {
        'normalizer': {
            'docs': "Normalizer identifier to call to do the transformation",
            'required': True
        }
    }

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        # The referenced Normalizer is mandatory; fail fast at config time
        self.normalizer = self.get_path(session, 'normalizer', None)
        if self.normalizer is None:
            msg = "Normalizer for {0} does not exist.".format(self.id)
            raise ConfigFileException(msg)

    def process_document(self, session, doc):
        """Return a new StringDocument with the normalized content."""
        data = doc.get_raw(session)
        new = self.normalizer.process_string(session, data)
        return StringDocument(new, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
class UnicodeDecodePreParser(PreParser):
    """PreParser to turn non-unicode into Unicode Documents.

    A UnicodeDecodePreParser should accept a Document with content encoded in
    a non-unicode character encoding scheme and return a Document with the
    same content decoded to Python's Unicode implementation.
    """

    _possibleSettings = {
        'codec': {
            'docs': 'Codec to use to decode to unicode. Defaults to UTF-8'
        }
    }

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        self.codec = self.get_setting(session, 'codec', 'utf-8')

    def process_document(self, session, doc):
        """Decode the raw data with the configured codec.

        A UnicodeDecodeError propagates to the caller unchanged (the
        previous try/except merely re-raised it, which was a no-op).
        """
        data = doc.get_raw(session).decode(self.codec)
        return StringDocument(data, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
class CmdLinePreParser(TypedPreParser):
    """Use an external command-line tool to transform a Document."""

    _possiblePaths = {
        'executable': {'docs': "Name of the executable to run"},
        'executablePath': {'docs': "Path to the executable"},
        'workingPath': {'docs': 'Path to be in when executing command'}
    }

    _possibleSettings = {
        'commandLine': {
            'docs': """\
Command line to use. %INDOC% is substituted to create a temporary file to
read, and %OUTDOC% is substituted for a temporary file for the process to
write to"""
        }
    }

    def __init__(self, session, config, parent):
        TypedPreParser.__init__(self, session, config, parent)
        exe = self.get_path(session, 'executable', '')
        if not exe:
            msg = "Missing mandatory 'executable' path in {0}".format(self.id)
            raise ConfigFileException(msg)
        tp = self.get_path(session, 'executablePath', '')
        if tp:
            exe = os.path.join(tp, exe)
        cl = self.get_setting(session, 'commandLine', '')
        self.cmd = exe + ' ' + cl
        self.working = self.get_path(session, 'workingPath', '')

    def process_document(self, session, doc):
        """Run the configured command over the Document content.

        If the command line contains %INDOC%/%OUTDOC% placeholders, data is
        exchanged via temporary files; otherwise stdin/stdout are used.
        """
        cmd = self.cmd
        # Absence of a placeholder means that channel uses a pipe instead
        stdIn = cmd.find('%INDOC%') == -1
        stdOut = cmd.find('%OUTDOC%') == -1
        if not stdIn:
            # Create temp file for incoming data
            if doc.mimeType or doc.filename:
                # Guess a file extension, as some tools require one
                try:
                    suff = mimetypes.guess_extension(doc.mimeType)
                except Exception:
                    suff = ''
                if not suff:
                    # NOTE(review): guess_extension() expects a MIME type,
                    # not a filename, so this almost always yields None and
                    # we fall through to splitext() below -- confirm intent
                    suff = mimetypes.guess_extension(doc.filename)
                if not suff:
                    (foofn, suff) = os.path.splitext(doc.filename)
                if suff:
                    (qq, infn) = tempfile.mkstemp(suff)
                else:
                    (qq, infn) = tempfile.mkstemp()
            else:
                (qq, infn) = tempfile.mkstemp()
            os.close(qq)
            fh = open(infn, 'w')
            fh.write(doc.get_raw(session))
            fh.close()
            cmd = cmd.replace("%INDOC%", infn)
        if not stdOut:
            # Create temp file for outgoing data
            if self.outMimeType:
                suff = mimetypes.guess_extension(self.outMimeType)
                (qq, outfn) = tempfile.mkstemp(suff)
            else:
                (qq, outfn) = tempfile.mkstemp()
            cmd = cmd.replace("%OUTDOC%", outfn)
            os.close(qq)
        if self.working:
            old = os.getcwd()
            os.chdir(self.working)
        else:
            old = ''
        if stdIn:
            # Feed the document to the process via stdin
            pipe = subprocess.Popen(cmd, bufsize=0, shell=True,
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
            pipe.stdin.write(doc.get_raw(session))
            pipe.stdin.close()
            result = pipe.stdout.read()
            pipe.stdout.close()
            pipe.stderr.close()
            del pipe
        else:
            # Result will read stdout+err regardless
            result = getShellResult(cmd)
            os.remove(infn)
        if not stdOut:
            fh = None
            if os.path.exists(outfn) and os.path.getsize(outfn) > 0:
                fh = open(outfn)
            else:
                # Command probably appended something to the filename
                # Annoying! Have to glob for it
                matches = glob.glob(outfn + "*")
                # Or maybe ignored absolute path and put it in pwd...
                matches2 = glob.glob(os.path.split(outfn)[-1] + '*')
                for m in matches + matches2:
                    if os.path.getsize(m) > 0:
                        fh = open(m)
                        break
            try:
                try:
                    # fh is None when no usable output file was found; the
                    # attribute error is caught below and reported with the
                    # command's own stdout/stderr for diagnosis
                    result = fh.read()
                except Exception:
                    msg = '{0}: {1}'.format(cmd, result)
                    raise ExternalSystemException(msg)
                else:
                    fh.close()
            finally:
                os.remove(outfn)
            try:
                # Clean up when data written elsewhere (globbed filename)
                os.remove(fh.name)
            except OSError:
                pass
        if old:
            os.chdir(old)
        mt = self.outMimeType
        if not mt:
            mt = doc.mimeType
        return StringDocument(result, self.id, doc.processHistory,
                              mimeType=mt, parent=doc.parent,
                              filename=doc.filename)
class FileUtilPreParser(TypedPreParser):
    """Call 'file' util to find out the current type of file."""

    def __init__(self, session, config, parent):
        TypedPreParser.__init__(self, session, config, parent)
        warn(
            '{0} is deprecated in favour of objects available from the '
            'cheshire3.formats package.'.format(self.__class__.__name__),
            DeprecationWarning,
            stacklevel=6
        )

    def process_document(self, session, doc):
        """Sniff the MIME type of the content via ``file -i -b``.

        Sets doc.mimeType in place and returns the same Document.
        """
        cmd = "file -i -b %INDOC%"
        (qq, infn) = tempfile.mkstemp()
        os.close(qq)
        fh = open(infn, 'w')
        fh.write(doc.get_raw(session))
        fh.close()
        cmd = cmd.replace("%INDOC%", infn)
        res = getShellResult(cmd)
        mt = res.strip()
        if mt.find(';') > -1:
            # e.g. "text/plain; charset=us-ascii" -- stash the parameters
            # on the Document as attributes for now
            bits = mt.split(';')
            mt = bits[0]
            for b in bits[1:]:
                # Split once only, in case the value itself contains '=';
                # strip the space left by "; " so the attribute name is
                # usable (previously it was e.g. " charset")
                (key, value) = b.split('=', 1)
                setattr(doc, key.strip(), value)
        if mt == "text/plain":
            # Might be sgml, xml, text etc - ask file for a description
            res = getShellResult("file -b {0}".format(infn))
            mt2 = res.strip()
            if mt2 == "exported SGML document text":
                mt = "text/sgml"
            elif mt2 == "XML document text":
                mt = "text/xml"
            # Others include java, etc. but not very useful to us
        # Remove the temp file (previously leaked)
        os.remove(infn)
        doc.mimeType = mt
        doc.processHistory.append(self.id)
        return doc
class MagicRedirectPreParser(TypedPreParser):
    """Map to appropriate PreParser based on incoming MIME type."""

    def _handleLxmlConfigNode(self, session, node):
        # Handle config in the form:
        # <hash>
        #   <object mimeType="" ref=""/>
        #   ...
        # </hash>
        if node.tag in ['hash', '{%s}hash' % CONFIG_NS]:
            for c in node.iterchildren(tag=etree.Element):
                if c.tag in ['object', '{%s}object' % CONFIG_NS]:
                    mt = c.attrib['mimeType']
                    ref = c.attrib['ref']
                    self.mimeTypeHash[mt] = ref

    def _handleConfigNode(self, session, node):
        # Legacy DOM equivalent of _handleLxmlConfigNode.
        # NOTE(review): `elementType` is not defined in this module's
        # visible scope -- presumably imported elsewhere; verify before
        # relying on this code path.
        if node.localName == "hash":
            for c in node.childNodes:
                if c.nodeType == elementType and c.localName == "object":
                    mt = c.getAttributeNS(None, 'mimeType')
                    ref = c.getAttributeNS(None, 'ref')
                    self.mimeTypeHash[mt] = ref

    def __init__(self, session, config, parent):
        # Default MIME type -> PreParser identifier mapping; entries may be
        # overridden by <hash> configuration handled above
        self.mimeTypeHash = {
            "application/x-gzip": "GunzipPreParser",
            "application/postscript": "PsPdfPreParser",
            "application/pdf": "PdfXmlPreParser",
            "text/html": "HtmlSmashPreParser",
            "text/plain": "TxtToXmlPreParser",
            "text/prs.fallenstein.rst": "RstToXmlPreParser",
            "text/sgml": "SgmlPreParser",
            "application/x-bzip2": "BzipPreParser",
            "application/zip": "ZIPToMETSPreParser",
            ("application/vnd.openxmlformats-officedocument."
             "wordprocessingml.document"): "ZIPToMETSPreParser",  # Word
            ("application/vnd.openxmlformats-officedocument."
             "presentationml.presentation"): "ZIPToMETSPreParser",  # PPT
            ("application/vnd.openxmlformats-officedocument."
             "spreadsheetml.sheet"): "ZIPToMETSPreParser",  # Excel
            ("application/vnd.oasis.opendocument."
             "text"): "ZIPToMETSPreParser",  # ODF Text
            ("application/vnd.oasis.opendocument."
             "presentation"): "ZIPToMETSPreParser",  # ODF Presentation
            ("application/vnd.oasis.opendocument."
             "spreadsheet"): "ZIPToMETSPreParser",  # ODF Spreadsheet(s)
            ("application/vnd.oasis.opendocument."
             "graphics"): "ZIPToMETSPreParser"  # ODF Graphic
            # "application/x-zip": "single zip preparser ?"
        }
        # Now override from config in init:
        TypedPreParser.__init__(self, session, config, parent)

    def process_document(self, session, doc):
        """Dispatch doc to the PreParser registered for its MIME type.

        Falls back to a '*' mapping if present; returns doc unchanged when
        no mapping applies.
        """
        mt = doc.mimeType
        # Need Database from which to fetch potentially custom PreParsers
        db = session.server.get_object(session, session.database)
        if not mt:
            # Nasty kludge - use FileUtilPreParser to determine MIME type
            fu = db.get_object(session, 'FileUtilPreParser')
            doc2 = fu.process_document(session, doc)
            mt = doc2.mimeType
            if not mt and doc.filename:
                # Try and guess from filename
                mts = mimetypes.guess_type(doc.filename)
                if mts and mts[0]:
                    mt = mts[0]
        if mt in self.mimeTypeHash or "*" in self.mimeTypeHash:
            if mt not in self.mimeTypeHash:
                # There is a * mime-type
                # Something to be done for any unmatched type
                mt = '*'
            redirect = db.get_object(session, self.mimeTypeHash[mt])
            if isinstance(redirect, PreParser):
                return redirect.process_document(session, doc)
            else:
                # Only other thing it could legitimately be is workflow
                return redirect.process(session, doc)
        else:
            # Return unaltered Document
            # It may be that it is already the desired mime-type (e.g. XML)
            return doc
# --- HTML PreParsers ---
class HtmlSmashPreParser(PreParser):
    """ Attempts to reduce HTML to its raw text """

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        self.body = re.compile('<body(.*?)</body>', re.S | re.I)
        self.tagstrip = re.compile('<[^>]+>')
        self.title = re.compile('<title[^>]*>(.+?)</title>', re.S | re.I)
        self.script = re.compile('<script(.*?)</script>', re.S | re.I)
        self.style = re.compile('<style(.*?)</style>', re.S | re.I)
        self.comment = re.compile('<!--(.*?)-->', re.S | re.I)

    def process_document(self, session, doc):
        """Strip scripts/styles/comments/tags; keep title + body text."""
        data = self.script.sub('', doc.get_raw(session))
        data = self.style.sub('', data)
        data = self.comment.sub('', data)
        tm = self.title.search(data)
        if tm:
            title = data[tm.start():tm.end()]
        else:
            title = ""
        m = self.body.search(data)
        if m:
            body = data[m.start():m.end()]
        else:
            body = data
        text = self.tagstrip.sub(' ', body)
        # Re-escape stray angle brackets so the output parses as XML, and
        # normalize non-breaking space entities.  (The previous no-op
        # replace('<', '<') calls were HTML-entity decoding damage.)
        text = text.replace('<', '&lt;')
        text = text.replace('>', '&gt;')
        text = text.replace("&nbsp;", ' ')
        text = text.replace("&nbsp", ' ')
        l = text.split()
        text = ' '.join(l)
        data = "<html><head>%s</head><body>%s</body></html>" % (title, text)
        return StringDocument(data, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
class HtmlFixupPreParser(PreParser):
    """Attempt to fix up HTML to make it complete and parseable XML.

    Uses the lxml.html package so as to preserve as much of the intended
    structure as possible.
    """

    def process_document(self, session, doc):
        # Parse leniently, then re-serialize as well-formed markup
        root = html.document_fromstring(doc.get_raw(session))
        # Remove any xmlns to avoid duplication, and hence failed parsing
        if 'xmlns' in root.attrib:
            del root.attrib['xmlns']
        serialized = etree.tostring(root)
        return StringDocument(serialized, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
class RegexpSmashPreParser(PreParser):
    """Strip, replace or keep only data which matches a given regex."""

    _possibleSettings = {
        'char': {
            'docs': """\
Character(s) to replace matches in the regular expression with. Defaults to
empty string (i.e. strip matches)"""
        },
        'regexp': {
            'docs': "Regular expression to match in the data.",
            'required': True
        },
        'keep': {
            'docs': """\
Should instead keep only the matches. Boolean, defaults to False""",
            'type': int,
            'options': "0|1"
        }
    }

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        char = self.get_setting(session, 'char')
        regex = self.get_setting(session, 'regexp')
        self.keep = self.get_setting(session, 'keep')
        if regex:
            # DOTALL so '.' spans newlines
            self.regexp = re.compile(regex, re.S)
        if char:
            self.char = char
        else:
            self.char = ''

    def process_document(self, session, doc):
        """Apply the regex: keep only matches, or substitute them away."""
        data = doc.get_raw(session)
        if self.keep:
            l = self.regexp.findall(data)
            if l and l[0] and isinstance(l[0], tuple):
                # Pattern has groups: findall returns tuples; keep the
                # first group of each match
                l = [e[0] for e in l]
            d2 = self.char.join(l)
        else:
            d2 = self.regexp.sub(self.char, data)
        return StringDocument(d2, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
try:
    import tidy
except ImportError:
    # Gracefully degrade functionality
    class HtmlTidyPreParser(PreParser):
        """Uses TidyLib to turn HTML into XHTML for parsing."""
        def __init__(self, session, config, parent):
            # Report the missing dependency at construction time
            raise MissingDependencyException(self.__class__.__name__,
                                             "tidy")
else:
    class HtmlTidyPreParser(PreParser):
        """Uses TidyLib to turn HTML into XHTML for parsing."""
        def process_document(self, session, doc):
            d = tidy.parseString(doc.get_raw(session),
                                 output_xhtml=1,
                                 add_xml_decl=0,
                                 tidy_mark=0,
                                 indent=0)
            return StringDocument(str(d), self.id, doc.processHistory,
                                  mimeType=doc.mimeType, parent=doc.parent,
                                  filename=doc.filename)
# --- Not Quite Xml PreParsers ---
class SgmlPreParser(PreParser):
    """ Convert SGML into XML """
    entities = {}
    emptyTags = []
    doctype_re = None
    attr_re = None
    elem_re = None
    amp_re = None
    inMimeType = "text/sgml"
    outMimeType = "text/xml"

    _possibleSettings = {
        'emptyElements': {
            'docs': '''\
Space separated list of empty elements in the SGML to turn into empty XML
elements.'''
        }
    }

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        self.doctype_re = re.compile(r'<!DOCTYPE\s+?(.+?)["\'](.+?)["\']>')
        self.attr_re = re.compile(
            r' ([a-zA-Z0-9_]+)[ ]*=[ ]*([-:_.a-zA-Z0-9]+)([ >])'
        )
        self.pi_re = re.compile(r"<\?(.*?)\?>")
        self.elem_re = re.compile(r'(<[/]?)([a-zA-Z0-9_]+)')
        self.amp_re = re.compile(r'&(\s)')
        taglist = self.get_setting(session, 'emptyElements')
        if taglist:
            self.emptyTags = taglist.split()

    def _loneAmpersand(self, match):
        # Escape an unencoded ampersand followed by whitespace.
        # (Reconstructed: entity-decoding damage had reduced this to a
        # no-op '&%s'.)
        return '&amp;%s' % match.group(1)

    def _lowerElement(self, match):
        # Make all tags lowercase
        return "%s%s" % (match.group(1), match.group(2).lower())

    def _attributeFix(self, match):
        # Fix messy attribute values
        # - lowercase attribute names
        # - remove spurious whitespace
        # - quote unquoted values
        return ' %s="%s"%s' % (match.group(1).lower(),
                               match.group(2),
                               match.group(3))

    def _emptyElement(self, match):
        # Make empty elements self-closing
        return "<%s/>" % (match.group(1))

    def process_document(self, session, doc):
        """Normalize SGML text into well-formed XML."""
        txt = doc.get_raw(session)
        txt = txt.replace('\n', ' ')
        txt = txt.replace('\r', ' ')
        # Replace control-character entities (tab through CR) with spaces
        for x in range(9, 14):
            txt = txt.replace('&#%d;' % (x), ' ')
        txt = self.doctype_re.sub('', txt)
        for e in self.entities.keys():
            txt = txt.replace("&%s;" % (e), self.entities[e])
        txt = self.amp_re.sub(self._loneAmpersand, txt)
        # Escape '&' immediately before a tag open (reconstructed no-op)
        txt = txt.replace('&<', '&amp;<')
        txt = self.attr_re.sub(self._attributeFix, txt)
        txt = self.elem_re.sub(self._lowerElement, txt)
        for t in self.emptyTags:
            empty_re = re.compile('<(%s( [^>/]+)?)[\s/]*>' % t)
            txt = empty_re.sub(self._emptyElement, txt)
        # strip processing instructions.
        txt = self.pi_re.sub('', txt)
        return StringDocument(txt, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
class AmpPreParser(PreParser):
    """Escape lone ampersands in otherwise XML text."""
    entities = {}

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        # '&' plus a run of non-space/non-';' chars, ended by whitespace
        # or end of string
        self.amp_re = re.compile(r'&([^\s;]*)(\s|$)')
        self.entities = {}

    def _loneAmpersand(self, match):
        # Escape the unencoded ampersand; the trailing space stands in for
        # the whitespace consumed by the match.  (Reconstructed: entity
        # decoding damage had reduced '&amp;' to a no-op '&'.)
        return '&amp;%s ' % match.group(1)

    def process_document(self, session, doc):
        """Replace known entities, then escape remaining lone ampersands."""
        txt = doc.get_raw(session)
        for e in self.entities.keys():
            txt = txt.replace("&%s;" % (e), self.entities[e])
        txt = self.amp_re.sub(self._loneAmpersand, txt)
        return StringDocument(txt, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
# --- MARC PreParsers ---
class MarcToXmlPreParser(PreParser):
    """ Convert MARC into MARCXML """
    inMimeType = "application/marc"
    outMimeType = "text/xml"

    def process_document(self, session, doc):
        """Parse the raw MARC data and serialize it as MARCXML."""
        data = doc.get_raw(session)
        m = MARC(data)
        return StringDocument(m.toMARCXML(), self.id, doc.processHistory,
                              mimeType='text/xml', parent=doc.parent,
                              filename=doc.filename)
class MarcToSgmlPreParser(PreParser):
    """ Convert MARC into Cheshire2's MarcSgml """
    inMimeType = "application/marc"
    outMimeType = "text/sgml"

    def process_document(self, session, doc):
        """Parse the raw MARC data and serialize it as MarcSgml."""
        data = doc.get_raw(session)
        m = MARC(data)
        return StringDocument(m.toSGML(), self.id, doc.processHistory,
                              mimeType='text/sgml', parent=doc.parent,
                              filename=doc.filename)
# --- Raw Text PreParsers ---
class TxtToXmlPreParser(PreParser):
    """Minimally wrap text in <data> XML tags"""
    inMimeType = "text/plain"
    outMimeType = "text/xml"

    def process_document(self, session, doc):
        """XML-escape the text and wrap it in a <data> element."""
        txt = doc.get_raw(session)
        txt = escape(txt)
        data = "<data>{0}</data>".format(txt)
        return StringDocument(data, self.id, doc.processHistory,
                              mimeType='text/xml', parent=doc.parent,
                              filename=doc.filename)
class RstToXmlPreParser(PreParser):
    """Convert reStructuredText into Docutils-native XML."""

    inMimeType = "text/prs.fallenstein.rst"
    outMimeType = "application/xml"

    def process_document(self, session, doc):
        # Delegate the actual conversion to docutils
        converted = publish_string(doc.get_raw(session), writer_name="xml")
        return StringDocument(converted, self.id, doc.processHistory,
                              mimeType=self.outMimeType, parent=doc.parent,
                              filename=doc.filename)
# --- Compression PreParsers ---
class PicklePreParser(PreParser):
    """Serialize Document content using Python pickle.

    NOTE: pickling serializes rather than compresses; the original
    docstring's "Compress" was misleading.
    """

    def process_document(self, session, doc):
        data = doc.get_raw(session)
        string = pickle.dumps(data)
        return StringDocument(string, self.id, doc.processHistory,
                              mimeType='text/pickle', parent=doc.parent,
                              filename=doc.filename)
class UnpicklePreParser(PreParser):
    """Deserialize Document content using Python pickle.

    SECURITY: pickle.loads executes arbitrary code from the stream; only
    use this PreParser on Documents produced by a trusted PicklePreParser.
    """

    def process_document(self, session, doc):
        data = doc.get_raw(session)
        string = pickle.loads(data)
        return StringDocument(string, self.id, doc.processHistory,
                              mimeType='text/pickle', parent=doc.parent,
                              filename=doc.filename)
try:
    import gzip
except ImportError:
    # Gracefully degrade functionality
    class GzipPreParser(PreParser):
        """Gzip a not-gzipped document."""
        def __init__(self, session, config, parent):
            raise MissingDependencyException(self.__class__.__name__,
                                             "gzip")

    class GunzipPreParser(PreParser):
        """Gunzip a gzipped document."""
        def __init__(self, session, config, parent):
            raise MissingDependencyException(self.__class__.__name__,
                                             "gzip")
else:
    class GzipPreParser(PreParser):
        """Gzip a not-gzipped document."""
        inMimeType = ""
        outMimeType = ""

        def __init__(self, session, config, parent):
            PreParser.__init__(self, session, config, parent)
            # Level 1 = fastest compression; override via 'compressLevel'
            self.compressLevel = self.get_setting(session, "compressLevel", 1)

        def process_document(self, session, doc):
            """Return a new Document with gzip-compressed content."""
            outDoc = StringIO.StringIO()
            zfile = gzip.GzipFile(mode='wb', fileobj=outDoc,
                                  compresslevel=self.compressLevel)
            zfile.write(doc.get_raw(session))
            zfile.close()
            # getvalue() replaces the previous tell/seek/read dance
            data = outDoc.getvalue()
            outDoc.close()
            return StringDocument(data, self.id, doc.processHistory,
                                  parent=doc.parent, filename=doc.filename)

    class GunzipPreParser(PreParser):
        """Gunzip a gzipped document."""
        inMimeType = ""
        outMimeType = ""

        def process_document(self, session, doc):
            """Return a new Document with the gzip content decompressed."""
            buff = StringIO.StringIO(doc.get_raw(session))
            zfile = gzip.GzipFile(mode='rb', fileobj=buff)
            data = zfile.read()
            zfile.close()
            buff.close()
            del zfile
            del buff
            return StringDocument(data, self.id, doc.processHistory,
                                  parent=doc.parent, filename=doc.filename)
try:
    import bz2
except ImportError:
    # Gracefully degrade: the class exists but refuses construction
    class Bzip2PreParser(PreParser):
        """Unzip a bz2 zipped document."""
        def __init__(self, session, config, parent):
            raise MissingDependencyException(self.__class__.__name__,
                                             "bzip2")
else:
    class Bzip2PreParser(PreParser):
        """Unzip a bz2 zipped document."""
        def process_document(self, session, doc):
            # Decompress in one shot and wrap in a fresh Document
            decompressed = bz2.decompress(doc.get_raw(session))
            return StringDocument(decompressed, self.id, doc.processHistory,
                                  parent=doc.parent, filename=doc.filename)
class B64EncodePreParser(PreParser):
    """Encode document in Base64."""

    def process_document(self, session, doc):
        data = doc.get_raw(session)
        new = b64encode(data)
        return StringDocument(new, self.id, doc.processHistory,
                              parent=doc.parent, filename=doc.filename)
class B64DecodePreParser(PreParser):
    """Decode document from Base64."""

    def process_document(self, session, doc):
        data = doc.get_raw(session)
        new = b64decode(data)
        return StringDocument(new, self.id, doc.processHistory,
                              parent=doc.parent, filename=doc.filename)
class LZ4CompressPreParser(PreParser):
    """Compress data using the lz4 algorithm."""

    def process_document(self, session, doc):
        # Compress the raw content and return it in a new Document
        compressed = lz4.compress(doc.get_raw(session))
        return StringDocument(compressed, self.id, doc.processHistory,
                              parent=doc.parent, filename=doc.filename)
class LZ4DecompressPreParser(PreParser):
    """Decompress lz4 compressed data."""

    def process_document(self, session, doc):
        # Expand the lz4 payload and return it in a new Document
        expanded = lz4.decompress(doc.get_raw(session))
        return StringDocument(expanded, self.id, doc.processHistory,
                              parent=doc.parent, filename=doc.filename)
# --- Nasty OpenOffice PreParser ---
class UrlPreParser(PreParser):
    """Abstract Base Class for PreParsers that use OpenOffice.

    DEPRECATED: see cheshire3.formats sub-package instead
    """

    _possiblePaths = {
        'remoteUrl': {
            'docs': 'URL at which the OpenOffice handler is listening'
        }
    }

    def _post_multipart(self, host, selector, fields, files):
        # POST a multipart/form-data request and return the response body.
        content_type, body = self._encode_multipart_formdata(fields, files)
        h = httplib.HTTPConnection(host)
        headers = {'content-type': content_type}
        h.request('POST', selector, body, headers)
        resp = h.getresponse()
        return resp.read()

    def _encode_multipart_formdata(self, fields, files):
        # Hand-roll a multipart/form-data body.
        # fields: iterable of (name, value) pairs
        # files:  iterable of (name, filename, data) triples
        # Returns (content_type_header_value, body_string).
        # NOTE: the boundary is fixed, not randomized; collides if the
        # payload happens to contain it.
        BOUNDARY = '----------ThIs_Is_tHe_bouNdaRY_$'
        CRLF = '\r\n'
        L = []
        for (key, value) in fields:
            L.append('--' + BOUNDARY)
            L.append('Content-Disposition: form-data; name="%s"' % key)
            L.append('')
            L.append(value)
        for (key, filename, value) in files:
            L.append('--' + BOUNDARY)
            L.append(
                'Content-Disposition: form-data; name="%s"; filename="%s"' %
                (key, filename)
            )
            L.append('Content-Type: %s' % self._get_content_type(filename))
            L.append('')
            L.append(value)
        L.append('--' + BOUNDARY + '--')
        L.append('')
        body = CRLF.join(L)
        content_type = 'multipart/form-data; boundary=%s' % BOUNDARY
        return content_type, body

    def _get_content_type(self, filename):
        # Guess MIME type from filename; fall back to generic binary
        return mimetypes.guess_type(filename)[0] or 'application/octet-stream'

    def _send_request(self, session, data=None):
        # Upload `data` to the configured remoteUrl as a fake .doc file
        # and return the raw HTTP response body.
        url = self.get_path(session, 'remoteUrl')
        if (url[:7] == "http://"):
            url = url[7:]
        hlist = url.split('/', 1)
        host = hlist[0]
        if (len(hlist) == 2):
            selector = hlist[1]
        else:
            selector = ""
        # TODO: Remove dependency
        fields = ()
        # Filename "foo.doc" only matters for MIME guessing on the server
        files = [("file", "foo.doc", data)]
        return self._post_multipart(host, selector, fields, files)
class OpenOfficePreParser(UrlPreParser):
    """Use OpenOffice server to convert documents into OpenDocument XML """
    inMimeType = ""
    outMimeType = "text/xml"

    def process_document(self, session, doc):
        """Send the content to the remote converter; fall back to <error/>.

        The conversion is deliberately best-effort: any failure yields an
        <error/> Document rather than raising (was a bare except, now
        narrowed so SystemExit/KeyboardInterrupt pass through).
        """
        data = doc.get_raw(session)
        try:
            xml = self._send_request(session, data)
        except Exception:
            xml = "<error/>"
        return StringDocument(xml, self.id, doc.processHistory,
                              mimeType='text/xml', parent=doc.parent,
                              filename=doc.filename)
class PrintableOnlyPreParser(PreParser):
    """Replace or Strip non printable characters."""
    inMimeType = "text/*"
    outMimeType = "text/plain"

    _possibleSettings = {
        'strip': {
            'docs': """\
Should the preParser strip the characters or replace with numeric character \
entities (default)""",
            'type': int,
            'options': "0|1"
        }
    }

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        # Chars above 'z' (0x7b-0xff) get stripped or entity-encoded
        self.asciiRe = re.compile('([\x7b-\xff])')
        # Control characters not allowed in XML (NB: \x1F appears in two
        # of these ranges; harmless redundancy)
        self.nonxmlRe = re.compile('([\x00-\x08]|[\x0E-\x1F]|[\x0B\x0C\x1F])')
        self.strip = self.get_setting(session, 'strip', 0)

    def process_document(self, session, doc):
        """Strip any non printable characters."""
        data = doc.get_raw(session)
        # This is bizarre, but otherwise:
        # UnicodeDecodeError: 'ascii' codec can't decode byte ...
        # NOTE(review): these \xe2\x80\x9x sequences look like UTF-8 byte
        # runs for smart quotes/dashes; applying them to a unicode string
        # only matches mis-decoded data -- confirm against real input.
        if isinstance(data, unicode):
            data = data.replace(u"\xe2\x80\x9c", u'"')
            data = data.replace(u"\xe2\x80\x9d", u'"')
            data = data.replace(u"\xe2\x80\x9e", u'"')
            data = data.replace(u"\xe2\x80\x93", u'-')
            data = data.replace(u"\xe2\x80\x98", u"'")
            data = data.replace(u"\xe2\x80\x99", u"'")
            data = data.replace(u"\xe2\x80\x9a", u",")
            data = data.replace(u"\x99", u"'")
            data = data.replace(u'\xa0', u' ')
        else:
            data = data.replace("\xe2\x80\x9c", '"')
            data = data.replace("\xe2\x80\x9d", '"')
            data = data.replace("\xe2\x80\x9e", '"')
            data = data.replace("\xe2\x80\x93", '-')
            data = data.replace("\xe2\x80\x98", "'")
            data = data.replace("\xe2\x80\x99", "'")
            data = data.replace("\xe2\x80\x9a", ",")
            data = data.replace("\x99", "'")
            data = data.replace('\xa0', ' ')
        data = self.nonxmlRe.sub(' ', data)
        if self.strip:
            new = self.asciiRe.sub('', data)
        else:
            # Replace each high character with a numeric character entity
            fn = lambda x: "&#%s;" % ord(x.group(1))
            new = self.asciiRe.sub(fn, data)
        return StringDocument(new, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
class CharacterEntityPreParser(PreParser):
    """Change named and broken entities to numbered.

    Transform latin-1 and broken character entities into numeric character
    entities. eg
    &something; --> &#123;
    """

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        # Entities written without '#', e.g. &123;
        self.numericalEntRe = re.compile(r'&(\d+);')
        # Made-up fraction entities, e.g. &frac34;
        self.fractionRe = re.compile(r'&frac(\d)(\d);')
        # Numeric entities for ASCII 0-31 (invalid in XML)
        self.invalidRe = re.compile(r'&#(\d|[0-2]\d|3[01]);')
        # First codepoint for the latin-1 entity list below
        self.start = 160
        # NOTE: the original dict literal contained duplicate keys for
        # rdquo/lsquo/rsquo; the later '#34' values won at runtime, so
        # only those are kept here (behavior unchanged).
        self.otherEntities = {
            "quot": '#34',
            "amp": '#38',
            "lt": '#60',
            "gt": '#62',
            "trade": '#8482',
            "OElig": '#338',
            "oelig": '#339',
            "Scaron": '#352',
            "scaron": '#353',
            "Yuml": '#376',
            "circ": '#710',
            "tilde": '#732',
            "ensp": '#8194',
            "emsp": '#8195',
            "thinsp": '#8201',
            "zwnj": '#8204',
            "zwj": '#8205',
            "lrm": '#8206',
            "rlm": '#8207',
            "ndash": '#8211',
            "mdash": '#8212',
            "sbquo": '#8218',
            "ldquo": '#8220',
            "bdquo": '#8222',
            "dagger": '#8224',
            "Dagger": '#8225',
            "permil": '#8240',
            "lsaquo": '#8249',
            "rsaquo": '#8250',
            "euro": '#8364',
            "rdquo": '#34',
            "lsquo": '#34',
            "rsquo": '#34',
            "half": '#189',
            "ast": '#8727'
        }
        # Entities representable directly with plain characters
        self.inane = {
            "apos": "'",
            "hellip": '...',
            "ldquo": '',
            "lsqb": '[',
            "rsqb": ']',
            "sol": '\\',
            "commat": '@',
            "plus": '+',
            "percnt": '%'
        }
        # Common misspellings fixed before the main passes
        self.preEntities = {
            "OUML;": "Ouml",
            "UUML": "Uuml",
            "AELIG": "AElig",
            "Aelig": "AElig"
        }
        # Latin-1 entity names in codepoint order starting at self.start
        self.entities = ['nbsp', 'iexcl', 'cent', 'pound', 'curren', 'yen',
                         'brvbar', 'sect', 'uml', 'copy', 'ordf', 'laquo',
                         'not', 'shy', 'reg', 'macr', 'deg', 'plusmn',
                         'sup2', 'sup3', 'acute', 'micro', 'para', 'middot',
                         'cedil', 'sup1', 'ordm', 'raquo', 'frac14', 'frac12',
                         'frac34', 'iquest', 'Agrave', 'Aacute', 'Acirc',
                         'Atilde', 'Auml', 'Aring', 'AElig', 'Ccedil',
                         'Egrave', 'Eacute', 'Ecirc', 'Euml', 'Igrave',
                         'Iacute', 'Icirc', 'Iuml', 'ETH', 'Ntilde', 'Ograve',
                         'Oacute', 'Ocirc', 'Otilde', 'Ouml', 'times',
                         'Oslash', 'Ugrave', 'Uacute', 'Ucirc', 'Uuml',
                         'Yacute', 'THORN', 'szlig', 'agrave', 'aacute',
                         'acirc', 'atilde', 'auml', 'aring', 'aelig',
                         'ccedil', 'egrave', 'eacute', 'ecirc', 'euml',
                         'igrave', 'iacute', 'icirc', 'iuml', 'eth', 'ntilde',
                         'ograve', 'oacute', 'ocirc', 'otilde', 'ouml',
                         'divide', 'oslash', 'ugrave', 'uacute', 'ucirc',
                         'uuml', 'yacute', 'thorn', 'yuml']

    def process_document(self, session, doc):
        """Normalize character entities in the data to numeric form."""
        txt = doc.get_raw(session)
        # Replace entities that can be represented with simple chars
        for (fromEnt, toEnt) in self.inane.iteritems():
            txt = txt.replace("&%s;" % fromEnt, toEnt)
        # Fix some common mistakes
        for (fromEnt, toEnt) in self.preEntities.iteritems():
            txt = txt.replace("&%s;" % fromEnt, "&%s;" % toEnt)
        # Fix straight forward entites
        for (s, enty) in enumerate(self.entities):
            txt = txt.replace("&%s;" % enty, "&#%s;" % (self.start + s))
        # Fix additional random entities
        for (fent, totxt) in self.otherEntities.iteritems():
            txt = txt.replace("&%s;" % fent, "&%s;" % totxt)

        # Add missing # in &123;
        def hashed(mo):
            return '&#%s;' % mo.group(1)
        txt = self.numericalEntRe.sub(hashed, txt)

        # Fix made up fraction entities using the fraction slash entity
        # &#8260; (the raw U+2044 char here was entity-decoding damage and
        # a SyntaxError in an ASCII Python 2 source file)
        def fraction(mo):
            return '%s&#8260;%s' % (mo.group(1), mo.group(2))
        txt = self.fractionRe.sub(fraction, txt)
        # Kill remaining invalid character entities
        txt = self.invalidRe.sub('', txt)
        return StringDocument(txt, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
class DataChecksumPreParser(PreParser):
    """Checksum Document data and add to Document metadata."""

    _possibleSettings = {
        'sumType': {
            'docs': "Type of checkSum to carry out.",
            'type': str,
            'default': 'md5'
        }
    }

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        self.sumType = self.get_setting(session, 'sumType', 'md5')
        # Validate the requested algorithm up front
        try:
            hashlib.new(self.sumType)
        except ValueError as e:
            raise ConfigFileException(str(e))

    def process_document(self, session, doc):
        """Record a checksum of the raw data in doc.metadata; return doc."""
        hasher = hashlib.new(self.sumType)
        hasher.update(doc.get_raw(session))
        entry = {
            self.sumType: {
                'hexdigest': hasher.hexdigest(),
                'analysisDateTime': time.strftime('%Y-%m-%dT%H:%M:%S%Z')
            }
        }
        checksums = doc.metadata.setdefault('checksum', {})
        checksums.update(entry)
        doc.processHistory.append(self.id)
        return doc
class METSWrappingPreParser(TypedPreParser):
    """PreParser to wrap any Document content in METS XML."""

    def __init__(self, session, config, parent):
        TypedPreParser.__init__(self, session, config, parent)
        # Over-ride if missing outgoing mime-type
        if not self.outMimeType:
            self.outMimeType = 'application/xml'

    def _get_metsWrapper(self, doc):
        # Get a generic METS wrapper for the given Document.
        # Uses the module-level METS ElementMaker (defined at the bottom
        # of this module; resolved at call time, so the order is safe).
        # Find/Generate identifiers and labels
        objid = gen_uuid()
        # Set up METS root and header
        mets = METS.mets(
            {'ID': '/'.join([objid, 'mets']),
             'OBJID': objid,
             'TYPE': 'ZIPFILE'
             },
            METS.metsHdr(
                {'ID': '/'.join([objid, 'metsHdr']),
                 'CREATEDDATE': time.strftime('%Y-%m-%dT%H:%M:%S%Z')
                 },
                METS.agent(
                    {'ROLE': "CREATOR",
                     'TYPE': "OTHER",
                     'OTHERTYPE': 'SOFTWARE'
                     },
                    METS.name("Cheshire3"),
                    METS.note(
                        "METS instance was created by a Cheshire3 object"
                        " of type {0} identified as {1}"
                        "".format(type(self).__name__, self.id)
                    )
                )
            ),
            METS.dmdSec(),
            METS.amdSec(),
            METS.fileSec(
                METS.fileGrp({'ID': '/'.join([objid, 'fileGrp', '0001'])})
            )
        )
        # Set a human readable label if possible
        if doc.filename:
            mets.set("LABEL", os.path.abspath(doc.filename))
        elif doc.id:
            mets.set("LABEL", doc.id)
        return mets

    def _get_metsFile(self, identifier, rawdata, size=0, mimeType=""):
        # Get a METS file element for the given data.
        # rawdata is embedded either as inline XML (if it parses) or as
        # Base64-encoded binary content.
        file_ = METS.file({'ID': identifier,
                           }
                          )
        # Create a METS FContent element
        FContent = METS.FContent()
        file_.append(FContent)
        # Try to set size; fall back to the length of the raw data
        if size:
            file_.attrib["SIZE"] = str(size)
        else:
            file_.attrib["SIZE"] = str(len(rawdata))
        # Attempt to add the MIME-Type
        if mimeType == "text/xml":
            # Fix broken MIME-Type
            file_.attrib['MIMETYPE'] = 'application/xml'
        elif mimeType:
            file_.attrib['MIMETYPE'] = mimeType
        # Add the content as either XML or binary (Base 64) data
        try:
            # Attempt to parse file content as XML
            xmldata = etree.fromstring(rawdata)
        except etree.XMLSyntaxError:
            # Encode as Base64
            FContent.append(METS.binData(b64encode(rawdata)))
        else:
            FContent.append(METS.xmlData(xmldata))
        return file_

    def process_document(self, session, doc):
        """Wrap the Document content in a METS envelope and return it."""
        # NOTE(review): the `global` declaration is unnecessary for a
        # read-only reference, but harmless
        global METS_NAMESPACES
        mets = self._get_metsWrapper(doc)
        objid = mets.get("OBJID")
        # Get the fileSec element
        fileGrp = mets.xpath('/mets:mets/mets:fileSec/mets:fileGrp[1]',
                             namespaces=METS_NAMESPACES)[0]
        file_ = self._get_metsFile(
            '/'.join([objid,
                      mets.attrib.get("LABEL", "file0001")
                      ]),
            doc.get_raw(session),
            doc.byteCount,
            doc.mimeType
        )
        # Append the file element to fileGrp
        fileGrp.append(file_)
        # Update last modification date
        mets.attrib['LASTMODDATE'] = time.strftime('%Y-%m-%dT%H:%M:%S%Z')
        # Serialize METS
        data = etree.tostring(mets, pretty_print=True)
        # Return a Document
        return StringDocument(
            data,
            self.id,
            doc.processHistory,
            self.outMimeType,
            parent=doc.parent,
            filename=doc.filename,
            byteCount=len(data),
            byteOffset=0
        )
class ZIPToMETSPreParser(METSWrappingPreParser):
    """PreParser to process a ZIP file to METS XML.

    As Office Open XML format and OpenDocument format Documents are based on
    ZIP files, this PreParser can also be used to unpack them, and wrap their
    component parts in METS.

    Office Open XML (a.k.a. OpenXML, OOXML) is the name for ECMA 376 office
    file formats used by default in Microsoft Office 2007 onwards (.docx,
    .xlsx , .pptx etc.) It is available as an import/export format in
    LibreOffice, OpenOffice >= 3.2, Google Docs and more.
    """

    def process_document(self, session, doc):
        """Unpack the ZIP content; wrap each member file in METS XML."""
        # NOTE(review): `global` is unnecessary for read-only access
        global METS_NAMESPACES
        mets = self._get_metsWrapper(doc)
        objid = mets.get("OBJID")
        # Get the fileSec element
        fileGrp = mets.xpath('/mets:mets/mets:fileSec/mets:fileGrp[1]',
                             namespaces=METS_NAMESPACES)[0]
        # Make raw data of incoming document file-like
        stringio = StringIO.StringIO(doc.get_raw(session))
        # Read file-like object as a ZIP file
        with ZipFile(stringio, 'r') as zf:
            # Iterate through the zipped files
            for zipinfo in zf.infolist():
                # Attempt to get the MIME-Type
                mts = mimetypes.guess_type(zipinfo.filename)
                if mts and mts[0]:
                    mimeType = mts[0]
                else:
                    mimeType = ""
                # NOTE(review): file_size is passed as str here, whereas
                # _get_metsFile expects a number-ish size (it applies
                # str() again); result is the same SIZE attribute
                file_ = self._get_metsFile(
                    '/'.join([objid, zipinfo.filename]),
                    zf.read(zipinfo),
                    str(zipinfo.file_size),
                    mimeType
                )
                # Append the file element to fileGrp
                fileGrp.append(file_)
        # Update last modification date
        mets.attrib['LASTMODDATE'] = time.strftime('%Y-%m-%dT%H:%M:%S%Z')
        # Serialize METS
        data = etree.tostring(mets, pretty_print=True)
        # Return a Document
        return StringDocument(
            data,
            self.id,
            doc.processHistory,
            self.outMimeType,
            parent=doc.parent,
            filename=doc.filename,
            byteCount=len(data),
            byteOffset=0
        )
# Set up ElementMaker for METS and XLink namespaces
# NOTE: defined at module bottom; the METS* PreParser classes above only
# reference these names at call time, so definition order is safe.
METS_NAMESPACES = {'mets': "http://www.loc.gov/METS/",
                   'xlink': "http://www.w3.org/1999/xlink"
                   }
METS = ElementMaker(namespace=METS_NAMESPACES['mets'],
                    nsmap=METS_NAMESPACES
                    )