# Source code for ocrd_models.utils
"""
Utilities for ocrd_models
"""
from lxml import etree as ET
from ocrd_utils import getLogger
from .constants import NAMESPACES as NS
# Public names exported on a star-import of this module.
__all__ = [
'xmllint_format',
'handle_oai_response',
'is_oai_content',
'extract_mets_from_oai_content'
]
def xmllint_format(xml):
    """
    Pretty-print XML like ``xmllint`` does.

    The input is parsed with entity resolution disabled, CDATA sections
    preserved and blank text removed, then serialized again with
    indentation and prefixed with a fixed XML declaration.

    Arguments:
        xml (string): Serialized XML

    Returns:
        bytes: The pretty-printed document, UTF-8 encoded, starting with
        ``<?xml version="1.0" encoding="UTF-8"?>`` on its own line.
    """
    # Blank text is dropped while parsing; otherwise the whitespace nodes
    # already present in the input would keep lxml from re-indenting.
    parser = ET.XMLParser(resolve_entities=False, strip_cdata=False, remove_blank_text=True)
    document = ET.fromstring(xml, parser)
    # ``tostring`` already returns UTF-8 bytes, so the declaration is
    # prepended as bytes rather than decoding and encoding the whole document.
    serialized = ET.tostring(document, pretty_print=True, encoding='UTF-8')
    return b'<?xml version="1.0" encoding="UTF-8"?>\n' + serialized
def handle_oai_response(response):
    """
    In case of a valid OAI-Response, extract first METS-Entry-Data

    Arguments:
        response: Response object exposing a ``headers`` mapping and a
            ``content`` payload (presumably a ``requests.Response`` --
            confirm against callers).

    Returns:
        The result of :py:func:`extract_mets_from_oai_content` if the
        response is textual/XML and its root element is OAI-PMH;
        otherwise ``response.content`` unchanged.

    Raises:
        Exception: propagated from :py:func:`extract_mets_from_oai_content`
            when an OAI-PMH document contains no METS element.
    """
    # A response without a Content-Type header is treated as non-textual
    # instead of failing with a KeyError.
    content_type = response.headers.get('Content-Type') or ''
    if 'xml' in content_type or 'text' in content_type:
        content = response.content
        try:
            if is_oai_content(content):
                return extract_mets_from_oai_content(content)
        except ET.LxmlError as exc:
            # Textual payload that is not parseable XML: report it and fall
            # through to returning the raw content. The logger is only
            # needed on this failure path.
            log = getLogger('ocrd.models.utils.handle_oai_response')
            log.warning("textual response but no xml: %s (%s)", content, exc)
    return response.content
def is_oai_content(data):
    """
    Return True if data is an OAI-PMH request/response

    Arguments:
        data: Serialized XML to inspect.

    Returns:
        bool: Whether the tag of the document's root element ends with
        ``OAI-PMH``.

    Raises:
        lxml.etree.LxmlError: presumably when ``data`` is not parseable
            XML; :py:func:`handle_oai_response` catches this around its
            call to this function.
    """
    log = getLogger('ocrd.models.utils.is_oai_content')
    root_tag = ET.fromstring(data).tag
    # Pass the tag as a logging argument so the message is only formatted
    # when the record is actually emitted.
    log.info("response data root.tag: '%s'", root_tag)
    # The tag may carry a ``{namespace}`` prefix, hence the suffix match.
    return str(root_tag).endswith('OAI-PMH')
def extract_mets_from_oai_content(data, preamble='<?xml version="1.0" encoding="UTF-8"?>'):
    """
    Extract METS from an OAI-PMH GetRecord response

    Arguments:
        data: Serialized XML of the response.
        preamble (string): Line placed before the serialized METS element.

    Returns:
        ``data`` unchanged if the root element's tag already contains
        ``mets``; otherwise the first METS element found below the root,
        pretty-printed, prefixed with ``preamble``, UTF-8 encoded and with
        every newline converted to CRLF.

    Raises:
        Exception: if no METS element is found in the document.
    """
    root = ET.fromstring(data)
    # Already a METS document: hand the input back untouched.
    if 'mets' in root.tag:
        return data
    mets_el = root.find('.//{%s}mets' % NS['mets'])
    if mets_el is None:
        raise Exception("Missing mets-section in %s" % data)
    # Re-root the METS element in a tree of its own before serializing.
    serialized = ET.tostring(ET.ElementTree(mets_el), pretty_print=True, encoding='UTF-8')
    document = '{0}\n{1}'.format(preamble, serialized.decode('UTF-8'))
    return document.encode('UTF-8').replace(b'\n', b'\r\n')