Source code for ocrd_validators.xsd_validator

"""
Validating XML documents against XML Schema
"""

from pathlib import Path
from lxml import etree as ET

from ocrd_models import ValidationReport

from .constants import XSD_PATHS

#
# -------------------------------------------------
#

class XsdValidator():
    """
    XML Schema validator.

    One validator is created per schema URL and cached on the class, so the
    schema file is parsed and compiled only once per URL.
    """

    # Cache of already constructed validators, keyed by schema URL.
    _instances = {}

    @classmethod
    def instance(cls, schema_url):
        """
        Return the cached validator for a schema, creating it on first use.

        Args:
            schema_url (str): URI of XML schema to validate against.

        Returns:
            XsdValidator: the validator cached for ``schema_url``.
        """
        validator = cls._instances.get(schema_url)
        if validator is None:
            validator = cls._instances[schema_url] = cls(schema_url)
        return validator

    @classmethod
    def validate(cls, schema_url, doc):
        """
        Validate an XML document against a schema.

        Args:
            schema_url (str): URI of XML schema to validate against.
            doc (etree.ElementTree|str|bytes|pathlib.Path): the document
                to validate, see :py:meth:`_validate`.

        Returns:
            ValidationReport
        """
        return cls.instance(schema_url)._validate(doc) # pylint: disable=protected-access

    def __init__(self, schema_url):
        """
        Construct an XsdValidator.

        Args:
            schema_url (str): URI of XML schema to validate against.

        Raises:
            Exception: if ``schema_url`` is not a key of ``XSD_PATHS``.
        """
        if schema_url not in XSD_PATHS:
            raise Exception('XML schema not bundled with OCR-D: %s' % schema_url)
        # Open in binary mode so the XML parser picks the encoding from the
        # schema file itself; text mode would decode it with the locale's
        # default encoding first.
        with open(XSD_PATHS[schema_url], 'rb') as f:
            xmlschema_doc = ET.parse(f)
        self._xmlschema = ET.XMLSchema(xmlschema_doc)

    def _validate(self, doc):
        """
        Do the actual validation.

        Arguments:
            doc (etree.ElementTree|str|bytes|pathlib.Path): the document.
                If etree: use as-is.
                If str/bytes: parse as XML string.
                If Path: parse the file at that path.

        Returns:
            ValidationReport: one error is added per entry in the
            validation error log; no errors are added if the document
            is valid.
        """
        report = ValidationReport()
        if isinstance(doc, Path):
            doc = ET.parse(str(doc))
        elif isinstance(doc, (bytes, str)):
            doc = ET.fromstring(doc)
        try:
            self._xmlschema.assertValid(doc)
        except ET.DocumentInvalid as fail:
            for err in fail.error_log: # pylint: disable=no-member
                report.add_error("Line %s: %s" % (err.line, err.message))
        return report