Source code for ocrd_models.constants

"""
Constants for ocrd_models.
"""
from re import Pattern
from enum import Enum, auto
from dataclasses import dataclass, field
from abc import ABC, abstractmethod
from typing import Any, List, Union
from ocrd_utils import resource_string

__all__ = [
    'IDENTIFIER_PRIORITY',
    'METS_XML_EMPTY',
    'NAMESPACES',
    'TAG_METS_AGENT',
    'TAG_METS_DIV',
    'TAG_METS_FILE',
    'TAG_METS_FILEGRP',
    'TAG_METS_FILESEC',
    'TAG_METS_FPTR',
    'TAG_METS_FLOCAT',
    'TAG_METS_METSHDR',
    'TAG_METS_NAME',
    'TAG_METS_NOTE',
    'TAG_METS_STRUCTMAP',
    'TAG_MODS_IDENTIFIER',
    'TAG_PAGE_ALTERNATIVEIMAGE',
    'TAG_PAGE_COORDS',
    'TAG_PAGE_READINGORDER',
    'TAG_PAGE_REGIONREFINDEXED',
    'TAG_PAGE_TEXTLINE',
    'TAG_PAGE_TEXTEQUIV',
    'TAG_PAGE_TEXTREGION',
    'METS_PAGE_DIV_ATTRIBUTE',
    'METS_STRUCT_DIV_ATTRIBUTE',
    'METS_DIV_ATTRIBUTE_ATOM_PATTERN',
    'METS_DIV_ATTRIBUTE_RANGE_PATTERN',
    'METS_DIV_ATTRIBUTE_REGEX_PATTERN',
    'PAGE_REGION_TYPES',
    'PAGE_ALTIMG_FEATURES',
]


IDENTIFIER_PRIORITY = ['purl', 'urn', 'doi', 'url']

METS_XML_EMPTY = resource_string(__package__, 'mets-empty.xml')

NAMESPACES = {
    'mets': "http://www.loc.gov/METS/",
    'mods': "http://www.loc.gov/mods/v3",
    'xlink': "http://www.w3.org/1999/xlink",
    'page': "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15",
    'xsl': 'http://www.w3.org/1999/XSL/Transform#',
    'ocrd': 'https://ocr-d.de',
}

TAG_METS_AGENT            = '{%s}agent' % NAMESPACES['mets']
TAG_METS_DIV              = '{%s}div' % NAMESPACES['mets']
TAG_METS_FILE             = '{%s}file' % NAMESPACES['mets']
TAG_METS_FILEGRP          = '{%s}fileGrp' % NAMESPACES['mets']
TAG_METS_FILESEC          = '{%s}fileSec' % NAMESPACES['mets']
TAG_METS_FPTR             = '{%s}fptr' % NAMESPACES['mets']
TAG_METS_FLOCAT           = '{%s}FLocat' % NAMESPACES['mets']
TAG_METS_METSHDR          = '{%s}metsHdr' % NAMESPACES['mets']
TAG_METS_NAME             = '{%s}name' % NAMESPACES['mets']
TAG_METS_NOTE             = '{%s}note' % NAMESPACES['mets']
TAG_METS_STRUCTMAP        = '{%s}structMap' % NAMESPACES['mets']

TAG_MODS_IDENTIFIER       = '{%s}identifier' % NAMESPACES['mods']

TAG_PAGE_ALTERNATIVEIMAGE = '{%s}AlternativeImage' % NAMESPACES['page']
TAG_PAGE_COORDS           = '{%s}Coords' % NAMESPACES['page']
TAG_PAGE_READINGORDER     = '{%s}ReadingOrder' % NAMESPACES['page']
TAG_PAGE_REGIONREFINDEXED = '{%s}RegionRefIndexed' % NAMESPACES['page']
TAG_PAGE_TEXTLINE         = '{%s}TextLine' % NAMESPACES['page']
TAG_PAGE_TEXTEQUIV        = '{%s}TextEquiv' % NAMESPACES['page']
TAG_PAGE_TEXTREGION       = '{%s}TextRegion' % NAMESPACES['page']

PAGE_REGION_TYPES = [
    'Advert', 'Chart', 'Chem', 'Custom', 'Graphic', 'Image',
    'LineDrawing', 'Map', 'Maths', 'Music', 'Noise',
    'Separator', 'Table', 'Text', 'Unknown'
]

PAGE_ALTIMG_FEATURES = [
    'binarized',
    'grayscale_normalized',
    'despeckled',
    'cropped',
    'deskewed',
    'rotated-90',
    'rotated-180',
    'rotated-270',
    'dewarped',
    'clipped',
]


[docs] class METS_PAGE_DIV_ATTRIBUTE(Enum): """page selection attributes of PHYSICAL mets:structMap//mets:div""" ID = auto() ORDER = auto() ORDERLABEL = auto() LABEL = auto() CONTENTIDS = auto()
[docs] @classmethod def names(cls): return [x.name for x in cls]
[docs] @classmethod def type_prefix(cls): """disambiguation prefix to use for all subtypes""" return "physical:"
[docs] def prefix(self): """disambiguation prefix to use for this attribute type""" return self.type_prefix() + self.name.lower() + ":"
[docs] class METS_STRUCT_DIV_ATTRIBUTE(Enum): """page selection attributes of LOGICAL mets:structMap//mets:div""" ID = auto() DMDID = auto() TYPE = auto() LABEL = auto()
[docs] @classmethod def names(cls): return [x.name for x in cls]
[docs] @classmethod def type_prefix(cls): """disambiguation prefix to use for all subtypes""" return "logical:"
[docs] def prefix(self): """disambiguation prefix to use for this attribute type""" return self.type_prefix() + self.name.lower() + ":"
@dataclass class METS_DIV_ATTRIBUTE_PATTERN(ABC): """page selection pattern (abstract supertype)""" expr: Any """pattern value to match a mets:div against""" attr: List[Union[METS_PAGE_DIV_ATTRIBUTE, METS_STRUCT_DIV_ATTRIBUTE]] = field( default_factory=lambda: list(METS_PAGE_DIV_ATTRIBUTE) + list(METS_STRUCT_DIV_ATTRIBUTE)) """attribute type(s) to match a mets:div for (pre-disambiguated with prefix syntax, or filled upon first match) """ has_matched: bool = field(init=False, default=False) """whether this pattern has already been matched""" def attr_prefix(self): """attribute type disambiguation prefix corresponding to the current state of disambiguation""" if self.attr == list(METS_PAGE_DIV_ATTRIBUTE) + list(METS_STRUCT_DIV_ATTRIBUTE): return "" if self.attr == list(METS_PAGE_DIV_ATTRIBUTE): return METS_PAGE_DIV_ATTRIBUTE.type_prefix() if self.attr == list(METS_STRUCT_DIV_ATTRIBUTE): return METS_STRUCT_DIV_ATTRIBUTE.type_prefix() assert len(self.attr) == 1, "unexpected type ambiguity: %s" % repr(self.attr) return self.attr[0].prefix() @abstractmethod def _matches(self, input) -> bool: return def matches(self, input) -> bool: """does the selection pattern match on the given attribute value?""" if (matched := self._matches(input)): self.has_matched = True return matched
[docs] @dataclass class METS_DIV_ATTRIBUTE_ATOM_PATTERN(METS_DIV_ATTRIBUTE_PATTERN): """page selection pattern for literal (single value) matching""" expr: str def __repr__(self): return "%s%s" % (self.attr_prefix(), self.expr) def _matches(self, input): return input == self.expr
[docs] @dataclass class METS_DIV_ATTRIBUTE_RANGE_PATTERN(METS_DIV_ATTRIBUTE_PATTERN): """page selection pattern for interval (list expansion) matching""" expr: List[str] start: str = field(init=False) """first value of the range after expansion, before matching-exhausting""" stop: str = field(init=False) """last value of the range after expansion, before matching-exhausting""" def __post_init__(self): self.start = self.expr[0] self.stop = self.expr[-1] def __repr__(self): return "%s%s..%s" % (self.attr_prefix(), self.start, self.stop) def _matches(self, input): return input in self.expr
[docs] @dataclass class METS_DIV_ATTRIBUTE_REGEX_PATTERN(METS_DIV_ATTRIBUTE_PATTERN): """page selection pattern for regular expression matching""" expr: Pattern def __repr__(self): return "%s//%s" % (self.attr_prefix(), self.expr.pattern) def _matches(self, input): return bool(self.expr.fullmatch(input))