# Source code for ocrd_modelfactory

"""

Factory methods to create models for data, files, URLs.

"""
from datetime import datetime
from pathlib import Path
from typing import Tuple, Union
from yaml import safe_load, safe_dump

from PIL import Image
from lxml import etree as ET

from ocrd_utils import VERSION, MIMETYPE_PAGE, guess_media_type
from ocrd_models import OcrdExif, OcrdFile, ClientSideOcrdFile
from ocrd_models.ocrd_page import (
    OcrdPage, PcGtsType, PageType, MetadataType,
    parse, parseEtree
)
from ocrd_utils.deprecate import deprecation_warning

__all__ = [
    'exif_from_filename',
    'page_from_file',
    'page_from_image',
]


def exif_from_filename(image_filename):
    """
    Create :py:class:`~ocrd_models.ocrd_exif.OcrdExif` by opening an image
    file with PIL and reading its metadata.

    Arguments:
        image_filename (str): Local image path name (relative to workspace).

    Returns:
        The :py:class:`~ocrd_models.ocrd_exif.OcrdExif` built from the opened image.

    Raises:
        ValueError: if ``image_filename`` is ``None``.
    """
    if image_filename is None:
        # ValueError (a subclass of Exception) matches what the sibling
        # factory functions raise for invalid arguments, and keeps any
        # caller that catches Exception working.
        raise ValueError("Must pass 'image_filename' to 'exif_from_filename'")
    # The context manager closes the image file handle once the metadata
    # object has been constructed.
    with Image.open(image_filename) as pil_img:
        return OcrdExif(pil_img)
def page_from_image(input_file : Union[OcrdFile, ClientSideOcrdFile], **kwargs) -> OcrdPage:
    """
    Create :py:class:`~ocrd_models.ocrd_page.OcrdPage` from an
    :py:class:`~ocrd_models.ocrd_file.OcrdFile` representing an image
    (i.e. should have ``@mimetype`` starting with ``image/``).

    Arguments:
        input_file (:py:class:`~ocrd_models.ocrd_file.OcrdFile`): file to open
            and produce a PAGE DOM for
        **kwargs: only ``with_etree`` is looked at; it is obsolete and merely
            triggers a deprecation warning.

    Returns:
        An :py:class:`~ocrd_models.ocrd_page.OcrdPage` built from the new
        ``PcGtsType``, its etree, the mapping handed to ``to_etree`` and the
        inverse of that mapping.

    Raises:
        ValueError: if ``input_file.local_filename`` is empty or unset.
        FileNotFoundError: if ``input_file.local_filename`` does not exist.
    """
    if 'with_etree' in kwargs:
        deprecation_warning('kwarg "with_etree" is obsolete now, we always return OcrdPage including etree')
    if not input_file.local_filename:
        raise ValueError("input_file must have 'local_filename' property")
    if not Path(input_file.local_filename).exists():
        raise FileNotFoundError("File not found: '%s' (%s)" % (input_file.local_filename, input_file))
    exif = exif_from_filename(input_file.local_filename)
    # Use one timestamp so Created and LastChange are identical.
    now = datetime.now()
    pcgts = PcGtsType(
        Metadata=MetadataType(
            Creator="OCR-D/core %s" % VERSION,
            Created=now,
            LastChange=now
        ),
        Page=PageType(
            imageWidth=exif.width,
            imageHeight=exif.height,
            # XXX brittle
            # local_filename is known to be truthy here (checked above), so
            # no fallback to input_file.url is needed.
            imageFilename=str(input_file.local_filename)
        ),
        pcGtsId=input_file.ID
    )
    mapping = {}
    etree: ET._Element = pcgts.to_etree(mapping_=mapping)
    # Invert the mapping so it can be looked up by its values.
    revmap = {node: element for element, node in mapping.items()}
    return OcrdPage(pcgts, etree, mapping, revmap)
def page_from_file(input_file, **kwargs) -> OcrdPage:
    """
    Create :py:class:`~ocrd_models.ocrd_page.OcrdPage` from an
    :py:class:`~ocrd_models.ocrd_file.OcrdFile` or a file path
    representing either a PAGE-XML or an image (to generate a PAGE-XML for).

    Arguments:
        input_file (:py:class:`~ocrd_models.ocrd_file.OcrdFile` or `str`): file
            to open and produce a PAGE DOM for
        **kwargs: only ``with_etree`` is looked at; it is obsolete and merely
            triggers a deprecation warning.

    Returns:
        The :py:class:`~ocrd_models.ocrd_page.OcrdPage` for the file: newly
        generated for images, parsed from disk for PAGE-XML.

    Raises:
        ValueError: if the file has no ``local_filename`` or its mimetype is
            neither an image type nor PAGE-XML.
        FileNotFoundError: if ``local_filename`` does not exist.
    """
    if 'with_etree' in kwargs:
        deprecation_warning('kwarg "with_etree" is obsolete now, we always return OcrdPage including etree')

    # Anything that is not already a file object is treated as a path and
    # wrapped in a stand-alone OcrdFile with a guessed media type.
    if not isinstance(input_file, (OcrdFile, ClientSideOcrdFile)):
        media_type = guess_media_type(input_file, application_xml=MIMETYPE_PAGE)
        input_file = OcrdFile(ET.Element("dummy"), local_filename=input_file, mimetype=media_type)

    local_path = input_file.local_filename
    if not local_path:
        raise ValueError("input_file must have 'local_filename' property")
    if not Path(local_path).exists():
        raise FileNotFoundError("File not found: '%s' (%s)" % (local_path, input_file))

    file_type = input_file.mimetype
    if file_type.startswith('image'):
        return page_from_image(input_file)
    if file_type != MIMETYPE_PAGE:
        raise ValueError("Unsupported mimetype '%s'" % file_type)

    # The old/default gds.reverse_node_mapping is useless; since 2.39.4 the
    # exact reverse mapping (for a perfect round-trip) is available, but only
    # by passing in a dict for the parser to fill.
    reverse_map = {}
    parsed = parseEtree(local_path, reverse_mapping=reverse_map, silence=True)
    result = OcrdPage(*parsed)
    result.revmap = reverse_map
    return result