Source code for ocrd_modelfactory

"""

Factory methods to create models for data, files, URLs.

"""
from datetime import datetime
from pathlib import Path
from typing import Tuple, Union
from yaml import safe_load, safe_dump

from PIL import Image
from lxml import etree as ET

from ocrd_utils import VERSION, MIMETYPE_PAGE, guess_media_type
from ocrd_models import OcrdExif, OcrdFile, ClientSideOcrdFile
from ocrd_models.ocrd_page import (
    PcGtsType, PageType, MetadataType,
    parse, parseEtree
)

__all__ = [
    'exif_from_filename',
    'page_from_file',
    'page_from_image',
]


def exif_from_filename(image_filename):
    """
    Create :py:class:`~ocrd_models.ocrd_exif.OcrdExif` by opening an image file with PIL and reading its metadata.

    Arguments:
        image_filename (str): Local image path name (relative to workspace).

    Returns:
        The :py:class:`~ocrd_models.ocrd_exif.OcrdExif` constructed from the opened image.

    Raises:
        ValueError: if ``image_filename`` is ``None``.
    """
    if image_filename is None:
        # ValueError matches how the other factory functions in this module
        # report bad arguments; it is still an ``Exception`` subclass, so
        # callers catching ``Exception`` are unaffected.
        raise ValueError("Must pass 'image_filename' to 'exif_from_filename'")
    # The context manager closes the underlying file handle as soon as the
    # metadata object has been built.
    with Image.open(image_filename) as pil_img:
        return OcrdExif(pil_img)
def page_from_image(input_file, with_tree=False):
    """
    Create :py:class:`~ocrd_models.ocrd_page.OcrdPage` from an
    :py:class:`~ocrd_models.ocrd_file.OcrdFile` representing an image
    (i.e. should have ``@mimetype`` starting with ``image/``).

    Arguments:
        input_file (:py:class:`~ocrd_models.ocrd_file.OcrdFile`): file to open \
            and produce a PAGE DOM for
    Keyword arguments:
        with_tree (boolean): whether to return XML node tree, element-node mapping \
            and reverse mapping, too (cf. :py:func:`ocrd_models.ocrd_page.parseEtree`)

    Returns:
        The generated ``PcGtsType`` object, or, if ``with_tree`` is true, the tuple
        ``(pcgts, etree, mapping, revmap)`` where ``mapping`` is the dict filled by
        ``pcgts.to_etree(mapping_=...)`` and ``revmap`` is that dict inverted.

    Raises:
        ValueError: if ``input_file.local_filename`` is empty or unset.
        FileNotFoundError: if ``input_file.local_filename`` does not exist on disk.
    """
    if not input_file.local_filename:
        raise ValueError("input_file must have 'local_filename' property")
    if not Path(input_file.local_filename).exists():
        raise FileNotFoundError("File not found: '%s' (%s)" % (input_file.local_filename, input_file))
    exif = exif_from_filename(input_file.local_filename)
    # Use a single timestamp so Created and LastChange are identical.
    now = datetime.now()
    pcgts = PcGtsType(
        Metadata=MetadataType(
            Creator="OCR-D/core %s" % VERSION,
            Created=now,
            LastChange=now
        ),
        Page=PageType(
            imageWidth=exif.width,
            imageHeight=exif.height,
            # XXX brittle
            # local_filename is guaranteed to be set by the guard above, so no
            # fallback to input_file.url is needed here.
            imageFilename=str(input_file.local_filename)
        ),
        pcGtsId=input_file.ID
    )
    if not with_tree:
        return pcgts
    # to_etree() populates ``mapping`` as a side effect (element -> node);
    # the reverse map (node -> element) is derived from it.
    mapping = {}
    etree = pcgts.to_etree(mapping_=mapping)
    revmap = {node: element for element, node in mapping.items()}
    return pcgts, etree, mapping, revmap
def page_from_file(input_file, with_tree=False) -> Union[PcGtsType, Tuple[PcGtsType, ET.Element, dict, dict]]:
    """
    Create :py:class:`~ocrd_models.ocrd_page.OcrdPage` from an
    :py:class:`~ocrd_models.ocrd_file.OcrdFile` or a file path
    representing either a PAGE-XML or an image (to generate a PAGE-XML for).

    Arguments:
        input_file (:py:class:`~ocrd_models.ocrd_file.OcrdFile` or `str`): file to open \
            and produce a PAGE DOM for
    Keyword arguments:
        with_tree (boolean): whether to return XML node tree, element-node mapping \
            and reverse mapping, too (cf. :py:func:`ocrd_models.ocrd_page.parseEtree`)

    Returns:
        For images, the result of :py:func:`page_from_image`; for PAGE-XML, the result of
        :py:func:`ocrd_models.ocrd_page.parse` (or ``parseEtree`` if ``with_tree`` is true).

    Raises:
        ValueError: if ``input_file`` has no ``local_filename``, or if its mimetype is
            missing or neither an image type nor PAGE-XML.
        FileNotFoundError: if ``local_filename`` does not exist on disk.
    """
    if not isinstance(input_file, (OcrdFile, ClientSideOcrdFile)):
        # A bare path was passed: guess its media type (generic XML is treated
        # as PAGE) and wrap it in a throw-away OcrdFile.
        mimetype = guess_media_type(input_file, application_xml=MIMETYPE_PAGE)
        input_file = OcrdFile(ET.Element("dummy"), local_filename=input_file, mimetype=mimetype)
    if not input_file.local_filename:
        raise ValueError("input_file must have 'local_filename' property")
    if not Path(input_file.local_filename).exists():
        raise FileNotFoundError("File not found: '%s' (%s)" % (input_file.local_filename, input_file))
    mimetype = input_file.mimetype
    # Guard against a missing mimetype so that it is reported through the
    # "Unsupported mimetype" ValueError below instead of an AttributeError.
    if mimetype and mimetype.startswith('image'):
        return page_from_image(input_file, with_tree=with_tree)
    if mimetype == MIMETYPE_PAGE:
        parser = parseEtree if with_tree else parse
        return parser(input_file.local_filename, silence=True)
    raise ValueError("Unsupported mimetype '%s'" % mimetype)