Source code for ocrd.processor.builtin.dummy_processor
# pylint: disable=missing-module-docstring,invalid-name
from os.path import join
from typing import Optional
import click
from ocrd import Processor
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
from ocrd.processor.ocrd_page_result import OcrdPageResult
from ocrd_models.ocrd_file import OcrdFileType
from ocrd_models.ocrd_page import OcrdPage, to_xml
from ocrd_utils import (
make_file_id,
MIME_TO_EXT,
MIMETYPE_PAGE,
)
from ocrd_modelfactory import page_from_file
[docs]
class DummyProcessor(Processor):
    """
    Bare-bones processor that creates PAGE-XML and optionally copies files
    from the input group to the output group.
    """

    def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
        """
        Wrap the first input PAGE, unmodified, in an ``OcrdPageResult``.

        Only ``input_pcgts[0]`` is used, and it must be set;
        ``page_id`` is accepted but not used.
        """
        assert input_pcgts[0]
        # nothing to do here
        return OcrdPageResult(input_pcgts[0])

    def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None:
        """
        Process the first input file of a page, optionally copying it.

        If the ``copy_files`` parameter is true and the file is not PAGE-XML,
        the file's content is added to the output file group under a new
        file ID, and a PAGE-XML file generated from that copy is added next
        to it under the same ID suffixed with ``_PAGE``. In every other case
        the file is handed to the base class implementation.

        Only ``input_files[0]`` is used; it must be set and must have a
        local filename.
        """
        input_file = input_files[0]
        assert input_file
        assert input_file.local_filename
        is_page = input_file.mimetype == MIMETYPE_PAGE
        if self.parameter['copy_files'] and not is_page:
            # we need to mimic the actual copying in addition to the PAGE boilerplate
            file_id = make_file_id(input_file, self.output_file_grp)
            # unknown MIME types get no filename extension
            ext = MIME_TO_EXT.get(input_file.mimetype, '')
            local_filename = join(self.output_file_grp, file_id + ext)
            self.logger.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id)
            with open(input_file.local_filename, 'rb') as f:
                output_file = self.workspace.add_file(
                    file_id=file_id,
                    file_grp=self.output_file_grp,
                    page_id=input_file.pageId,
                    mimetype=input_file.mimetype,
                    local_filename=local_filename,
                    content=f.read(),
                )
            # the PAGE-XML for the copy gets its own ID, derived from the copy's ID
            file_id = file_id + '_PAGE'
            pcgts = page_from_file(output_file)
            assert isinstance(pcgts, OcrdPage)
            pcgts = self.process_page_pcgts(pcgts).pcgts
            pcgts.set_pcGtsId(file_id)
            self.add_metadata(pcgts)
            self.logger.info("Add PAGE-XML %s generated for %s", file_id, output_file)
            self.workspace.add_file(
                file_id=file_id,
                file_grp=self.output_file_grp,
                page_id=input_file.pageId,
                local_filename=join(self.output_file_grp, file_id + '.xml'),
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts),
            )
        else:
            # Choose the message by what the file actually is: a PAGE-XML
            # input also ends up here when copy_files is false, and must not
            # be reported as "not a PAGE-XML file".
            if is_page:
                self.logger.info("Not copying %s because it is a PAGE-XML file, which gets identity-transformed", input_file.local_filename)
            else:
                self.logger.info("Not copying %s because it is not a PAGE-XML file and copy_files was false", input_file.local_filename)
            # we can rely on base implementation verbatim
            super().process_page_file(input_file)

    @property
    def metadata_filename(self) -> str:
        """Path of this processor's ``ocrd-tool.json`` file."""
        return 'processor/builtin/dummy/ocrd-tool.json'

    @property
    def executable(self) -> str:
        """Name of the executable for this processor."""
        return 'ocrd-dummy'
@click.command()
@ocrd_cli_options
def cli(*args, **kwargs):
    # Command-line entry point: hands all received arguments and options,
    # together with the DummyProcessor class, to ocrd_cli_wrap_processor.
    # Documented with a comment on purpose, since click would adopt a
    # docstring as the command's help text.
    processor_class = DummyProcessor
    return ocrd_cli_wrap_processor(processor_class, *args, **kwargs)