Source code for ocrd.processor.builtin.shell_processor

# pylint: disable=missing-module-docstring,invalid-name
from typing import List, Optional, get_args
import os
import subprocess
from tempfile import TemporaryDirectory

import click

from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
from ocrd import Processor
from ocrd.processor.base import MissingInputFile
from ocrd_models import OcrdPage, OcrdFileType
from ocrd_models.ocrd_page import to_xml
from ocrd_modelfactory import page_from_file
from ocrd_utils import config, make_file_id, MIMETYPE_PAGE


[docs] class ShellProcessor(Processor):
[docs] def setup(self): command = self.parameter['command'] if '@INFILE' not in command: raise Exception("command parameter requires @INFILE pattern") if '@OUTFILE' not in command: raise Exception("command parameter requires @OUTFILE pattern")
[docs] def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None: """ Process PAGE files via arbitrary command line on the shell. \b For each selected physical page of the workspace, pass ``command`` to the shell, replacing: - the string ``@INFILE`` with the PAGE input file path, - the string ``@OUTFILE`` with the PAGE output file path. Modify the resulting PAGE output file with our new `@pcGtsId` and metadata. """ input_paths: List[str] = [""] * len(input_files) input_pos = next(i for i, input_file in enumerate(input_files) if input_file is not None) page_id = input_files[input_pos].pageId self._base_logger.info("processing page %s", page_id) for i, input_file in enumerate(input_files): grp = self.input_file_grp.split(',')[i] if input_file is None: self._base_logger.debug(f"ignoring missing file for input fileGrp {grp} for page {page_id}") continue assert isinstance(input_file, get_args(OcrdFileType)) if not input_file.local_filename: self._base_logger.error(f'No local file exists for page {page_id} in file group {grp}') if config.OCRD_MISSING_INPUT == 'ABORT': raise MissingInputFile(grp, page_id, input_file.mimetype) continue self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}") if os.path.exists(input_file.local_filename): input_paths[i] = input_file.local_filename else: self._base_logger.error(f"non-existing local file for input fileGrp {grp} for page {page_id}") if not any(input_paths): self._base_logger.warning(f'skipping page {page_id}') return output_file_id = make_file_id(input_files[input_pos], self.output_file_grp) if input_files[input_pos].fileGrp == self.output_file_grp: # input=output fileGrp: re-use ID exactly output_file_id = input_files[input_pos].ID output_file = next(self.workspace.mets.find_files(ID=output_file_id), None) if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE': # short-cut avoiding useless computation: raise FileExistsError( f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set" ) command = self.parameter['command'] with TemporaryDirectory(suffix=page_id) as tmpdir: out_path = os.path.join(tmpdir, output_file_id + ".xml") # remove quotation around filename patterns, if any command = command.replace('"@INFILE"', '@INFILE').replace('"@OUTFILE"', '@OUTFILE') command = command.replace("'@INFILE'", '@INFILE').replace("'@OUTFILE'", '@OUTFILE') # replace filename patterns with actual paths, quoted for in_path in input_paths: command = command.replace('@INFILE', '"' + in_path + '"', 1) command = command.replace('@OUTFILE', '"' + out_path + '"') # execute command pattern self.logger.debug("Running command: '%s'", command) # pylint: disable=subprocess-run-check result = subprocess.run(command, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) self.logger.debug("Command for %s returned: %d", page_id, result.returncode) if result.stdout: self.logger.info("Command for %s stdout: %s", page_id, result.stdout) if result.stderr: self.logger.warning("Command for %s stderr: %s", page_id, result.stderr) if result.returncode != 0: self.logger.error("Command for %s failed", page_id) return try: result = page_from_file(out_path) assert isinstance(result, OcrdPage) except ValueError as err: # not PAGE and not an image to generate PAGE for self._base_logger.error(f"non-PAGE output for page {page_id}: {err}") return result.set_pcGtsId(output_file_id) self.add_metadata(result) self.workspace.add_file( file_id=output_file_id, file_grp=self.output_file_grp, page_id=page_id, local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'), mimetype=MIMETYPE_PAGE, content=to_xml(result), )
@property def metadata_filename(self): return 'processor/builtin/dummy/ocrd-tool.json' @property def executable(self): return 'ocrd-command'
@click.command() @ocrd_cli_options def cli(*args, **kwargs): return ocrd_cli_wrap_processor(ShellProcessor, *args, **kwargs)