Source code for ocrd.processor.builtin.merge_processor

# pylint: disable=missing-module-docstring,invalid-name
from typing import Optional
from itertools import count
from collections import OrderedDict as odict

import click

from ocrd import Processor, OcrdPageResult
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
from ocrd_modelfactory import page_from_file
from ocrd_models import OcrdPage
from ocrd_models.ocrd_page import (
    BorderType,
    CoordsType,
    ReadingOrderType,
    UnorderedGroupType,
)
from ocrd_utils import bbox_from_points

# Names of the PAGE elements that make up the segment hierarchy (region types,
# text lines, words and glyphs). ``rename_segments`` compares these against the
# class name of each identified element with the ``Type`` part removed, and
# renames only the elements that match.
_SEGTYPES = [
    "NoiseRegion",
    "LineDrawingRegion",
    "AdvertRegion",
    "ImageRegion",
    "ChartRegion",
    "MusicRegion",
    "GraphicRegion",
    "UnknownRegion",
    "CustomRegion",
    "SeparatorRegion",
    "MathsRegion",
    "TextRegion",
    "MapRegion",
    "ChemRegion",
    "TableRegion",
    "TextLine",
    "Word",
    "Glyph"
]



[docs]
def get_border_bbox(pcgts):
    """
    Return the bounding box of the page border of ``pcgts``.

    If the page has a ``Border``, its coordinate points are passed to
    ``bbox_from_points`` and that result is returned. Without a border,
    the full image extent ``[0, 0, imageWidth, imageHeight]`` is used.
    The caller treats the four values as min-x, min-y, max-x, max-y.
    """
    page = pcgts.Page
    border = page.Border
    if border is not None:
        return bbox_from_points(border.Coords.points)
    # no border annotated: fall back to the whole image
    return [0, 0, page.imageWidth, page.imageHeight]



[docs]
def rename_segments(pcgts, start=1):
    """
    Give all segments of ``pcgts`` new, consecutively numbered identifiers.

    Every element carrying an ``@id`` is looked up, but only elements of the
    segment hierarchy (see ``_SEGTYPES``) are renamed, in the order the
    XPath query returns them. The new identifiers have the form ``seg%011d``.
    Reading order elements which reference a renamed region get their
    ``regionRef`` updated to the new identifier. The document is modified
    in place.

    Arguments:
        pcgts: the PAGE document whose segments are to be renamed
        start (int): number assigned to the first renamed segment

    Returns:
        int: the next unused number (``start`` plus the number of renamed
        segments). Passing it as ``start`` for the next document keeps the
        identifiers unique across documents; when nothing is renamed,
        ``start`` is returned unchanged.

    Raises:
        AssertionError: if two segments share the same original identifier
    """
    rodict = pcgts.Page.get_ReadingOrderGroups()
    # get everything that has an identifier
    nodes = pcgts.xpath("//*[@id]")
    # get PAGE objects from matching etree nodes,
    # but allow only hierarchy segments
    segments = [segment for segment in map(pcgts.revmap.get, nodes)
                if segment.__class__.__name__.replace('Type', '') in _SEGTYPES]
    # TODO: or perhaps better to have each segment type named and counted differently?
    seen = set()
    for num, segment in enumerate(segments, start):
        oldname = segment.id
        assert oldname not in seen, f"duplicate segment id '{oldname}'"
        seen.add(oldname)
        newname = "seg%011d" % num
        if segment.original_tagname_.endswith('Region') and oldname in rodict:
            # update reading order
            rodict[oldname].regionRef = newname
        segment.id = newname
    # Return the next free number rather than the last one used: otherwise
    # the following document would reuse this document's last identifier,
    # and a document without segments would reset the numbering to zero.
    return start + len(segments)



[docs]
class MergeProcessor(Processor):
    # Processor which merges the PAGE segment hierarchies of all input file
    # groups of a page into a single PAGE result for the shared image.

    def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
        """
        Merge PAGE segment hierarchy elements from all input file groups.

        For each page, take the deserialised PAGE input files, skipping missing
        ones. Rename all elements of the segment hierarchy to new (clash-free)
        identifiers. Redefine the `Border` coordinates as the axis-aligned
        bounding box enclosing all input borders (using the full image for
        inputs without a border). Then add all top-level regions (along with
        everything nested inside them) from all input files, and collect the
        reading order groups of all inputs under a single `UnorderedGroup`,
        in the order of input file groups.

        Return the resulting hierarchy as a new PAGE result.

        Raises:
            AssertionError: if there is no input file for this page, or if
                the input files reference different @imageFilename values
        """
        actual_pcgts = list(filter(None, input_pcgts))
        # Raise explicitly instead of using `assert`, so that the checks are
        # still performed when Python runs with optimisation (-O).
        if not actual_pcgts:
            raise AssertionError("need at least one input file to merge")
        image_filenames = {pcgts.Page.imageFilename for pcgts in actual_pcgts}
        if len(image_filenames) != 1:
            raise AssertionError("input files must all reference the same @imageFilename")
        # create new PAGE for image
        result = OcrdPageResult(page_from_file(actual_pcgts[0].Page.imageFilename))
        page = result.pcgts.Page
        # unify Border: smallest box containing the boxes of all inputs
        minxs, minys, maxxs, maxys = zip(*(get_border_bbox(pcgts) for pcgts in actual_pcgts))
        minx, miny = min(minxs), min(minys)
        maxx, maxy = max(maxxs), max(maxys)
        page.set_Border(
            BorderType(CoordsType(
                points=f"{minx},{miny} {maxx},{miny} {maxx},{maxy} {minx},{maxy}")))
        # rename all segments, continuing the numbering from one input to the next
        num = 1
        for pcgts in actual_pcgts:
            num = rename_segments(pcgts, num)
        # concatenate all regions
        merged_group = UnorderedGroupType(id="merged")
        page.set_ReadingOrder(ReadingOrderType(UnorderedGroup=merged_group))
        for pcgts in actual_pcgts:
            # Only take the top-level regions: nested regions stay attached to
            # their parents, so adding them to the page as well would
            # duplicate them in the result.
            for region in pcgts.Page.get_AllRegions(depth=1):
                add_region = getattr(page, 'add_' + region.original_tagname_)
                add_region(region)
            reading_order = pcgts.Page.ReadingOrder
            if not reading_order:
                continue
            group = reading_order.OrderedGroup or reading_order.UnorderedGroup
            if group is None:
                # reading order without any group: nothing to carry over
                continue
            add_group = getattr(merged_group, 'add_' + group.original_tagname_)
            add_group(group)
        return result

    @property
    def metadata_filename(self):
        """Return the location of the ocrd-tool.json used for this processor."""
        return 'processor/builtin/dummy/ocrd-tool.json'

    @property
    def executable(self):
        """Return the name of the executable of this processor."""
        return 'ocrd-merge'



# Command-line entry point for the merge processor, registered as a click
# command with the options declared by ``ocrd_cli_options``.
@click.command()
@ocrd_cli_options
def cli(*args, **kwargs):
    # Forward everything received from click, together with the processor
    # class, to the generic wrapper and return its result.
    return ocrd_cli_wrap_processor(MergeProcessor, *args, **kwargs)
Source code for ocrd.processor.builtin.merge_processor

OCR-D/core

Navigation

Related Topics