"""
API for validating `OcrdPage <../ocrd_models/ocrd_models.ocrd_page.html>`_.
"""
import re
from shapely.geometry import Polygon, LineString
from shapely.validation import explain_validity
from ocrd_utils import getLogger, polygon_from_points, deprecated_alias
from ocrd_models.ocrd_page import parse
from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import (
PcGtsType,
PageType,
TextRegionType,
TextLineType,
WordType,
GlyphType,
TextEquivType
)
from ocrd_models.ocrd_page_generateds import (
RegionType,
ReadingDirectionSimpleType,
TextLineOrderSimpleType,
RegionRefType,
RegionRefIndexedType,
OrderedGroupType,
OrderedGroupIndexedType,
UnorderedGroupType,
UnorderedGroupIndexedType,
)
from ocrd_models import ValidationReport
_HIERARCHY = [
# page can contain different types of regions
(PageType, 'get_AdvertRegion', None), # pylint: disable=bad-whitespace
(PageType, 'get_ChartRegion', None), # pylint: disable=bad-whitespace
(PageType, 'get_ChemRegion', None), # pylint: disable=bad-whitespace
(PageType, 'get_CustomRegion', None), # pylint: disable=bad-whitespace
(PageType, 'get_GraphicRegion', None), # pylint: disable=bad-whitespace
(PageType, 'get_ImageRegion', None), # pylint: disable=bad-whitespace
(PageType, 'get_LineDrawingRegion', None), # pylint: disable=bad-whitespace
(PageType, 'get_MapRegion', None), # pylint: disable=bad-whitespace
(PageType, 'get_MathsRegion', None), # pylint: disable=bad-whitespace
(PageType, 'get_MusicRegion', None), # pylint: disable=bad-whitespace
(PageType, 'get_NoiseRegion', None), # pylint: disable=bad-whitespace
(PageType, 'get_SeparatorRegion', None), # pylint: disable=bad-whitespace
(PageType, 'get_TableRegion', None), # pylint: disable=bad-whitespace
(PageType, 'get_TextRegion', None), # pylint: disable=bad-whitespace
(PageType, 'get_UnknownRegion', None), # pylint: disable=bad-whitespace
# all regions can be recursive
(RegionType, 'get_AdvertRegion', None), # pylint: disable=bad-whitespace
(RegionType, 'get_ChartRegion', None), # pylint: disable=bad-whitespace
(RegionType, 'get_ChemRegion', None), # pylint: disable=bad-whitespace
(RegionType, 'get_CustomRegion', None), # pylint: disable=bad-whitespace
(RegionType, 'get_GraphicRegion', None), # pylint: disable=bad-whitespace
(RegionType, 'get_ImageRegion', None), # pylint: disable=bad-whitespace
(RegionType, 'get_LineDrawingRegion', None), # pylint: disable=bad-whitespace
#(RegionType, 'get_MapRegion', None), # pylint: disable=bad-whitespace
(RegionType, 'get_MathsRegion', None), # pylint: disable=bad-whitespace
(RegionType, 'get_MusicRegion', None), # pylint: disable=bad-whitespace
(RegionType, 'get_NoiseRegion', None), # pylint: disable=bad-whitespace
(RegionType, 'get_SeparatorRegion', None), # pylint: disable=bad-whitespace
(RegionType, 'get_TableRegion', None), # pylint: disable=bad-whitespace
(RegionType, 'get_TextRegion', None), # pylint: disable=bad-whitespace
(RegionType, 'get_UnknownRegion', None), # pylint: disable=bad-whitespace
# only TextRegion can contain TextLine
(TextRegionType, 'get_TextLine', '\n'), # pylint: disable=bad-whitespace
(TextLineType, 'get_Word', ' '), # pylint: disable=bad-whitespace
(WordType, 'get_Glyph', ''), # pylint: disable=bad-whitespace
(GlyphType, None, None), # pylint: disable=bad-whitespace
]
_ORDER = [
(None, TextLineOrderSimpleType.BOTTOMTOTOP, ReadingDirectionSimpleType.RIGHTTOLEFT),
(PageType, 'get_textLineOrder', 'get_readingDirection'), # pylint: disable=bad-whitespace
(TextRegionType, 'get_textLineOrder', 'get_readingDirection'), # pylint: disable=bad-whitespace
(TextLineType, None, 'get_readingDirection'), # pylint: disable=bad-whitespace
(WordType, None, 'get_readingDirection'), # pylint: disable=bad-whitespace
]
# The following parameters control how tolerant we are with respect to
# polygon path self-validity and parent-child containment. We have to
# offer this, because most implementations, including PRImA itself,
# do _not_ offer pixel-precise correctness.
# How much may polygon paths deviate when simplifying them
# to avoid self-intersections?
POLY_TOLERANCE = 1.0
# How large a margin to increase parent polygons before
# checking their children are properly contained?
PARENT_SLACK = 1.5
[docs]
class ConsistencyError(Exception):
"""
Exception representing a consistency error in textual transcription across levels of a PAGE-XML.
(Element text strings must be the concatenation of their children's text strings, joined by white space.)
"""
def __init__(self, tag, ID, file_id, actual, expected):
"""
Construct a new ConsistencyError.
Arguments:
tag (string): Level of the inconsistent element (parent)
ID (string): ``ID`` of the inconsistent element (parent)
file_id (string): ``mets:id`` of the PAGE file
actual (string): Value of parent's TextEquiv[0]/Unicode
expected (string): Concatenated values of children's
TextEquiv[0]/Unicode, joined by white-space
"""
self.tag = tag
self.ID = ID
self.file_id = file_id
self.actual = actual
self.expected = expected
super(ConsistencyError, self).__init__(
"INCONSISTENCY in %s ID '%s' of file '%s': text results '%s' != concatenated '%s'" % (
tag, ID, file_id, actual, expected))
[docs]
class CoordinateConsistencyError(Exception):
"""
Exception representing a consistency error in coordinate confinement across levels of a PAGE-XML.
(Element coordinate polygons must be properly contained in their parents' coordinate polygons.)
"""
def __init__(self, tag, ID, file_id, outer, inner):
"""
Construct a new CoordinateConsistencyError.
Arguments:
tag (string): Level of the offending element (child)
ID (string): ``ID`` of the offending element (child)
file_id (string): ``mets:id`` of the PAGE file
outer (string): Coordinate points of the parent
inner (string): Coordinate points of the child
"""
self.tag = tag
self.ID = ID
self.file_id = file_id
self.outer = outer
self.inner = inner
super(CoordinateConsistencyError, self).__init__(
"INCONSISTENCY in %s ID '%s' of '%s': coords '%s' not within parent coords '%s'" % (
tag, ID, file_id, inner, outer))
[docs]
class CoordinateValidityError(Exception):
"""
Exception representing a validity error of an element's coordinates in PAGE-XML.
(Element coordinate polygons must have at least 3 points, and must not
self-intersect or be non-contiguous or be negative.)
"""
def __init__(self, tag, ID, file_id, points, reason='unknown'):
"""
Construct a new CoordinateValidityError.
Arguments:
tag (string): Level of the offending element (child)
ID (string): ``ID`` of the offending element (child)
points (string): Coordinate points
reason (string): description of the problem
"""
self.tag = tag
self.ID = ID
self.file_id = file_id
self.points = points
super(CoordinateValidityError, self).__init__(
"INVALIDITY in %s ID '%s' of '%s': coords '%s' - %s" % (
tag, ID, file_id, points, reason))
[docs]
def compare_without_whitespace(a, b):
"""
Compare two strings, ignoring all whitespace.
"""
return re.sub('\\s+', '', a) == re.sub('\\s+', '', b)
[docs]
def page_get_reading_order(ro, rogroup):
"""Add all elements from the given reading order group to the given dictionary.
Given a dict ``ro`` from layout element IDs to ReadingOrder element objects,
and an object ``rogroup`` with additional ReadingOrder element objects,
add all references to the dict, traversing the group recursively.
"""
regionrefs = list()
if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)):
regionrefs = (rogroup.get_RegionRefIndexed() +
rogroup.get_OrderedGroupIndexed() +
rogroup.get_UnorderedGroupIndexed())
if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)):
regionrefs = (rogroup.get_RegionRef() +
rogroup.get_OrderedGroup() +
rogroup.get_UnorderedGroup())
for elem in regionrefs:
ro[elem.get_regionRef()] = elem
if not isinstance(elem, (RegionRefType, RegionRefIndexedType)):
page_get_reading_order(ro, elem)
[docs]
def make_poly(polygon_points):
"""Instantiate a Polygon from a list of point pairs, or return an error string"""
if len(polygon_points) < 4:
return 'has too few points'
poly = Polygon(polygon_points)
if POLY_TOLERANCE:
poly = poly.simplify(POLY_TOLERANCE)
if not poly.is_valid:
return explain_validity(poly)
elif poly.is_empty:
return 'is empty'
elif poly.bounds[0] < 0 or poly.bounds[1] < 0:
return 'is negative'
return poly
[docs]
def make_line(line_points):
"""Instantiate a LineString from a list of point pairs, or return an error string"""
if len(line_points) < 2:
return 'has too few points'
line = LineString(line_points)
if not line.is_valid:
return explain_validity(line)
elif line.is_empty:
return 'is empty'
elif line.bounds[0] < 0 or line.bounds[1] < 0:
return 'is negative'
return line
[docs]
@deprecated_alias(strictness='page_textequiv_consistency')
@deprecated_alias(strategy='page_textequiv_strategy')
def validate_consistency(node, page_textequiv_consistency, page_textequiv_strategy,
check_baseline, check_coords, report, file_id,
joinRelations=None, readingOrder=None,
textLineOrder=None, readingDirection=None):
"""
Check whether the text results on an element is consistent with its child element text results,
and whether the coordinates of an element are fully within its parent element coordinates.
"""
log = getLogger('ocrd.page_validator.validate_consistency')
if isinstance(node, PcGtsType):
# top-level (start recursion)
node_id = node.get_pcGtsId()
node = node.get_Page() # has no .id
if not readingOrder:
readingOrder = dict()
ro = node.get_ReadingOrder()
if ro:
page_get_reading_order(readingOrder, ro.get_OrderedGroup() or ro.get_UnorderedGroup())
if not joinRelations:
joinRelations = list()
relations = node.get_Relations() # get RelationsType
if relations:
relations = relations.get_Relation() # get list of RelationType
else:
relations = []
for relation in relations:
if relation.get_type() == 'join': # ignore 'link' type here
joinRelations.append((relation.get_SourceRegionRef().get_regionRef(),
relation.get_TargetRegionRef().get_regionRef()))
elif isinstance(node, GlyphType):
# terminal level (end recursion)
return True
else:
node_id = node.id
tag = node.original_tagname_
log.debug("Validating %s %s", tag, node_id)
consistent = True
if check_coords or check_baseline:
if isinstance(node, PageType):
parent = node.get_Border()
else:
parent = node
if parent:
parent_points = parent.get_Coords().points
node_poly = make_poly(polygon_from_points(parent_points))
if not isinstance(node_poly, Polygon):
report.add_error(CoordinateValidityError(tag, node_id, file_id,
parent_points, node_poly))
log.debug("Invalid coords of %s %s", tag, node_id)
consistent = False
node_poly = None # don't use in further comparisons
else:
node_poly = None
for class_, getterLO, getterRD in _ORDER[1:]:
if isinstance(node, class_):
if getterLO:
textLineOrder = getattr(node, getterLO)()
if getterRD:
readingDirection = getattr(node, getterRD)()
for class_, getter, concatenate_with in _HIERARCHY:
if not isinstance(node, class_):
continue
children = getattr(node, getter)()
if (getter == 'get_TextRegion' and children and
all(child.id in readingOrder for child in children) and
isinstance(readingOrder[children[0].id].parent_object_,
(OrderedGroupType, OrderedGroupIndexedType))):
children = sorted(children, key=lambda child:
readingOrder[child.id].index)
elif ((getter == 'get_TextLine' and textLineOrder == _ORDER[0][1]) or
(getter in ['get_Word', 'get_Glyph'] and readingDirection == _ORDER[0][2])):
children = list(reversed(children))
for child in children:
consistent = (validate_consistency(child, page_textequiv_consistency, page_textequiv_strategy,
check_baseline, check_coords,
report, file_id,
joinRelations, readingOrder,
textLineOrder, readingDirection)
and consistent)
if check_coords and node_poly:
child_tag = child.original_tagname_
child_points = child.get_Coords().points
child_poly = make_poly(polygon_from_points(child_points))
if not isinstance(child_poly, Polygon):
# report.add_error(CoordinateValidityError(child_tag, child.id, file_id, child_points))
# log.debug("Invalid coords of %s %s", child_tag, child.id)
# consistent = False
pass # already reported in recursive call above
elif not child_poly.within(node_poly.buffer(PARENT_SLACK)):
# TODO: automatic repair?
report.add_error(CoordinateConsistencyError(child_tag, child.id, file_id,
parent_points, child_points))
log.debug("Inconsistent coords of %s %s", child_tag, child.id)
consistent = False
if isinstance(node, TextLineType) and check_baseline and node.get_Baseline():
baseline_points = node.get_Baseline().points
baseline_line = make_line(polygon_from_points(baseline_points))
if not isinstance(baseline_line, LineString):
report.add_error(CoordinateValidityError("Baseline", node_id, file_id,
baseline_points, baseline_line))
log.debug("Invalid coords of baseline in %s", node_id)
consistent = False
elif node_poly and not baseline_line.within(node_poly.buffer(PARENT_SLACK)):
report.add_error(CoordinateConsistencyError("Baseline", node_id, file_id,
parent_points, baseline_points))
log.debug("Inconsistent coords of baseline in %s %s", tag, node_id)
consistent = False
if concatenate_with is not None and page_textequiv_consistency != 'off':
# validate textual consistency of node with children
concatenated = concatenate(children, concatenate_with, page_textequiv_strategy,
joinRelations)
text_results = get_text(node, page_textequiv_strategy)
if concatenated and text_results and concatenated != text_results:
consistent = False
if page_textequiv_consistency == 'fix':
log.debug("Repaired text of %s %s", tag, node_id)
set_text(node, concatenated, page_textequiv_strategy)
elif (page_textequiv_consistency == 'strict' # or 'lax' but...
or not compare_without_whitespace(concatenated, text_results)):
log.debug("Inconsistent text of %s %s", tag, node_id)
report.add_error(ConsistencyError(tag, node_id, file_id,
text_results, concatenated))
return consistent
[docs]
def concatenate(nodes, concatenate_with, page_textequiv_strategy, joins=None):
"""
Concatenate nodes textually according to https://ocr-d.github.io/page#consistency-of-text-results-on-different-levels
"""
if not nodes:
return ''
if not joins:
joins = list()
result = get_text(nodes[0], page_textequiv_strategy)
for node, next_node in zip(nodes, nodes[1:]):
if (node.id, next_node.id) not in joins:
# TODO: also cover 2-level joins like word-word
result += concatenate_with
result += get_text(next_node, page_textequiv_strategy)
return result.strip()
[docs]
def get_text(node, page_textequiv_strategy='first'):
"""
Get the first or most confident among text results (depending on ``page_textequiv_strategy``).
For the strategy ``best``, return the string of the highest scoring result.
For the strategy ``first``, return the string of the lowest indexed result.
If there are no scores/indexes, use the first result.
If there are no results, return the empty string.
"""
log = getLogger('ocrd.page_validator.get_text')
textEquivs = node.get_TextEquiv()
if not textEquivs:
log.debug("No text results on %s %s", node, node.id)
return ''
elif page_textequiv_strategy == 'best':
if len(textEquivs) > 1:
textEquivsSorted = sorted([x for x in textEquivs if x.conf],
# generateDS does not convert simpleType for attributes (yet?)
key=lambda x: float(x.conf))
if textEquivsSorted:
return textEquivsSorted[-1].get_Unicode().strip()
# fall back to first element
return textEquivs[0].get_Unicode().strip()
#elif page_textequiv_strategy == 'first':
else:
if len(textEquivs) > 1:
textEquivsSorted = sorted([x for x in textEquivs if isinstance(x.index, int)],
key=lambda x: x.index)
if textEquivsSorted:
return textEquivsSorted[0].get_Unicode().strip()
# fall back to first element
return textEquivs[0].get_Unicode().strip()
[docs]
def set_text(node, text, page_textequiv_strategy):
"""
Set the first or most confident among text results (depending on ``page_textequiv_strategy``).
For the strategy ``best``, set the string of the highest scoring result.
For the strategy ``first``, set the string of the lowest indexed result.
If there are no scores/indexes, use the first result.
If there are no results, add a new one.
"""
text = text.strip()
textEquivs = node.get_TextEquiv()
if not textEquivs:
node.add_TextEquiv(TextEquivType(Unicode=text)) # or index=0 ?
elif page_textequiv_strategy == 'best':
if len(textEquivs) > 1:
textEquivsSorted = sorted([x for x in textEquivs if x.conf],
# generateDS does not convert simpleType for attributes (yet?)
key=lambda x: float(x.conf))
if textEquivsSorted:
textEquivsSorted[-1].set_Unicode(text)
return
# fall back to first element
textEquivs[0].set_Unicode(text)
#elif page_textequiv_strategy == 'first':
else:
if len(textEquivs) > 1:
textEquivsSorted = sorted([x for x in textEquivs if isinstance(x.index, int)],
key=lambda x: x.index)
if textEquivsSorted:
textEquivsSorted[0].set_Unicode(text)
return
# fall back to first element
textEquivs[0].set_Unicode(text)
[docs]
class PageValidator():
"""
Validator for `OcrdPage <../ocrd_models/ocrd_models.ocrd_page.html>`.
"""
[docs]
@staticmethod
@deprecated_alias(strictness='page_textequiv_consistency')
@deprecated_alias(strategy='page_textequiv_strategy')
def validate(filename=None, ocrd_page=None, ocrd_file=None,
page_textequiv_consistency='strict', page_textequiv_strategy='first',
check_baseline=True, check_coords=True):
"""
Validates a PAGE file for consistency by filename, OcrdFile or passing OcrdPage directly.
Arguments:
filename (string): Path to PAGE
ocrd_page (OcrdPage): OcrdPage instance
ocrd_file (OcrdFile): OcrdFile instance wrapping OcrdPage
page_textequiv_consistency (string): 'strict', 'lax', 'fix' or 'off'
page_textequiv_strategy (string): Currently only 'first'
check_baseline (bool): whether Baseline must be fully within TextLine/Coords
check_coords (bool): whether *Region/TextLine/Word/Glyph must each be fully
contained within Border/*Region/TextLine/Word, resp.
Returns:
report (:class:`ValidationReport`) Report on the validity
"""
log = getLogger('ocrd.page_validator.validate')
if ocrd_page:
page = ocrd_page
file_id = ocrd_page.get_pcGtsId()
elif ocrd_file:
page = page_from_file(ocrd_file)
file_id = ocrd_file.ID
elif filename:
page = parse(filename, silence=True)
file_id = filename
else:
raise Exception("At least one of ocrd_page, ocrd_file or filename must be set")
if page_textequiv_strategy not in ('first'):
raise Exception("page_textequiv_strategy %s not implemented" % page_textequiv_strategy)
if page_textequiv_consistency not in ('strict', 'lax', 'fix', 'off'):
raise Exception("page_textequiv_consistency level %s not implemented" % page_textequiv_consistency)
report = ValidationReport()
log.info("Validating input file '%s'", file_id)
validate_consistency(page, page_textequiv_consistency, page_textequiv_strategy, check_baseline, check_coords, report, file_id)
return report