Fix the extraction of text from Page with TableRegion

Dinglehopper did not consider `OrderedGroupIndexed` elements in the
`ReadingOrder` element when extracting text regions. As a consequence, the
text regions nested in a `TableRegion` were not considered for text extraction.
pull/50/head
Benjamin Rosemann 4 years ago
parent 8cd8314c8a
commit a68fc269d9

@ -70,24 +70,11 @@ def page_extract(tree, *, textequiv_level="region"):
if reading_order is not None:
for group in reading_order.iterfind("./*", namespaces=nsmap):
if ET.QName(group.tag).localname == "OrderedGroup":
region_ref_indexeds = group.findall(
"./page:RegionRefIndexed", namespaces=nsmap
)
for region_ref_indexed in sorted(
region_ref_indexeds, key=lambda r: int(r.attrib["index"])
):
region_id = region_ref_indexed.attrib["regionRef"]
region = tree.find(
'.//page:TextRegion[@id="%s"]' % region_id, namespaces=nsmap
regions.extend(
extract_texts_from_reading_order_group(
group, tree, nsmap, textequiv_level
)
if region is not None:
regions.append(
ExtractedText.from_text_segment(
region, nsmap, textequiv_level=textequiv_level
)
)
else:
pass # Not a TextRegion
)
else:
raise NotImplementedError
else:
@ -104,6 +91,35 @@ def page_extract(tree, *, textequiv_level="region"):
return ExtractedText(None, regions, "\n", None)
def extract_texts_from_reading_order_group(group, tree, nsmap, textequiv_level):
    """Gather the texts of the TextRegions referenced by a reading order group.

    The direct ``RegionRefIndexed`` and ``OrderedGroupIndexed`` children of
    *group* are visited in ascending order of their integer ``index``
    attribute; children without an ``index`` attribute are left out.
    A nested ``OrderedGroupIndexed`` is processed recursively and its
    results are spliced in at the position of the nested group.

    :param group: reading order element whose children are walked
    :param tree: document that is searched for the referenced regions
    :param nsmap: namespace map providing the ``page`` prefix
    :param textequiv_level: forwarded to ``ExtractedText.from_text_segment``
    :return: list of the extracted texts in reading order
    """
    # Region references first, nested groups second: the sort below is
    # stable, so this decides the order of children sharing an index.
    candidates = group.findall(
        "./page:RegionRefIndexed", namespaces=nsmap
    ) + group.findall("./page:OrderedGroupIndexed", namespaces=nsmap)
    indexed = [entry for entry in candidates if "index" in entry.attrib]
    indexed.sort(key=lambda entry: int(entry.attrib["index"]))

    texts = []
    for entry in indexed:
        if ET.QName(entry.tag).localname == "OrderedGroupIndexed":
            nested = extract_texts_from_reading_order_group(
                entry, tree, nsmap, textequiv_level
            )
            texts.extend(nested)
            continue

        ref_id = entry.attrib["regionRef"]
        text_region = tree.find(
            './/page:TextRegion[@id="%s"]' % ref_id, namespaces=nsmap
        )
        if text_region is None:
            # The reference does not resolve to a TextRegion; nothing to add.
            continue
        texts.append(
            ExtractedText.from_text_segment(
                text_region, nsmap, textequiv_level=textequiv_level
            )
        )
    return texts
def page_text(tree, *, textequiv_level="region"):
    """Return the ``text`` of the result of ``page_extract`` for *tree*.

    :param tree: document passed on to ``page_extract``
    :param textequiv_level: passed on to ``page_extract`` unchanged
    """
    extracted = page_extract(tree, textequiv_level=textequiv_level)
    return extracted.text

@ -0,0 +1,121 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd">
<Metadata>
<Creator/>
<Created>2020-10-28T08:43:47</Created>
<LastChange>1970-01-01T00:00:00</LastChange>
<Comments/>
</Metadata>
<Page imageFilename="0001.png" imageHeight="1123" imageWidth="794">
<TextRegion id="r1" orientation="0.0">
<Coords points="315,437 315,407 339,407 339,437"/>
<TextLine id="l1">
<Coords points="318,434 318,409 337,409 337,434"/>
<TextEquiv index="0">
<Unicode>5</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>5</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r2" orientation="0.0">
<Coords points="425,436 425,406 450,406 450,436"/>
<TextLine id="l2">
<Coords points="429,434 429,410 446,410 446,434"/>
<TextEquiv index="0">
<Unicode>6</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>6</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r3" orientation="0.0">
<Coords points="233,499 233,467 262,467 262,499"/>
<TextLine id="l3">
<Coords points="237,496 237,468 258,468 258,496"/>
<TextEquiv index="0">
<Unicode>7</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>7</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r4" orientation="0.0">
<Coords points="316,497 316,470 340,470 340,497"/>
<TextLine id="l4">
<Coords points="319,494 319,472 337,472 337,494"/>
<TextEquiv index="0">
<Unicode>8</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>8</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r5" orientation="0.0">
<Coords points="423,501 423,468 451,468 451,501"/>
<TextLine id="l5">
<Coords points="427,497 427,470 447,470 447,497"/>
<TextEquiv index="0">
<Unicode>9</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>9</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r6" orientation="0.0">
<Coords points="237,373 237,347 259,347 259,373"/>
<TextLine id="l6">
<Coords points="240,372 240,349 256,349 256,372"/>
<TextEquiv index="0">
<Unicode>1</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>1</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r7" orientation="0.0">
<Coords points="312,373 312,347 341,347 341,373"/>
<TextLine id="l7">
<Coords points="318,372 318,350 338,350 338,372"/>
<TextEquiv index="0">
<Unicode>2</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>2</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r8" orientation="0.0">
<Coords points="428,373 428,349 448,349 448,373"/>
<TextLine id="l8">
<Coords points="430,373 430,349 445,349 445,373"/>
<TextEquiv index="0">
<Unicode>3</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>3</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r9" orientation="0.0">
<Coords points="236,438 236,406 261,406 261,438"/>
<TextLine id="l9">
<Coords points="238,436 238,408 258,408 258,436"/>
<TextEquiv index="0">
<Unicode>4</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>4</Unicode>
</TextEquiv>
</TextRegion>
</Page>
</PcGts>

@ -0,0 +1,134 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd">
<Metadata>
<Creator/>
<Created>2020-10-28T08:43:47</Created>
<LastChange>1970-01-01T00:00:00</LastChange>
<Comments/>
</Metadata>
<Page imageFilename="0001.png" imageHeight="1123" imageWidth="794">
<ReadingOrder>
<OrderedGroup id="g1">
<RegionRefIndexed index="0" regionRef="r6"/>
<RegionRefIndexed index="1" regionRef="r7"/>
<RegionRefIndexed index="2" regionRef="r8"/>
<RegionRefIndexed index="3" regionRef="r9"/>
<RegionRefIndexed index="4" regionRef="r1"/>
<RegionRefIndexed index="5" regionRef="r2"/>
<RegionRefIndexed index="6" regionRef="r3"/>
<RegionRefIndexed index="7" regionRef="r4"/>
<RegionRefIndexed index="8" regionRef="r5"/>
</OrderedGroup>
</ReadingOrder>
<TextRegion id="r1" orientation="0.0">
<Coords points="315,437 315,407 339,407 339,437"/>
<TextLine id="l1">
<Coords points="318,434 318,409 337,409 337,434"/>
<TextEquiv index="0">
<Unicode>5</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>5</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r2" orientation="0.0">
<Coords points="425,436 425,406 450,406 450,436"/>
<TextLine id="l2">
<Coords points="429,434 429,410 446,410 446,434"/>
<TextEquiv index="0">
<Unicode>6</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>6</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r3" orientation="0.0">
<Coords points="233,499 233,467 262,467 262,499"/>
<TextLine id="l3">
<Coords points="237,496 237,468 258,468 258,496"/>
<TextEquiv index="0">
<Unicode>7</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>7</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r4" orientation="0.0">
<Coords points="316,497 316,470 340,470 340,497"/>
<TextLine id="l4">
<Coords points="319,494 319,472 337,472 337,494"/>
<TextEquiv index="0">
<Unicode>8</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>8</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r5" orientation="0.0">
<Coords points="423,501 423,468 451,468 451,501"/>
<TextLine id="l5">
<Coords points="427,497 427,470 447,470 447,497"/>
<TextEquiv index="0">
<Unicode>9</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>9</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r6" orientation="0.0">
<Coords points="237,373 237,347 259,347 259,373"/>
<TextLine id="l6">
<Coords points="240,372 240,349 256,349 256,372"/>
<TextEquiv index="0">
<Unicode>1</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>1</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r7" orientation="0.0">
<Coords points="312,373 312,347 341,347 341,373"/>
<TextLine id="l7">
<Coords points="318,372 318,350 338,350 338,372"/>
<TextEquiv index="0">
<Unicode>2</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>2</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r8" orientation="0.0">
<Coords points="428,373 428,349 448,349 448,373"/>
<TextLine id="l8">
<Coords points="430,373 430,349 445,349 445,373"/>
<TextEquiv index="0">
<Unicode>3</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>3</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r9" orientation="0.0">
<Coords points="236,438 236,406 261,406 261,438"/>
<TextLine id="l9">
<Coords points="238,436 238,408 258,408 258,436"/>
<TextEquiv index="0">
<Unicode>4</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>4</Unicode>
</TextEquiv>
</TextRegion>
</Page>
</PcGts>

@ -0,0 +1,134 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd">
<Metadata>
<Creator/>
<Created>2020-10-28T08:43:47</Created>
<LastChange>1970-01-01T00:00:00</LastChange>
<Comments/>
</Metadata>
<Page imageFilename="0002.png" imageHeight="1123" imageWidth="794">
<ReadingOrder>
<OrderedGroup id="g1">
<RegionRefIndexed index="0" regionRef="r6"/>
<RegionRefIndexed index="1" regionRef="r9"/>
<RegionRefIndexed index="2" regionRef="r3"/>
<RegionRefIndexed index="3" regionRef="r7"/>
<RegionRefIndexed index="4" regionRef="r1"/>
<RegionRefIndexed index="5" regionRef="r4"/>
<RegionRefIndexed index="6" regionRef="r8"/>
<RegionRefIndexed index="7" regionRef="r2"/>
<RegionRefIndexed index="8" regionRef="r5"/>
</OrderedGroup>
</ReadingOrder>
<TextRegion id="r1" orientation="0.0">
<Coords points="315,437 315,407 339,407 339,437"/>
<TextLine id="l1">
<Coords points="318,434 318,409 337,409 337,434"/>
<TextEquiv index="0">
<Unicode>5</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>5</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r2" orientation="0.0">
<Coords points="425,436 425,406 450,406 450,436"/>
<TextLine id="l2">
<Coords points="429,434 429,410 446,410 446,434"/>
<TextEquiv index="0">
<Unicode>6</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>6</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r3" orientation="0.0">
<Coords points="233,499 233,467 262,467 262,499"/>
<TextLine id="l3">
<Coords points="237,496 237,468 258,468 258,496"/>
<TextEquiv index="0">
<Unicode>7</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>7</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r4" orientation="0.0">
<Coords points="316,497 316,470 340,470 340,497"/>
<TextLine id="l4">
<Coords points="319,494 319,472 337,472 337,494"/>
<TextEquiv index="0">
<Unicode>8</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>8</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r5" orientation="0.0">
<Coords points="423,501 423,468 451,468 451,501"/>
<TextLine id="l5">
<Coords points="427,497 427,470 447,470 447,497"/>
<TextEquiv index="0">
<Unicode>9</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>9</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r6" orientation="0.0">
<Coords points="237,373 237,347 259,347 259,373"/>
<TextLine id="l6">
<Coords points="240,372 240,349 256,349 256,372"/>
<TextEquiv index="0">
<Unicode>1</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>1</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r7" orientation="0.0">
<Coords points="312,373 312,347 341,347 341,373"/>
<TextLine id="l7">
<Coords points="318,372 318,350 338,350 338,372"/>
<TextEquiv index="0">
<Unicode>2</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>2</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r8" orientation="0.0">
<Coords points="428,373 428,349 448,349 448,373"/>
<TextLine id="l8">
<Coords points="430,373 430,349 445,349 445,373"/>
<TextEquiv index="0">
<Unicode>3</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>3</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r9" orientation="0.0">
<Coords points="236,438 236,406 261,406 261,438"/>
<TextLine id="l9">
<Coords points="238,436 238,408 258,408 258,436"/>
<TextEquiv index="0">
<Unicode>4</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>4</Unicode>
</TextEquiv>
</TextRegion>
</Page>
</PcGts>

@ -0,0 +1,139 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd">
<Metadata>
<Creator/>
<Created>2020-10-28T08:43:47</Created>
<LastChange>1970-01-01T00:00:00</LastChange>
<Comments/>
</Metadata>
<Page imageFilename="0001.png" imageHeight="1123" imageWidth="794">
<ReadingOrder>
<OrderedGroup id="g1">
<OrderedGroupIndexed id="r0_order" regionRef="r0" index="0">
<RegionRefIndexed index="0" regionRef="r6"/>
<RegionRefIndexed index="1" regionRef="r7"/>
<RegionRefIndexed index="2" regionRef="r8"/>
<RegionRefIndexed index="3" regionRef="r9"/>
<RegionRefIndexed index="4" regionRef="r1"/>
<RegionRefIndexed index="5" regionRef="r2"/>
<RegionRefIndexed index="6" regionRef="r3"/>
<RegionRefIndexed index="7" regionRef="r4"/>
<RegionRefIndexed index="8" regionRef="r5"/>
</OrderedGroupIndexed>
</OrderedGroup>
</ReadingOrder>
<TableRegion id="r0">
<Coords points="230,530 230,330 460,330 460,530"/>
<TextRegion id="r1" orientation="0.0">
<Coords points="315,437 315,407 339,407 339,437"/>
<TextLine id="l1">
<Coords points="318,434 318,409 337,409 337,434"/>
<TextEquiv index="0">
<Unicode>5</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>5</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r2" orientation="0.0">
<Coords points="425,436 425,406 450,406 450,436"/>
<TextLine id="l2">
<Coords points="429,434 429,410 446,410 446,434"/>
<TextEquiv index="0">
<Unicode>6</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>6</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r3" orientation="0.0">
<Coords points="233,499 233,467 262,467 262,499"/>
<TextLine id="l3">
<Coords points="237,496 237,468 258,468 258,496"/>
<TextEquiv index="0">
<Unicode>7</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>7</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r4" orientation="0.0">
<Coords points="316,497 316,470 340,470 340,497"/>
<TextLine id="l4">
<Coords points="319,494 319,472 337,472 337,494"/>
<TextEquiv index="0">
<Unicode>8</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>8</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r5" orientation="0.0">
<Coords points="423,501 423,468 451,468 451,501"/>
<TextLine id="l5">
<Coords points="427,497 427,470 447,470 447,497"/>
<TextEquiv index="0">
<Unicode>9</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>9</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r6" orientation="0.0">
<Coords points="237,373 237,347 259,347 259,373"/>
<TextLine id="l6">
<Coords points="240,372 240,349 256,349 256,372"/>
<TextEquiv index="0">
<Unicode>1</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>1</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r7" orientation="0.0">
<Coords points="312,373 312,347 341,347 341,373"/>
<TextLine id="l7">
<Coords points="318,372 318,350 338,350 338,372"/>
<TextEquiv index="0">
<Unicode>2</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>2</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r8" orientation="0.0">
<Coords points="428,373 428,349 448,349 448,373"/>
<TextLine id="l8">
<Coords points="430,373 430,349 445,349 445,373"/>
<TextEquiv index="0">
<Unicode>3</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>3</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r9" orientation="0.0">
<Coords points="236,438 236,406 261,406 261,438"/>
<TextLine id="l9">
<Coords points="238,436 238,408 258,408 258,436"/>
<TextEquiv index="0">
<Unicode>4</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>4</Unicode>
</TextEquiv>
</TextRegion>
</TableRegion>
</Page>
</PcGts>

@ -0,0 +1,134 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd">
<Metadata>
<Creator/>
<Created>2020-10-28T08:43:47</Created>
<LastChange>1970-01-01T00:00:00</LastChange>
<Comments/>
</Metadata>
<Page imageFilename="0001.png" imageHeight="1123" imageWidth="794">
<ReadingOrder>
<UnorderedGroup id="g1">
<RegionRef regionRef="r6"/>
<RegionRef regionRef="r7"/>
<RegionRef regionRef="r8"/>
<RegionRef regionRef="r9"/>
<RegionRef regionRef="r1"/>
<RegionRef regionRef="r2"/>
<RegionRef regionRef="r3"/>
<RegionRef regionRef="r4"/>
<RegionRef regionRef="r5"/>
</UnorderedGroup>
</ReadingOrder>
<TextRegion id="r1" orientation="0.0">
<Coords points="315,437 315,407 339,407 339,437"/>
<TextLine id="l1">
<Coords points="318,434 318,409 337,409 337,434"/>
<TextEquiv index="0">
<Unicode>5</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>5</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r2" orientation="0.0">
<Coords points="425,436 425,406 450,406 450,436"/>
<TextLine id="l2">
<Coords points="429,434 429,410 446,410 446,434"/>
<TextEquiv index="0">
<Unicode>6</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>6</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r3" orientation="0.0">
<Coords points="233,499 233,467 262,467 262,499"/>
<TextLine id="l3">
<Coords points="237,496 237,468 258,468 258,496"/>
<TextEquiv index="0">
<Unicode>7</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>7</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r4" orientation="0.0">
<Coords points="316,497 316,470 340,470 340,497"/>
<TextLine id="l4">
<Coords points="319,494 319,472 337,472 337,494"/>
<TextEquiv index="0">
<Unicode>8</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>8</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r5" orientation="0.0">
<Coords points="423,501 423,468 451,468 451,501"/>
<TextLine id="l5">
<Coords points="427,497 427,470 447,470 447,497"/>
<TextEquiv index="0">
<Unicode>9</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>9</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r6" orientation="0.0">
<Coords points="237,373 237,347 259,347 259,373"/>
<TextLine id="l6">
<Coords points="240,372 240,349 256,349 256,372"/>
<TextEquiv index="0">
<Unicode>1</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>1</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r7" orientation="0.0">
<Coords points="312,373 312,347 341,347 341,373"/>
<TextLine id="l7">
<Coords points="318,372 318,350 338,350 338,372"/>
<TextEquiv index="0">
<Unicode>2</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>2</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r8" orientation="0.0">
<Coords points="428,373 428,349 448,349 448,373"/>
<TextLine id="l8">
<Coords points="430,373 430,349 445,349 445,373"/>
<TextEquiv index="0">
<Unicode>3</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>3</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion id="r9" orientation="0.0">
<Coords points="236,438 236,406 261,406 261,438"/>
<TextLine id="l9">
<Coords points="238,436 238,408 258,408 258,436"/>
<TextEquiv index="0">
<Unicode>4</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>4</Unicode>
</TextEquiv>
</TextRegion>
</Page>
</PcGts>

@ -0,0 +1,29 @@
import os
import pytest
from lxml import etree as ET
from .. import page_text
@pytest.mark.parametrize(
    "file,expected_text",
    [
        ("table-order-0001.xml", "1\n2\n3\n4\n5\n6\n7\n8\n9"),
        ("table-order-0002.xml", "1\n4\n7\n2\n5\n8\n3\n6\n9"),
        ("table-region.xml", "1\n2\n3\n4\n5\n6\n7\n8\n9"),
        ("table-no-reading-order.xml", "5\n6\n7\n8\n9\n1\n2\n3\n4"),
        ("table-unordered.xml", "5\n6\n7\n8\n9\n1\n2\n3\n4"),
    ],
)
@pytest.mark.integration
def test_reading_order_settings(file, expected_text):
    """Check the text extracted from each fixture in ``data/table-order``."""
    here = os.path.dirname(os.path.abspath(__file__))
    xml_path = os.path.join(here, "data", "table-order", file)

    if "table-unordered.xml" != file:
        extracted = page_text(ET.parse(xml_path))
        assert extracted == expected_text
        return

    # This fixture is expected to fail with NotImplementedError, so its
    # expected_text value is never compared.
    with pytest.raises(NotImplementedError):
        page_text(ET.parse(xml_path))
Loading…
Cancel
Save