Mirror of https://github.com/qurator-spk/ocrd_repair_inconsistencies.git, synced 2025-10-31 00:44:13 +01:00
Merge pull request #6 from bertsky/relax-concat-test

Relax concatenation test, backout gracefully

Commit 594d937ef5: 7 changed files with 156 additions and 90 deletions
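In short, the change relaxes the parent/child consistency check: a segment now counts as inconsistent only if its `TextEquiv` text differs from the concatenation of its children's texts both verbatim and with the joiner characters ignored, and missing `TextEquiv` entries make the check back out quietly instead of raising. A minimal standalone sketch of that logic, with illustrative names and dict-based elements rather than the processor's actual PAGE-XML API:

```python
# Toy sketch of the relaxed consistency test and graceful backout
# (illustrative names and dict-based elements, not the processor's real API).

def join_texts(elements, joiner=''):
    """Concatenate element texts, or return None if any text is missing (graceful backout)."""
    texts = []
    for element in elements:
        text = element.get('text')
        if not text:
            return None  # missing TextEquiv: back out instead of raising
        texts.append(text)
    return joiner.join(texts)

def order_is_inconsistent(parent_text, children, joiner):
    """True iff the concatenated child texts disagree with the parent text
    both verbatim and with the joiner characters ignored (the relaxed test)."""
    concat = join_texts(children, joiner)
    if not parent_text or not concat:
        return False  # nothing reliable to compare, skip the segment
    return (parent_text != concat and
            parent_text.replace(joiner, '') != concat.replace(joiner, ''))

# Hypothetical example: a line whose two words are annotated in the wrong order.
words = [{'text': 'World'}, {'text': 'Hello'}]
print(order_is_inconsistent('Hello World', words, ' '))   # True
print(order_is_inconsistent('Hello World', [{'text': 'Hello'}, {'text': 'World'}], ' '))  # False
```

The diff below shows the real implementation of this idea in `get_text` and `_fix_segment`.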
				
			
		
							
								
								
									
.gitignore (vendored): 1 change
@@ -104,5 +104,6 @@ venv.bak/
.mypy_cache/

# vim tmp
*~
*.swp
*.swo
							
								
								
									
.pylintrc (new file): 24 lines
@@ -0,0 +1,24 @@
[MASTER]

[MESSAGES CONTROL]
disable =
    ungrouped-imports,
    bad-continuation,
    missing-docstring,
    no-self-use,
    superfluous-parens,
    invalid-name,
    line-too-long,
    too-many-arguments,
    too-many-branches,
    too-many-statements,
    too-many-locals,
    too-few-public-methods,
    wrong-import-order,
    duplicate-code

# allow indented whitespace (as required by interpreter):
no-space-check=empty-line

# allow non-snake-case identifiers:
good-names=n,i
							
								
								
									
Makefile (new file): 21 lines
@@ -0,0 +1,21 @@
SHELL = /bin/bash
PYTHON = python3
PIP = pip3

define HELP
cat <<EOF
ocrd_repair_inconsistencies

Targets:
	deps     Install Python dependencies via pip
	install  Install Python package
EOF
endef
export HELP
help: ; @eval "$$HELP"

deps:
	$(PIP) install -r requirements.txt

install:
	$(PIP) install .
							
								
								
									
README.md: 39 changes
@@ -1,20 +1,49 @@
# ocrd_repair_inconsistencies

Automatically re-order lines, words and glyphs to become textually consistent with their parents.
    Automatically re-order lines, words and glyphs to become textually consistent with their parents.

## Introduction

PAGE-XML elements with textual annotation are re-ordered by their centroid coordinates
in top-to-bottom/left-to-right fashion iff such re-ordering fixes the inconsistency
between their appropriately concatenated `TextEquiv` texts with their parent's `TextEquiv` text.
iff such re-ordering fixes the inconsistency between their appropriately concatenated
`TextEquiv` texts with their parent's `TextEquiv` text.

If `TextEquiv` is missing, skip the respective elements.

Where available, respect the annotated visual order:
- For regions vs lines, sort in `top-to-bottom` fashion, unless another `textLineOrder` is annotated.  
  (Both `left-to-right` and `right-to-left` will be skipped currently.)
- For lines vs words and words vs glyphs, sort in `left-to-right` fashion, unless another `readingDirection` is annotated.  
  (Both `top-to-bottom` and `bottom-to-top` will be skipped currently.)

This processor does not affect `ReadingOrder` between regions, just the order of the XML elements
below the region level, and only if not contradicting the annotated `textLineOrder`/`readingDirection`.

We wrote this as a one-shot script to fix some files. Use with caution.

## Installation

## Example usage
(In your venv, run:)

```sh
make deps     # or pip install -r requirements.txt
make install  # or pip install .
```

## Usage

Offers the following user interfaces:

### [OCR-D processor](https://ocr-d.github.io/cli) CLI `ocrd-repair-inconsistencies`

To be used with [PageXML](https://github.com/PRImA-Research-Lab/PAGE-XML)
documents in an [OCR-D](https://ocr-d.github.io) annotation workflow.

### Example

Use the following script to repair `OCR-D-GT-PAGE` annotation in workspaces,
and then replace it with the output on success:

For example, use this fix script:
~~~sh
#!/bin/bash
set -e
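The order handling described in the README introduction above boils down to mapping each annotated `textLineOrder`/`readingDirection` onto a sort axis and direction, and skipping the values the processor does not handle. A hypothetical helper sketching that mapping (the function is illustrative and not part of the package):

```python
# Hypothetical helper sketching the rules above (not part of the package):
# translate the annotated order into a sort axis and direction, or None to skip.

def sort_plan(level, annotated_order=''):
    """Return (axis, reverse) for re-ordering children of a segment, or None to skip.

    level: 'region' (children are lines) or 'line'/'word' (children are words/glyphs).
    annotated_order: the PAGE-XML textLineOrder or readingDirection, '' if unannotated.
    """
    if level == 'region':
        if annotated_order in ('', 'top-to-bottom'):
            return ('y', False)          # default: sort lines by centroid y
        if annotated_order == 'bottom-to-top':
            return ('y', True)
        return None                      # left-to-right / right-to-left regions are skipped
    if annotated_order in ('', 'left-to-right'):
        return ('x', False)              # default: sort words/glyphs by centroid x
    if annotated_order == 'right-to-left':
        return ('x', True)
    return None                          # top-to-bottom / bottom-to-top lines/words are skipped

print(sort_plan('region', 'bottom-to-top'))  # ('y', True)
print(sort_plan('line', 'top-to-bottom'))    # None (skipped)
```

In the processor itself, the axis corresponds to sorting children by the x or y coordinate of their polygon centroid, as the diff below shows.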
@@ -1,4 +1,4 @@
import json
from pkg_resources import resource_string

OCRD_TOOL = json.loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))
OCRD_TOOL = json.loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))
@@ -6,6 +6,8 @@ from collections import Sequence
from ocrd import Processor
from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import (
    TextRegionType, TextLineType, WordType,
    MetadataItemType, LabelsType, LabelType,
    to_xml
)
from ocrd_utils import (
@@ -33,7 +35,20 @@ class RepairInconsistencies(Processor):
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()

            # add metadata about this operation and its runtime parameters:
            metadata = pcgts.get_Metadata() # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(type_="processingStep",
                                 name=self.ocrd_tool['steps'][0],
                                 value=TOOL,
                                 Labels=[LabelsType(
                                     externalModel="ocrd-tool",
                                     externalId="parameters",
                                     Label=[LabelType(type_=name,
                                                      value=self.parameter[name])
                                            for name in self.parameter.keys()])]))

            regions = page.get_TextRegion()

            for region in regions:
@@ -49,7 +64,7 @@ class RepairInconsistencies(Processor):
                             page_id, region.id, textLineOrder)
                    continue

                _fix_lines(region, page_id, reverse=textLineOrder=='bottom-to-top')
                _fix_segment(region, page_id, reverse=(textLineOrder == 'bottom-to-top'))

                lines = region.get_TextLine()
                for line in lines:
@@ -65,7 +80,7 @@ class RepairInconsistencies(Processor):
                                 page_id, line.id, readingDirection)
                        continue

                    _fix_words(line, page_id, reverse=readingDirection=='right-to-left')
                    _fix_segment(line, page_id, reverse=(readingDirection == 'right-to-left'))

                    words = line.get_Word()
                    for word in words:
@@ -81,7 +96,7 @@ class RepairInconsistencies(Processor):
                                     page_id, word.id, readingDirection)
                            continue

                        _fix_glyphs(word, page_id, reverse=readingDirection=='right-to-left')
                        _fix_segment(word, page_id, reverse=(readingDirection == 'right-to-left'))

            file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
            if file_id == input_file.ID:
@@ -95,91 +110,66 @@ class RepairInconsistencies(Processor):
                content=to_xml(pcgts))


def get_text(thing, joiner=None):
def get_text(thing, joiner=''):
    """Get the text of the given thing, joining if necessary"""

    def _get_text_for_one(t):
        if len(t.get_TextEquiv()) != 1:
            raise NotImplementedError
    def _get_text_for_one(one):
        try:
            return t.get_TextEquiv()[0].get_Unicode()
            return one.get_TextEquiv()[0].get_Unicode()
        except Exception:
            LOG.warning('element "%s" has no text', one.id)
            return None

    if isinstance(thing, Sequence):
        text = joiner.join(_get_text_for_one(t) for t in thing)
        texts = [_get_text_for_one(part) for part in thing]
        if all(texts):
            return joiner.join(texts)
        return None
    return _get_text_for_one(thing)

def _fix_segment(segment, page_id, reverse=False):
    """Fix order of child elements of (region/line/word) segment."""

    if isinstance(segment, TextRegionType):
        joiner = '\n'
        sort_horizontal = False
        children = segment.get_TextLine()
        adoption = segment.set_TextLine
    elif isinstance(segment, TextLineType):
        joiner = ' '
        sort_horizontal = True
        children = segment.get_Word()
        adoption = segment.set_Word
    elif isinstance(segment, WordType):
        joiner = ''
        sort_horizontal = True
        children = segment.get_Glyph()
        adoption = segment.set_Glyph
    else:
        text = _get_text_for_one(thing)
    return text


def _fix_words(line, page_id, reverse=False):
    """Fix word order in a line"""

    words = line.get_Word()
    if not words:
        raise Exception('invalid element type %s of segment to fix' % type(segment))
    if not children:
        return
    line_text = get_text(line)
    words_text = get_text(words, ' ')
    if line_text != words_text:
        sorted_words = sorted(words, reverse=reverse,
                              key=lambda w: Polygon(polygon_from_points(w.get_Coords().points)).centroid.x)
        sorted_words_text = get_text(sorted_words, ' ')

        if sorted_words_text == line_text:
            LOG.info('Fixing word order of page "%s" line "%s"', page_id, line.id)
            line.set_Word(sorted_words)
    segment_text = get_text(segment)
    concat_text = get_text(children, joiner)
    if (segment_text and concat_text and
        segment_text != concat_text and
        segment_text.replace(joiner, '') != concat_text.replace(joiner, '')):
        def polygon_position(child, horizontal=sort_horizontal):
            polygon = Polygon(polygon_from_points(child.get_Coords().points))
            if horizontal:
                return polygon.centroid.x
            return polygon.centroid.y
        sorted_children = sorted(children, reverse=reverse, key=polygon_position)
        sorted_concat_text = get_text(sorted_children, joiner)

        if (segment_text == sorted_concat_text or
            segment_text.replace(joiner, '') == sorted_concat_text.replace(joiner, '')):
            LOG.info('Fixing element order of page "%s" segment "%s"', page_id, segment.id)
            adoption(sorted_children)
        else:
            LOG.debug('Resorting lines of page "%s" region "%s" from %s to %s does not suffice to turn "%s" into "%s"',
                      page_id, line.id,
                      str([word.id for word in words]),
                      str([word.id for word in sorted_words]),
                      words_text, line_text)


def _fix_glyphs(word, page_id, reverse=False):
    """Fix glyph order in a word"""

    glyphs = word.get_Glyph()
    if not glyphs:
        return
    word_text = get_text(word)
    glyphs_text = get_text(glyphs, '')
    if word_text != glyphs_text:
        sorted_glyphs = sorted(glyphs, reverse=reverse,
                               key=lambda g: Polygon(polygon_from_points(g.get_Coords().points)).centroid.x)
        sorted_glyphs_text = get_text(sorted_glyphs, '')

        if sorted_glyphs_text == word_text:
            LOG.info('Fixing glyph order of page "%s" word "%s"', page_id, word.id)
            word.set_Glyph(sorted_glyphs)
        else:
            LOG.debug('Resorting glyphs of page "%s" word "%s" from %s to %s does not suffice to turn "%s" into "%s"',
                      page_id, word.id,
                      str([glyph.id for glyph in glyphs]),
                      str([glyph.id for glyph in sorted_glyphs]),
                      glyphs_text, word_text)


def _fix_lines(region, page_id, reverse=False):
    """Fix line order in a region"""

    lines = region.get_TextLine()
    if not lines:
        return
    region_text = get_text(region)
    lines_text = get_text(lines, '\n')
    if region_text != lines_text:
        sorted_lines = sorted(lines, reverse=reverse,
                              key=lambda l: Polygon(polygon_from_points(l.get_Coords().points)).centroid.y)
        sorted_lines_text = get_text(sorted_lines, '\n')

        if sorted_lines_text == region_text:
            LOG.info('Fixing line order of page "%s" region "%s"', page_id, region.id)
            region.set_TextLine(sorted_lines)
        else:
            LOG.debug('Resorting lines of page "%s" region "%s" from %s to %s does not suffice to turn "%s" into "%s"',
                      page_id, region.id,
                      str([line.id for line in lines]),
                      str([line.id for line in sorted_lines]),
                      lines_text, region_text)
            LOG.debug('Resorting children of page "%s" segment "%s" from %s to %s' +
                      ' does not suffice to turn "%s" into "%s"',
                      page_id, segment.id,
                      str([seg.id for seg in children]),
                      str([seg.id for seg in sorted_children]),
                      concat_text, segment_text)
							
								
								
									
setup.py: 3 changes
@@ -6,7 +6,8 @@ from setuptools import setup, find_packages
setup(
    name='ocrd_repair_inconsistencies',
    description='Repair glyph/word/line order inconsistencies',
    #long_description=codecs.open('README.md', encoding='utf-8').read(),
    long_description=codecs.open('README.md', encoding='utf-8').read(),
    long_description_content_type='text/markdown',
    author='Mike Gerber',
    author_email='mike.gerber@sbb.spk-berlin.de',
    license='Apache License 2.0',