mirror of
https://github.com/qurator-spk/ocrd_repair_inconsistencies.git
synced 2025-06-09 03:40:06 +02:00
🎨 Extract methods to fix lines + words
This commit is contained in:
parent
8d21cd8ab9
commit
c7033a5d4d
1 changed file with 33 additions and 26 deletions
|
@@ -42,20 +42,9 @@ class RepairInconsistencies(Processor):
|
|||
kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL]
|
||||
super(RepairInconsistencies, self).__init__(*args, **kwargs)
|
||||
|
||||
def process(self):
|
||||
for (n, input_file) in enumerate(self.input_files):
|
||||
page_id = input_file.pageId or input_file.ID
|
||||
LOG.info("INPUT FILE %i / %s", n, page_id)
|
||||
pcgts = page_from_file(self.workspace.download_file(input_file))
|
||||
page = pcgts.get_Page()
|
||||
def _fix_words(self, line):
|
||||
"""Fix word order in a line"""
|
||||
|
||||
regions = page.get_TextRegion()
|
||||
for region in regions:
|
||||
|
||||
lines = region.get_TextLine()
|
||||
for line in lines:
|
||||
|
||||
# Fix words in lines
|
||||
words = line.get_Word()
|
||||
line_text = get_text(line)
|
||||
words_text = get_text(words, ' ')
|
||||
|
@@ -68,22 +57,40 @@ class RepairInconsistencies(Processor):
|
|||
LOG.info('Fixing word order of line "%s"', line.id)
|
||||
line.set_Word(sorted_words)
|
||||
|
||||
words = line.get_Word()
|
||||
for word in words:
|
||||
def _fix_glyphs(self, word):
|
||||
"""Fix glyph order in a word"""
|
||||
|
||||
# Fix glyphs in words
|
||||
glyphs = word.get_Glyph()
|
||||
word_text = get_text(word)
|
||||
glyphs_text = get_text(glyphs, '')
|
||||
if word_text != glyphs_text:
|
||||
# XXX Assumes left-to-right
|
||||
sorted_glyphs = sorted(glyphs, key=lambda g: Polygon( polygon_from_points(g.get_Coords().points)).centroid.x)
|
||||
sorted_glyphs = sorted(glyphs, key=lambda g: Polygon(polygon_from_points(g.get_Coords().points)).centroid.x)
|
||||
sorted_glyphs_text = get_text(sorted_glyphs, '')
|
||||
|
||||
if sorted_glyphs_text == word_text:
|
||||
LOG.info('Fixing glyph order of word "%s"', word.id)
|
||||
word.set_Glyph(sorted_glyphs)
|
||||
|
||||
def process(self):
|
||||
for (n, input_file) in enumerate(self.input_files):
|
||||
page_id = input_file.pageId or input_file.ID
|
||||
LOG.info("INPUT FILE %i / %s", n, page_id)
|
||||
pcgts = page_from_file(self.workspace.download_file(input_file))
|
||||
page = pcgts.get_Page()
|
||||
|
||||
|
||||
regions = page.get_TextRegion()
|
||||
for region in regions:
|
||||
|
||||
lines = region.get_TextLine()
|
||||
for line in lines:
|
||||
self._fix_words(line)
|
||||
|
||||
words = line.get_Word()
|
||||
for word in words:
|
||||
self._fix_glyphs(word)
|
||||
|
||||
file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
|
||||
if file_id == input_file.ID:
|
||||
file_id = concat_padded(self.output_file_grp, n)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue