cli: produce TSV if no words are transcribed

2026-02-23 09:52:15 +01:00 · 2022-02-21 17:00:03 +01:00 · 2022-02-21 17:00:03 +01:00 · fe0c355e5a
commit fe0c355e5a
parent 93ee53c8e2
1 changed file with 10 additions and 6 deletions
--- a/tsvtools/cli.py
+++ b/tsvtools/cli.py
@@ -101,12 +101,16 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint,
                                text_equiv.get_Unicode(), len(urls), left, right, top, bottom, text_line.id))
            else:
                for word in words:
-                    for text_equiv in word.get_TextEquiv():
+                    # XXX TODO make this configurable
-                        # transform OCR coordinates using `scale_factor` to derive
+                    textequiv = ''
-                        # correct coordinates for the web presentation image
+                    list_textequivs = word.get_TextEquiv()
-                        left, top, right, bottom = [int(scale_factor * x) for x in bbox_from_points(word.get_Coords().points)]
+                    if list_textequivs:
-                        tsv.append((region_idx, len(line_info) - 1, left + (right - left) / 2.0,
+                        textequiv = list_textequivs[0].get_Unicode()
-                                    text_equiv.get_Unicode(), len(urls), left, right, top, bottom, text_line.id))
+                    # transform OCR coordinates using `scale_factor` to derive
+                    # correct coordinates for the web presentation image
+                    left, top, right, bottom = [int(scale_factor * x) for x in bbox_from_points(word.get_Coords().points)]
+                    tsv.append((region_idx, len(line_info) - 1, left + (right - left) / 2.0,
+                                textequiv, len(urls), left, right, top, bottom, text_line.id))
    line_info = pd.DataFrame(line_info, columns=['url_id', 'left', 'right', 'top', 'bottom', 'conf', 'line_id'])