From b4b0890294d2dc1fbf6ca84794587d5185a7546f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 11 Dec 2024 18:45:18 +0000 Subject: [PATCH] add option to overwrite output xml, but skip by default if file exists --- src/eynollah/cli.py | 9 ++++++++- src/eynollah/eynollah.py | 13 +++++++++++-- src/eynollah/writer.py | 6 +++--- 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/src/eynollah/cli.py b/src/eynollah/cli.py index 5f4b5a4..a9b5765 100644 --- a/src/eynollah/cli.py +++ b/src/eynollah/cli.py @@ -97,6 +97,12 @@ def binarization(patches, model_dir, input_image, output_image, dir_in, dir_out) type=click.Path(exists=True, file_okay=False), required=True, ) +@click.option( + "--overwrite", + "-O", + help="overwrite (instead of skipping) if output xml exists", + is_flag=True, +) @click.option( "--dir_in", "-di", @@ -253,7 +259,7 @@ def binarization(patches, model_dir, input_image, output_image, dir_in, dir_out) help="Override log level globally to this", ) -def layout(image, out, dir_in, model, save_images, save_layout, save_deskewed, save_all, extract_only_images, save_page, enable_plotting, allow_enhancement, curved_line, textline_light, full_layout, tables, right2left, input_binary, allow_scaling, headers_off, light_version, reading_order_machine_based, do_ocr, num_col_upper, num_col_lower, skip_layout_and_reading_order, ignore_page_extraction, log_level): +def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_deskewed, save_all, extract_only_images, save_page, enable_plotting, allow_enhancement, curved_line, textline_light, full_layout, tables, right2left, input_binary, allow_scaling, headers_off, light_version, reading_order_machine_based, do_ocr, num_col_upper, num_col_lower, skip_layout_and_reading_order, ignore_page_extraction, log_level): initLogging() if log_level: getLogger('eynollah').setLevel(getLevelName(log_level)) @@ -273,6 +279,7 @@ def layout(image, out, dir_in, model, save_images, save_layout, save_deskewed, s sys.exit(1) eynollah = Eynollah( image_filename=image, + overwrite=overwrite, dir_out=out, dir_in=dir_in, dir_models=model, diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 8b8808c..8883f19 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -165,6 +165,7 @@ class Eynollah: image_filename=None, image_pil=None, image_filename_stem=None, + overwrite=False, dir_out=None, dir_in=None, dir_of_cropped_images=None, @@ -203,6 +204,7 @@ class Eynollah: if override_dpi: self.dpi = override_dpi self.image_filename = image_filename + self.overwrite = overwrite self.dir_out = dir_out self.dir_in = dir_in self.dir_of_all = dir_of_all @@ -360,6 +362,7 @@ class Eynollah: curved_line=self.curved_line, textline_light = self.textline_light, pcgts=self.pcgts) + def imread(self, grayscale=False, uint8=True): key = 'img' if grayscale: @@ -4460,8 +4463,14 @@ class Eynollah: if self.dir_in: self.reset_file_name_dir(os.path.join(self.dir_in,img_name)) #print("text region early -11 in %.1fs", time.time() - t0) - - + + if os.path.exists(self.writer.output_filename): + if self.overwrite: + self.logger.warning("will overwrite existing output file '%s'", self.writer.output_filename) + else: + self.logger.warning("will skip input for existing output file '%s'", self.writer.output_filename) + continue + if self.extract_only_images: img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(self.light_version) self.logger.info("Enhancing took %.1fs ", time.time() - t0) diff --git a/src/eynollah/writer.py b/src/eynollah/writer.py index 5f282f2..dc5a5dc 100644 --- a/src/eynollah/writer.py +++ b/src/eynollah/writer.py @@ -28,6 +28,7 @@ class EynollahXmlWriter(): self.counter = EynollahIdCounter() self.dir_out = dir_out self.image_filename = image_filename + self.output_filename = os.path.join(self.dir_out, self.image_filename_stem) + ".xml" self.curved_line = curved_line self.textline_light = textline_light self.pcgts = pcgts @@ -163,9 +164,8 @@ class EynollahXmlWriter(): coords.set_points(points_co[:-1]) def write_pagexml(self, pcgts): - out_fname = os.path.join(self.dir_out, self.image_filename_stem) + ".xml" - self.logger.info("output filename: '%s'", out_fname) - with open(out_fname, 'w') as f: + self.logger.info("output filename: '%s'", self.output_filename) + with open(self.output_filename, 'w') as f: f.write(to_xml(pcgts)) def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_to_be_written_in_xml, found_polygons_tables, ocr_all_textlines):