Add an option (`--overwrite` / `-O`) to overwrite an existing output XML file; by default, an input whose output file already exists is skipped.

pull/142/head
Robert Sachunsky 2 weeks ago
parent b9ca7a6191
commit b4b0890294

@ -97,6 +97,12 @@ def binarization(patches, model_dir, input_image, output_image, dir_in, dir_out)
type=click.Path(exists=True, file_okay=False), type=click.Path(exists=True, file_okay=False),
required=True, required=True,
) )
@click.option(
"--overwrite",
"-O",
help="overwrite (instead of skipping) if output xml exists",
is_flag=True,
)
@click.option( @click.option(
"--dir_in", "--dir_in",
"-di", "-di",
@ -253,7 +259,7 @@ def binarization(patches, model_dir, input_image, output_image, dir_in, dir_out)
help="Override log level globally to this", help="Override log level globally to this",
) )
def layout(image, out, dir_in, model, save_images, save_layout, save_deskewed, save_all, extract_only_images, save_page, enable_plotting, allow_enhancement, curved_line, textline_light, full_layout, tables, right2left, input_binary, allow_scaling, headers_off, light_version, reading_order_machine_based, do_ocr, num_col_upper, num_col_lower, skip_layout_and_reading_order, ignore_page_extraction, log_level): def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_deskewed, save_all, extract_only_images, save_page, enable_plotting, allow_enhancement, curved_line, textline_light, full_layout, tables, right2left, input_binary, allow_scaling, headers_off, light_version, reading_order_machine_based, do_ocr, num_col_upper, num_col_lower, skip_layout_and_reading_order, ignore_page_extraction, log_level):
initLogging() initLogging()
if log_level: if log_level:
getLogger('eynollah').setLevel(getLevelName(log_level)) getLogger('eynollah').setLevel(getLevelName(log_level))
@ -273,6 +279,7 @@ def layout(image, out, dir_in, model, save_images, save_layout, save_deskewed, s
sys.exit(1) sys.exit(1)
eynollah = Eynollah( eynollah = Eynollah(
image_filename=image, image_filename=image,
overwrite=overwrite,
dir_out=out, dir_out=out,
dir_in=dir_in, dir_in=dir_in,
dir_models=model, dir_models=model,

@ -165,6 +165,7 @@ class Eynollah:
image_filename=None, image_filename=None,
image_pil=None, image_pil=None,
image_filename_stem=None, image_filename_stem=None,
overwrite=False,
dir_out=None, dir_out=None,
dir_in=None, dir_in=None,
dir_of_cropped_images=None, dir_of_cropped_images=None,
@ -203,6 +204,7 @@ class Eynollah:
if override_dpi: if override_dpi:
self.dpi = override_dpi self.dpi = override_dpi
self.image_filename = image_filename self.image_filename = image_filename
self.overwrite = overwrite
self.dir_out = dir_out self.dir_out = dir_out
self.dir_in = dir_in self.dir_in = dir_in
self.dir_of_all = dir_of_all self.dir_of_all = dir_of_all
@ -360,6 +362,7 @@ class Eynollah:
curved_line=self.curved_line, curved_line=self.curved_line,
textline_light = self.textline_light, textline_light = self.textline_light,
pcgts=self.pcgts) pcgts=self.pcgts)
def imread(self, grayscale=False, uint8=True): def imread(self, grayscale=False, uint8=True):
key = 'img' key = 'img'
if grayscale: if grayscale:
@ -4460,8 +4463,14 @@ class Eynollah:
if self.dir_in: if self.dir_in:
self.reset_file_name_dir(os.path.join(self.dir_in,img_name)) self.reset_file_name_dir(os.path.join(self.dir_in,img_name))
#print("text region early -11 in %.1fs", time.time() - t0) #print("text region early -11 in %.1fs", time.time() - t0)
if os.path.exists(self.writer.output_filename):
if self.overwrite:
self.logger.warning("will overwrite existing output file '%s'", self.writer.output_filename)
else:
self.logger.warning("will skip input for existing output file '%s'", self.writer.output_filename)
continue
if self.extract_only_images: if self.extract_only_images:
img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(self.light_version) img_res, is_image_enhanced, num_col_classifier, num_column_is_classified = self.run_enhancement(self.light_version)
self.logger.info("Enhancing took %.1fs ", time.time() - t0) self.logger.info("Enhancing took %.1fs ", time.time() - t0)

@ -28,6 +28,7 @@ class EynollahXmlWriter():
self.counter = EynollahIdCounter() self.counter = EynollahIdCounter()
self.dir_out = dir_out self.dir_out = dir_out
self.image_filename = image_filename self.image_filename = image_filename
self.output_filename = os.path.join(self.dir_out, self.image_filename_stem) + ".xml"
self.curved_line = curved_line self.curved_line = curved_line
self.textline_light = textline_light self.textline_light = textline_light
self.pcgts = pcgts self.pcgts = pcgts
@ -163,9 +164,8 @@ class EynollahXmlWriter():
coords.set_points(points_co[:-1]) coords.set_points(points_co[:-1])
def write_pagexml(self, pcgts): def write_pagexml(self, pcgts):
out_fname = os.path.join(self.dir_out, self.image_filename_stem) + ".xml" self.logger.info("output filename: '%s'", self.output_filename)
self.logger.info("output filename: '%s'", out_fname) with open(self.output_filename, 'w') as f:
with open(out_fname, 'w') as f:
f.write(to_xml(pcgts)) f.write(to_xml(pcgts))
def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_to_be_written_in_xml, found_polygons_tables, ocr_all_textlines): def build_pagexml_no_full_layout(self, found_polygons_text_region, page_coord, order_of_texts, id_of_texts, all_found_textline_polygons, all_box_coord, found_polygons_text_region_img, found_polygons_marginals, all_found_textline_polygons_marginals, all_box_coord_marginals, slopes, slopes_marginals, cont_page, polygons_lines_to_be_written_in_xml, found_polygons_tables, ocr_all_textlines):

Loading…
Cancel
Save