training.generate-gt.pagexml2label: add --missing-printspace

- keep default (fallback to full page), but warn
- new option `skip`
- new option `project`
This commit is contained in:
Robert Sachunsky 2026-02-25 11:16:21 +01:00
parent 7823ea2c95
commit 4202a1b2db
2 changed files with 51 additions and 10 deletions

View file

@ -35,26 +35,28 @@ def main():
@click.option(
"--dir_xml",
"-dx",
help="directory of GT page-xml files",
help="input directory of GT PAGE-XML files",
type=click.Path(exists=True, file_okay=False),
required=True,
)
@click.option(
"--dir_images",
"-di",
help="directory of org images. If print space cropping or scaling is needed for labels it would be great to provide the original images to apply the same function on them. So if -ps is not set true or in config files no columns_width key is given this argumnet can be ignored. File stems in this directory should be the same as those in dir_xml.",
help="input directory of GT image files (only needed for '--printspace' or scaling configured via 'columns_width'; filename stems should match those in --dir_xml)",
type=click.Path(exists=True, file_okay=False),
)
@click.option(
"--dir_out_images",
"-doi",
help="directory where the output org images after undergoing a process (like print space cropping or scaling) will be written.",
help="output directory for training image files (for printspace cropping or scaling)",
type=click.Path(exists=True, file_okay=False),
)
@click.option(
"--dir_out",
"-do",
help="directory where ground truth label images would be written",
help="output directory for training label files",
type=click.Path(exists=True, file_okay=False),
required=True,
)
@click.option(
@ -67,16 +69,25 @@ def main():
@click.option(
"--type_output",
"-to",
help="this defines how output should be. A 2d image array or a 3d image array encoded with RGB color. Just pass 2d or 3d. The file will be saved one directory up. 2D image array is 3d but only information of one channel would be enough since all channels have the same values.",
type=click.Choice(["2d", "3d"]),
default="2d",
help="generate labels as [H, W] array pseudo index-color images for training ('2d') or [H, W, C] array RGB color images for plotting ('3d')",
)
@click.option(
"--printspace",
"-ps",
is_flag=True,
help="if this parameter set to true, generated labels and in the case of provided org images cropping will be imposed and cropped labels and images will be written in output directories.",
help="crop pages from annotated PrintSpace or Border to generate labels and images (will also require -di for so original images so output images are cropped along with labels)",
)
@click.option(
"--missing-printspace",
"-mps",
type=click.Choice(["full", "skip", "project"]),
default="full",
help="if -ps is set, what to do in case a PAGE-XML has no PrintSpace or Border annotation: keep entire page ('full'), ignore file ('skip') or crop artificially from outer hull of all segments ('project')",
)
def pagexml2label(dir_xml,dir_out,type_output,config, printspace, dir_images, dir_out_images):
def pagexml2label(dir_xml, dir_out, type_output, config, printspace, missing_printspace, dir_images, dir_out_images):
"""
extract PAGE-XML GT data suitable for model training for segmentation tasks
"""
@ -86,8 +97,17 @@ def pagexml2label(dir_xml,dir_out,type_output,config, printspace, dir_images, di
else:
print("passed")
config_params = None
gt_list = get_content_of_dir(dir_xml)
get_images_of_ground_truth(gt_list,dir_xml,dir_out,type_output, config, config_params, printspace, dir_images, dir_out_images)
get_images_of_ground_truth(get_content_of_dir(dir_xml),
dir_xml,
dir_out,
type_output,
config,
config_params,
printspace,
missing_printspace,
dir_images,
dir_out_images
)
@main.command()
@click.option(

View file

@ -658,7 +658,18 @@ def get_layout_contours_for_visualization(xml_file):
co_noise.append(np.array(c_t_in))
return co_text, co_graphic, co_sep, co_img, co_table, co_map, co_noise, y_len, x_len
def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_file, config_params, printspace, dir_images, dir_out_images):
def get_images_of_ground_truth(
gt_list,
dir_in,
output_dir,
output_type,
config_file,
config_params,
printspace,
missing_printspace,
dir_images,
dir_out_images
):
"""
Reading the page xml files and write the ground truth images into given output directory.
"""
@ -702,10 +713,20 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_
if printspace or "printspace_as_class_in_layout" in list(config_params.keys()):
ps = (root1.xpath('/pc:PcGts/pc:Page/pc:Border', namespaces=NS) +
root1.xpath('/pc:PcGts/pc:Page/pc:PrintSpace', namespaces=NS))
coords = root1.xpath('//pc:Coords/@points', namespaces=NS)
if len(ps):
points = ps[0].find('pc:Coords', NS).get('points')
ps_bbox = bbox_from_points(points)
elif missing_printspace == 'skip':
print(gt_list[index], "has no Border or PrintSpace - skipping file")
continue
elif missing_printspace == 'project' and len(coords):
print(gt_list[index], "has no Border or PrintSpace - projecting hull of segments")
bboxes = list(map(bbox_from_points, coords))
left, top, right, bottom = zip(*bboxes)
ps_bbox = [min(left), min(top), max(right), max(bottom)]
else:
print(gt_list[index], "has no Border or PrintSpace - using full page")
ps_bbox = [0, 0, None, None]