remove (half-implemented) page_alto functionality

This commit is contained in:
kba 2026-06-11 17:04:56 +02:00
parent 9858221724
commit bed7fe526b
3 changed files with 169 additions and 291 deletions

View file

@@ -56,12 +56,6 @@ from ..utils import is_image_filename
is_flag=True, is_flag=True,
help="if this parameter set to true, vertical textline images will be excluded.", help="if this parameter set to true, vertical textline images will be excluded.",
) )
@click.option(
"--page_alto",
"-alto",
is_flag=True,
help="If this parameter is set to True, text line image cropping and text extraction are performed using PAGE/ALTO files. Otherwise, the default method for PAGE XML files is used.",
)
def linegt_cli( def linegt_cli(
image, image,
dir_in, dir_in,
@@ -70,7 +64,6 @@ def linegt_cli(
pref_of_dataset, pref_of_dataset,
do_not_mask_with_textline_contour, do_not_mask_with_textline_contour,
exclude_vertical_lines, exclude_vertical_lines,
page_alto,
): ):
assert bool(dir_in) ^ bool(image), "Set --dir-in or --image-filename, not both" assert bool(dir_in) ^ bool(image), "Set --dir-in or --image-filename, not both"
if dir_in: if dir_in:
@@ -86,84 +79,6 @@ def linegt_cli(
dir_xml = os.path.join(dir_xmls, file_name + '.xml') dir_xml = os.path.join(dir_xmls, file_name + '.xml')
img = cv2.imread(dir_img) img = cv2.imread(dir_img)
if page_alto:
h, w = img.shape[:2]
tree = ET.parse(dir_xml)
root = tree.getroot()
NS = {'alto': root.tag.split('}')[0].strip('{')}#{"alto": "http://www.loc.gov/standards/alto/ns-v4#"}
results = []
indexer_textlines = 0
for line in root.findall(".//alto:TextLine", NS):
string_el = line.find("alto:String", NS)
textline_text = string_el.attrib["CONTENT"] if string_el is not None else None
polygon_el = line.find("alto:Shape/alto:Polygon", NS)
if polygon_el is None:
continue
points = polygon_el.attrib["POINTS"].split()
coords = [
(int(points[i]), int(points[i + 1]))
for i in range(0, len(points), 2)
]
coords = np.array(coords, dtype=np.int32)
x, y, w, h = cv2.boundingRect(coords)
if exclude_vertical_lines and h > 1.4 * w:
img_crop = None
continue
img_poly_on_img = np.copy(img)
mask_poly = np.zeros(img.shape)
mask_poly = cv2.fillPoly(mask_poly, pts=[coords], color=(1, 1, 1))
mask_poly = mask_poly[y : y + h, x : x + w, :]
img_crop = img_poly_on_img[y : y + h, x : x + w, :]
if not do_not_mask_with_textline_contour:
img_crop[mask_poly == 0] = 255
if img_crop.shape[0] == 0 or img_crop.shape[1] == 0:
img_crop = None
continue
if textline_text and img_crop is not None:
base_name = os.path.join(
dir_out, file_name + '_line_' + str(indexer_textlines)
)
if pref_of_dataset:
base_name += '_' + pref_of_dataset
if not do_not_mask_with_textline_contour:
base_name += '_masked'
with open(base_name + '.txt', 'w') as text_file:
text_file.write(textline_text)
cv2.imwrite(base_name + '.png', img_crop)
indexer_textlines += 1
else:
total_bb_coordinates = [] total_bb_coordinates = []
tree = ET.parse(dir_xml, parser=ET.XMLParser(encoding="utf-8")) tree = ET.parse(dir_xml, parser=ET.XMLParser(encoding="utf-8"))

View file

@@ -73,14 +73,8 @@ def main():
is_flag=True, is_flag=True,
help="if this parameter set to true, generated labels and in the case of provided org images cropping will be imposed and cropped labels and images will be written in output directories.", help="if this parameter set to true, generated labels and in the case of provided org images cropping will be imposed and cropped labels and images will be written in output directories.",
) )
@click.option(
"--page_alto",
"-alto",
is_flag=True,
help="If this parameter is set to True, textline label generation is performed using PAGE/ALTO files. Otherwise, the default method for PAGE XML files is used.",
)
def pagexml2label(dir_xml,dir_out,type_output,config, printspace, dir_images, dir_out_images, page_alto): def pagexml2label(dir_xml,dir_out,type_output,config, printspace, dir_images, dir_out_images):
if config: if config:
with open(config) as f: with open(config) as f:
config_params = json.load(f) config_params = json.load(f)
@@ -88,7 +82,7 @@ def pagexml2label(dir_xml,dir_out,type_output,config, printspace, dir_images, di
print("passed") print("passed")
config_params = None config_params = None
gt_list = get_content_of_dir(dir_xml) gt_list = get_content_of_dir(dir_xml)
get_images_of_ground_truth(gt_list,dir_xml,dir_out,type_output, config, config_params, printspace, dir_images, dir_out_images, page_alto) get_images_of_ground_truth(gt_list,dir_xml,dir_out,type_output, config, config_params, printspace, dir_images, dir_out_images)
@main.command() @main.command()
@click.option( @click.option(

View file

@@ -686,7 +686,7 @@ def get_layout_contours_for_visualization(xml_file):
co_noise.append(np.array(c_t_in)) co_noise.append(np.array(c_t_in))
return co_text, co_graphic, co_sep, co_img, co_table, co_map, co_music, co_noise, y_len, x_len return co_text, co_graphic, co_sep, co_img, co_table, co_map, co_music, co_noise, y_len, x_len
def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_file, config_params, printspace, dir_images, dir_out_images, page_alto=False): def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_file, config_params, printspace, dir_images, dir_out_images):
""" """
Reading the page xml files and write the ground truth images into given output directory. Reading the page xml files and write the ground truth images into given output directory.
""" """
@@ -699,19 +699,6 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_
print(gt_list[index]) print(gt_list[index])
try: try:
if page_alto:
tree = ET.parse(dir_in+'/'+gt_list[index])
root = tree.getroot()
NS = {'alto': root.tag.split('}')[0].strip('{')}#{"alto": "http://www.loc.gov/standards/alto/ns-v4#"}
x_len, y_len = 0, 0
page = root.find('.//alto:Page', NS)
x_len = int( page.get("WIDTH") )
y_len = int( page.get("HEIGHT") )
else:
tree1 = ET.parse(dir_in+'/'+gt_list[index], parser = ET.XMLParser(encoding='utf-8')) tree1 = ET.parse(dir_in+'/'+gt_list[index], parser = ET.XMLParser(encoding='utf-8'))
root1=tree1.getroot() root1=tree1.getroot()
alltags=[elem.tag for elem in root1.iter()] alltags=[elem.tag for elem in root1.iter()]
@@ -797,24 +784,6 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_
textline_rgb_color = (255, 0, 0) textline_rgb_color = (255, 0, 0)
if page_alto:
co_use_case = []
for line in root.findall(".//alto:TextLine", NS):
string_el = line.find("alto:String", NS)
textline_text = string_el.attrib["CONTENT"] if string_el is not None else None
polygon_el = line.find("alto:Shape/alto:Polygon", NS)
if polygon_el is None:
continue
points = polygon_el.attrib["POINTS"].split()
coords = [
(int(points[i]), int(points[i + 1]))
for i in range(0, len(points), 2)
]
co_use_case.append( np.array(coords, dtype=np.int32) )
else:
if config_params['use_case']=='textline': if config_params['use_case']=='textline':
region_tags = np.unique([x for x in alltags if x.endswith('TextLine')]) region_tags = np.unique([x for x in alltags if x.endswith('TextLine')])
elif config_params['use_case']=='word': elif config_params['use_case']=='word':