remove (half-implemented) page_alto functionality

This commit is contained in:
kba 2026-06-11 17:04:56 +02:00
parent 9858221724
commit bed7fe526b
3 changed files with 169 additions and 291 deletions

View file

@ -56,12 +56,6 @@ from ..utils import is_image_filename
is_flag=True, is_flag=True,
help="if this parameter set to true, vertical textline images will be excluded.", help="if this parameter set to true, vertical textline images will be excluded.",
) )
@click.option(
"--page_alto",
"-alto",
is_flag=True,
help="If this parameter is set to True, text line image cropping and text extraction are performed using PAGE/ALTO files. Otherwise, the default method for PAGE XML files is used.",
)
def linegt_cli( def linegt_cli(
image, image,
dir_in, dir_in,
@ -70,7 +64,6 @@ def linegt_cli(
pref_of_dataset, pref_of_dataset,
do_not_mask_with_textline_contour, do_not_mask_with_textline_contour,
exclude_vertical_lines, exclude_vertical_lines,
page_alto,
): ):
assert bool(dir_in) ^ bool(image), "Set --dir-in or --image-filename, not both" assert bool(dir_in) ^ bool(image), "Set --dir-in or --image-filename, not both"
if dir_in: if dir_in:
@ -86,147 +79,69 @@ def linegt_cli(
dir_xml = os.path.join(dir_xmls, file_name + '.xml') dir_xml = os.path.join(dir_xmls, file_name + '.xml')
img = cv2.imread(dir_img) img = cv2.imread(dir_img)
if page_alto: total_bb_coordinates = []
h, w = img.shape[:2]
tree = ET.parse(dir_xml) tree = ET.parse(dir_xml, parser=ET.XMLParser(encoding="utf-8"))
root = tree.getroot() root = tree.getroot()
alltags = [elem.tag for elem in root.iter()]
NS = {'alto': root.tag.split('}')[0].strip('{')}#{"alto": "http://www.loc.gov/standards/alto/ns-v4#"} name_space = alltags[0].split('}')[0]
name_space = name_space.split('{')[1]
results = [] region_tags = np.unique([x for x in alltags if x.endswith('TextRegion')])
indexer_textlines = 0 cropped_lines_region_indexer = []
for line in root.findall(".//alto:TextLine", NS):
string_el = line.find("alto:String", NS)
textline_text = string_el.attrib["CONTENT"] if string_el is not None else None
polygon_el = line.find("alto:Shape/alto:Polygon", NS) indexer_text_region = 0
if polygon_el is None: indexer_textlines = 0
continue # FIXME: non recursive, use OCR-D PAGE generateDS API. Or use an existing tool for this purpose altogether
for nn in root.iter(region_tags):
for child_textregion in nn:
if child_textregion.tag.endswith("TextLine"):
for child_textlines in child_textregion:
if child_textlines.tag.endswith("Coords"):
cropped_lines_region_indexer.append(indexer_text_region)
p_h = child_textlines.attrib['points'].split(' ')
textline_coords = np.array([[int(x.split(',')[0]), int(x.split(',')[1])] for x in p_h])
points = polygon_el.attrib["POINTS"].split() x, y, w, h = cv2.boundingRect(textline_coords)
coords = [
(int(points[i]), int(points[i + 1]))
for i in range(0, len(points), 2)
]
coords = np.array(coords, dtype=np.int32) if exclude_vertical_lines and h > 1.4 * w:
x, y, w, h = cv2.boundingRect(coords) img_crop = None
continue
total_bb_coordinates.append([x, y, w, h])
img_poly_on_img = np.copy(img)
mask_poly = np.zeros(img.shape)
mask_poly = cv2.fillPoly(mask_poly, pts=[textline_coords], color=(1, 1, 1))
mask_poly = mask_poly[y : y + h, x : x + w, :]
img_crop = img_poly_on_img[y : y + h, x : x + w, :]
if not do_not_mask_with_textline_contour:
img_crop[mask_poly == 0] = 255
if img_crop.shape[0] == 0 or img_crop.shape[1] == 0:
img_crop = None
continue
if exclude_vertical_lines and h > 1.4 * w: if child_textlines.tag.endswith("TextEquiv"):
img_crop = None for cheild_text in child_textlines:
continue if cheild_text.tag.endswith("Unicode"):
textline_text = cheild_text.text
if textline_text and img_crop is not None:
base_name = os.path.join(
dir_out, file_name + '_line_' + str(indexer_textlines)
)
if pref_of_dataset:
base_name += '_' + pref_of_dataset
if not do_not_mask_with_textline_contour:
base_name += '_masked'
img_poly_on_img = np.copy(img) with open(base_name + '.txt', 'w') as text_file:
text_file.write(textline_text)
mask_poly = np.zeros(img.shape) cv2.imwrite(base_name + '.png', img_crop)
mask_poly = cv2.fillPoly(mask_poly, pts=[coords], color=(1, 1, 1)) indexer_textlines += 1
mask_poly = mask_poly[y : y + h, x : x + w, :]
img_crop = img_poly_on_img[y : y + h, x : x + w, :]
if not do_not_mask_with_textline_contour:
img_crop[mask_poly == 0] = 255
if img_crop.shape[0] == 0 or img_crop.shape[1] == 0:
img_crop = None
continue
if textline_text and img_crop is not None:
base_name = os.path.join(
dir_out, file_name + '_line_' + str(indexer_textlines)
)
if pref_of_dataset:
base_name += '_' + pref_of_dataset
if not do_not_mask_with_textline_contour:
base_name += '_masked'
with open(base_name + '.txt', 'w') as text_file:
text_file.write(textline_text)
cv2.imwrite(base_name + '.png', img_crop)
indexer_textlines += 1
else:
total_bb_coordinates = []
tree = ET.parse(dir_xml, parser=ET.XMLParser(encoding="utf-8"))
root = tree.getroot()
alltags = [elem.tag for elem in root.iter()]
name_space = alltags[0].split('}')[0]
name_space = name_space.split('{')[1]
region_tags = np.unique([x for x in alltags if x.endswith('TextRegion')])
cropped_lines_region_indexer = []
indexer_text_region = 0
indexer_textlines = 0
# FIXME: non recursive, use OCR-D PAGE generateDS API. Or use an existing tool for this purpose altogether
for nn in root.iter(region_tags):
for child_textregion in nn:
if child_textregion.tag.endswith("TextLine"):
for child_textlines in child_textregion:
if child_textlines.tag.endswith("Coords"):
cropped_lines_region_indexer.append(indexer_text_region)
p_h = child_textlines.attrib['points'].split(' ')
textline_coords = np.array([[int(x.split(',')[0]), int(x.split(',')[1])] for x in p_h])
x, y, w, h = cv2.boundingRect(textline_coords)
if exclude_vertical_lines and h > 1.4 * w:
img_crop = None
continue
total_bb_coordinates.append([x, y, w, h])
img_poly_on_img = np.copy(img)
mask_poly = np.zeros(img.shape)
mask_poly = cv2.fillPoly(mask_poly, pts=[textline_coords], color=(1, 1, 1))
mask_poly = mask_poly[y : y + h, x : x + w, :]
img_crop = img_poly_on_img[y : y + h, x : x + w, :]
if not do_not_mask_with_textline_contour:
img_crop[mask_poly == 0] = 255
if img_crop.shape[0] == 0 or img_crop.shape[1] == 0:
img_crop = None
continue
if child_textlines.tag.endswith("TextEquiv"):
for cheild_text in child_textlines:
if cheild_text.tag.endswith("Unicode"):
textline_text = cheild_text.text
if textline_text and img_crop is not None:
base_name = os.path.join(
dir_out, file_name + '_line_' + str(indexer_textlines)
)
if pref_of_dataset:
base_name += '_' + pref_of_dataset
if not do_not_mask_with_textline_contour:
base_name += '_masked'
with open(base_name + '.txt', 'w') as text_file:
text_file.write(textline_text)
cv2.imwrite(base_name + '.png', img_crop)
indexer_textlines += 1

View file

@ -73,14 +73,8 @@ def main():
is_flag=True, is_flag=True,
help="if this parameter set to true, generated labels and in the case of provided org images cropping will be imposed and cropped labels and images will be written in output directories.", help="if this parameter set to true, generated labels and in the case of provided org images cropping will be imposed and cropped labels and images will be written in output directories.",
) )
@click.option(
"--page_alto",
"-alto",
is_flag=True,
help="If this parameter is set to True, textline label generation is performed using PAGE/ALTO files. Otherwise, the default method for PAGE XML files is used.",
)
def pagexml2label(dir_xml,dir_out,type_output,config, printspace, dir_images, dir_out_images, page_alto): def pagexml2label(dir_xml,dir_out,type_output,config, printspace, dir_images, dir_out_images):
if config: if config:
with open(config) as f: with open(config) as f:
config_params = json.load(f) config_params = json.load(f)
@ -88,7 +82,7 @@ def pagexml2label(dir_xml,dir_out,type_output,config, printspace, dir_images, di
print("passed") print("passed")
config_params = None config_params = None
gt_list = get_content_of_dir(dir_xml) gt_list = get_content_of_dir(dir_xml)
get_images_of_ground_truth(gt_list,dir_xml,dir_out,type_output, config, config_params, printspace, dir_images, dir_out_images, page_alto) get_images_of_ground_truth(gt_list,dir_xml,dir_out,type_output, config, config_params, printspace, dir_images, dir_out_images)
@main.command() @main.command()
@click.option( @click.option(

View file

@ -686,7 +686,7 @@ def get_layout_contours_for_visualization(xml_file):
co_noise.append(np.array(c_t_in)) co_noise.append(np.array(c_t_in))
return co_text, co_graphic, co_sep, co_img, co_table, co_map, co_music, co_noise, y_len, x_len return co_text, co_graphic, co_sep, co_img, co_table, co_map, co_music, co_noise, y_len, x_len
def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_file, config_params, printspace, dir_images, dir_out_images, page_alto=False): def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_file, config_params, printspace, dir_images, dir_out_images):
""" """
Reading the page xml files and write the ground truth images into given output directory. Reading the page xml files and write the ground truth images into given output directory.
""" """
@ -699,94 +699,81 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_
print(gt_list[index]) print(gt_list[index])
try: try:
if page_alto: tree1 = ET.parse(dir_in+'/'+gt_list[index], parser = ET.XMLParser(encoding='utf-8'))
tree = ET.parse(dir_in+'/'+gt_list[index]) root1=tree1.getroot()
root = tree.getroot() alltags=[elem.tag for elem in root1.iter()]
link=alltags[0].split('}')[0]+'}'
NS = {'alto': root.tag.split('}')[0].strip('{')}#{"alto": "http://www.loc.gov/standards/alto/ns-v4#"}
x_len, y_len = 0, 0
page = root.find('.//alto:Page', NS)
x_len = int( page.get("WIDTH") )
y_len = int( page.get("HEIGHT") )
else:
tree1 = ET.parse(dir_in+'/'+gt_list[index], parser = ET.XMLParser(encoding='utf-8'))
root1=tree1.getroot()
alltags=[elem.tag for elem in root1.iter()]
link=alltags[0].split('}')[0]+'}'
x_len, y_len = 0, 0 x_len, y_len = 0, 0
for jj in root1.iter(link+'Page'): for jj in root1.iter(link+'Page'):
y_len=int(jj.attrib['imageHeight']) y_len=int(jj.attrib['imageHeight'])
x_len=int(jj.attrib['imageWidth']) x_len=int(jj.attrib['imageWidth'])
if 'columns_width' in list(config_params.keys()): if 'columns_width' in list(config_params.keys()):
columns_width_dict = config_params['columns_width'] columns_width_dict = config_params['columns_width']
metadata_element = root1.find(link+'Metadata') metadata_element = root1.find(link+'Metadata')
num_col = None num_col = None
for child in metadata_element: for child in metadata_element:
tag2 = child.tag tag2 = child.tag
if tag2.endswith('}Comments') or tag2.endswith('}comments'): if tag2.endswith('}Comments') or tag2.endswith('}comments'):
text_comments = child.text text_comments = child.text
num_col = int(text_comments.split('num_col')[1]) num_col = int(text_comments.split('num_col')[1])
if num_col: if num_col:
x_new = columns_width_dict[str(num_col)] x_new = columns_width_dict[str(num_col)]
y_new = int ( x_new * (y_len / float(x_len)) ) y_new = int ( x_new * (y_len / float(x_len)) )
if printspace or "printspace_as_class_in_layout" in list(config_params.keys()): if printspace or "printspace_as_class_in_layout" in list(config_params.keys()):
region_tags = np.unique([x for x in alltags if x.endswith('PrintSpace') or x.endswith('Border')]) region_tags = np.unique([x for x in alltags if x.endswith('PrintSpace') or x.endswith('Border')])
co_use_case = [] co_use_case = []
for tag in region_tags: for tag in region_tags:
tag_endings = ['}PrintSpace','}Border'] tag_endings = ['}PrintSpace','}Border']
if tag.endswith(tag_endings[0]) or tag.endswith(tag_endings[1]): if tag.endswith(tag_endings[0]) or tag.endswith(tag_endings[1]):
for nn in root1.iter(tag): for nn in root1.iter(tag):
c_t_in = [] c_t_in = []
sumi = 0 sumi = 0
for vv in nn.iter(): for vv in nn.iter():
# check the format of coords # check the format of coords
if vv.tag == link + 'Coords': if vv.tag == link + 'Coords':
coords = bool(vv.attrib) coords = bool(vv.attrib)
if coords: if coords:
p_h = vv.attrib['points'].split(' ') p_h = vv.attrib['points'].split(' ')
c_t_in.append( c_t_in.append(
np.array([[int(x.split(',')[0]), int(x.split(',')[1])] for x in p_h])) np.array([[int(x.split(',')[0]), int(x.split(',')[1])] for x in p_h]))
break
else:
pass
if vv.tag == link + 'Point':
c_t_in.append([int(float(vv.attrib['x'])), int(float(vv.attrib['y']))])
sumi += 1
elif vv.tag != link + 'Point' and sumi >= 1:
break break
co_use_case.append(np.array(c_t_in)) else:
pass
img = np.zeros((y_len, x_len, 3)) if vv.tag == link + 'Point':
c_t_in.append([int(float(vv.attrib['x'])), int(float(vv.attrib['y']))])
sumi += 1
elif vv.tag != link + 'Point' and sumi >= 1:
break
co_use_case.append(np.array(c_t_in))
img_poly = cv2.fillPoly(img, pts=co_use_case, color=(1, 1, 1)) img = np.zeros((y_len, x_len, 3))
img_poly = img_poly.astype(np.uint8) img_poly = cv2.fillPoly(img, pts=co_use_case, color=(1, 1, 1))
imgray = cv2.cvtColor(img_poly, cv2.COLOR_BGR2GRAY) img_poly = img_poly.astype(np.uint8)
_, thresh = cv2.threshold(imgray, 0, 255, 0)
contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) imgray = cv2.cvtColor(img_poly, cv2.COLOR_BGR2GRAY)
_, thresh = cv2.threshold(imgray, 0, 255, 0)
cnt_size = np.array([cv2.contourArea(contours[j]) for j in range(len(contours))]) contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
try: cnt_size = np.array([cv2.contourArea(contours[j]) for j in range(len(contours))])
cnt = contours[np.argmax(cnt_size)]
x, y, w, h = cv2.boundingRect(cnt)
except:
x, y , w, h = 0, 0, x_len, y_len
bb_xywh = [x, y, w, h] try:
cnt = contours[np.argmax(cnt_size)]
x, y, w, h = cv2.boundingRect(cnt)
except:
x, y , w, h = 0, 0, x_len, y_len
bb_xywh = [x, y, w, h]
if config_file and (config_params['use_case']=='textline' or config_params['use_case']=='word' or config_params['use_case']=='glyph' or config_params['use_case']=='printspace'): if config_file and (config_params['use_case']=='textline' or config_params['use_case']=='word' or config_params['use_case']=='glyph' or config_params['use_case']=='printspace'):
@ -797,67 +784,49 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_
textline_rgb_color = (255, 0, 0) textline_rgb_color = (255, 0, 0)
if page_alto: if config_params['use_case']=='textline':
co_use_case = [] region_tags = np.unique([x for x in alltags if x.endswith('TextLine')])
for line in root.findall(".//alto:TextLine", NS): elif config_params['use_case']=='word':
string_el = line.find("alto:String", NS) region_tags = np.unique([x for x in alltags if x.endswith('Word')])
textline_text = string_el.attrib["CONTENT"] if string_el is not None else None elif config_params['use_case']=='glyph':
region_tags = np.unique([x for x in alltags if x.endswith('Glyph')])
elif config_params['use_case']=='printspace':
region_tags = np.unique([x for x in alltags if x.endswith('PrintSpace')])
polygon_el = line.find("alto:Shape/alto:Polygon", NS) co_use_case = []
if polygon_el is None:
continue
points = polygon_el.attrib["POINTS"].split() for tag in region_tags:
coords = [
(int(points[i]), int(points[i + 1]))
for i in range(0, len(points), 2)
]
co_use_case.append( np.array(coords, dtype=np.int32) )
else:
if config_params['use_case']=='textline': if config_params['use_case']=='textline':
region_tags = np.unique([x for x in alltags if x.endswith('TextLine')]) tag_endings = ['}TextLine','}textline']
elif config_params['use_case']=='word': elif config_params['use_case']=='word':
region_tags = np.unique([x for x in alltags if x.endswith('Word')]) tag_endings = ['}Word','}word']
elif config_params['use_case']=='glyph': elif config_params['use_case']=='glyph':
region_tags = np.unique([x for x in alltags if x.endswith('Glyph')]) tag_endings = ['}Glyph','}glyph']
elif config_params['use_case']=='printspace': elif config_params['use_case']=='printspace':
region_tags = np.unique([x for x in alltags if x.endswith('PrintSpace')]) tag_endings = ['}PrintSpace','}printspace']
co_use_case = [] if tag.endswith(tag_endings[0]) or tag.endswith(tag_endings[1]):
for nn in root1.iter(tag):
for tag in region_tags: c_t_in = []
if config_params['use_case']=='textline': sumi = 0
tag_endings = ['}TextLine','}textline'] for vv in nn.iter():
elif config_params['use_case']=='word': # check the format of coords
tag_endings = ['}Word','}word'] if vv.tag == link + 'Coords':
elif config_params['use_case']=='glyph': coords = bool(vv.attrib)
tag_endings = ['}Glyph','}glyph'] if coords:
elif config_params['use_case']=='printspace': p_h = vv.attrib['points'].split(' ')
tag_endings = ['}PrintSpace','}printspace'] c_t_in.append(
np.array([[int(x.split(',')[0]), int(x.split(',')[1])] for x in p_h]))
if tag.endswith(tag_endings[0]) or tag.endswith(tag_endings[1]):
for nn in root1.iter(tag):
c_t_in = []
sumi = 0
for vv in nn.iter():
# check the format of coords
if vv.tag == link + 'Coords':
coords = bool(vv.attrib)
if coords:
p_h = vv.attrib['points'].split(' ')
c_t_in.append(
np.array([[int(x.split(',')[0]), int(x.split(',')[1])] for x in p_h]))
break
else:
pass
if vv.tag == link + 'Point':
c_t_in.append([int(float(vv.attrib['x'])), int(float(vv.attrib['y']))])
sumi += 1
elif vv.tag != link + 'Point' and sumi >= 1:
break break
co_use_case.append(np.array(c_t_in)) else:
pass
if vv.tag == link + 'Point':
c_t_in.append([int(float(vv.attrib['x'])), int(float(vv.attrib['y']))])
sumi += 1
elif vv.tag != link + 'Point' and sumi >= 1:
break
co_use_case.append(np.array(c_t_in))
if "artificial_class_label" in keys: if "artificial_class_label" in keys: