mirror of
https://github.com/qurator-spk/eynollah.git
synced 2026-03-13 02:31:56 +01:00
page alto label generation activated for textline
This commit is contained in:
parent
4b80e45d91
commit
f1d8257496
3 changed files with 211 additions and 171 deletions
|
|
@ -92,7 +92,7 @@ def linegt_cli(
|
||||||
tree = ET.parse(dir_xml)
|
tree = ET.parse(dir_xml)
|
||||||
root = tree.getroot()
|
root = tree.getroot()
|
||||||
|
|
||||||
NS = {"alto": "http://www.loc.gov/standards/alto/ns-v4#"}
|
NS = {'alto': root.tag.split('}')[0].strip('{')}#{"alto": "http://www.loc.gov/standards/alto/ns-v4#"}
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -73,8 +73,14 @@ def main():
|
||||||
is_flag=True,
|
is_flag=True,
|
||||||
help="if this parameter set to true, generated labels and in the case of provided org images cropping will be imposed and cropped labels and images will be written in output directories.",
|
help="if this parameter set to true, generated labels and in the case of provided org images cropping will be imposed and cropped labels and images will be written in output directories.",
|
||||||
)
|
)
|
||||||
|
@click.option(
|
||||||
|
"--page_alto",
|
||||||
|
"-alto",
|
||||||
|
is_flag=True,
|
||||||
|
help="If this parameter is set to True, textline label generation is performed using PAGE/ALTO files. Otherwise, the default method for PAGE XML files is used.",
|
||||||
|
)
|
||||||
|
|
||||||
def pagexml2label(dir_xml,dir_out,type_output,config, printspace, dir_images, dir_out_images):
|
def pagexml2label(dir_xml,dir_out,type_output,config, printspace, dir_images, dir_out_images, page_alto):
|
||||||
if config:
|
if config:
|
||||||
with open(config) as f:
|
with open(config) as f:
|
||||||
config_params = json.load(f)
|
config_params = json.load(f)
|
||||||
|
|
@ -82,7 +88,7 @@ def pagexml2label(dir_xml,dir_out,type_output,config, printspace, dir_images, di
|
||||||
print("passed")
|
print("passed")
|
||||||
config_params = None
|
config_params = None
|
||||||
gt_list = get_content_of_dir(dir_xml)
|
gt_list = get_content_of_dir(dir_xml)
|
||||||
get_images_of_ground_truth(gt_list,dir_xml,dir_out,type_output, config, config_params, printspace, dir_images, dir_out_images)
|
get_images_of_ground_truth(gt_list,dir_xml,dir_out,type_output, config, config_params, printspace, dir_images, dir_out_images, page_alto)
|
||||||
|
|
||||||
@main.command()
|
@main.command()
|
||||||
@click.option(
|
@click.option(
|
||||||
|
|
|
||||||
|
|
@ -686,7 +686,7 @@ def get_layout_contours_for_visualization(xml_file):
|
||||||
co_noise.append(np.array(c_t_in))
|
co_noise.append(np.array(c_t_in))
|
||||||
return co_text, co_graphic, co_sep, co_img, co_table, co_map, co_music, co_noise, y_len, x_len
|
return co_text, co_graphic, co_sep, co_img, co_table, co_map, co_music, co_noise, y_len, x_len
|
||||||
|
|
||||||
def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_file, config_params, printspace, dir_images, dir_out_images):
|
def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_file, config_params, printspace, dir_images, dir_out_images, page_alto=False):
|
||||||
"""
|
"""
|
||||||
Reading the page xml files and write the ground truth images into given output directory.
|
Reading the page xml files and write the ground truth images into given output directory.
|
||||||
"""
|
"""
|
||||||
|
|
@ -696,8 +696,22 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_
|
||||||
ls_org_imgs = os.listdir(dir_images)
|
ls_org_imgs = os.listdir(dir_images)
|
||||||
ls_org_imgs_stem = [os.path.splitext(item)[0] for item in ls_org_imgs]
|
ls_org_imgs_stem = [os.path.splitext(item)[0] for item in ls_org_imgs]
|
||||||
for index in tqdm(range(len(gt_list))):
|
for index in tqdm(range(len(gt_list))):
|
||||||
#try:
|
|
||||||
print(gt_list[index])
|
print(gt_list[index])
|
||||||
|
|
||||||
|
try:
|
||||||
|
if page_alto:
|
||||||
|
tree = ET.parse(dir_in+'/'+gt_list[index])
|
||||||
|
root = tree.getroot()
|
||||||
|
|
||||||
|
NS = {'alto': root.tag.split('}')[0].strip('{')}#{"alto": "http://www.loc.gov/standards/alto/ns-v4#"}
|
||||||
|
x_len, y_len = 0, 0
|
||||||
|
|
||||||
|
page = root.find('.//alto:Page', NS)
|
||||||
|
|
||||||
|
x_len = int( page.get("WIDTH") )
|
||||||
|
y_len = int( page.get("HEIGHT") )
|
||||||
|
|
||||||
|
else:
|
||||||
tree1 = ET.parse(dir_in+'/'+gt_list[index], parser = ET.XMLParser(encoding='utf-8'))
|
tree1 = ET.parse(dir_in+'/'+gt_list[index], parser = ET.XMLParser(encoding='utf-8'))
|
||||||
root1=tree1.getroot()
|
root1=tree1.getroot()
|
||||||
alltags=[elem.tag for elem in root1.iter()]
|
alltags=[elem.tag for elem in root1.iter()]
|
||||||
|
|
@ -783,6 +797,24 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_
|
||||||
|
|
||||||
textline_rgb_color = (255, 0, 0)
|
textline_rgb_color = (255, 0, 0)
|
||||||
|
|
||||||
|
if page_alto:
|
||||||
|
co_use_case = []
|
||||||
|
for line in root.findall(".//alto:TextLine", NS):
|
||||||
|
string_el = line.find("alto:String", NS)
|
||||||
|
textline_text = string_el.attrib["CONTENT"] if string_el is not None else None
|
||||||
|
|
||||||
|
polygon_el = line.find("alto:Shape/alto:Polygon", NS)
|
||||||
|
if polygon_el is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
points = polygon_el.attrib["POINTS"].split()
|
||||||
|
coords = [
|
||||||
|
(int(points[i]), int(points[i + 1]))
|
||||||
|
for i in range(0, len(points), 2)
|
||||||
|
]
|
||||||
|
|
||||||
|
co_use_case.append( np.array(coords, dtype=np.int32) )
|
||||||
|
else:
|
||||||
if config_params['use_case']=='textline':
|
if config_params['use_case']=='textline':
|
||||||
region_tags = np.unique([x for x in alltags if x.endswith('TextLine')])
|
region_tags = np.unique([x for x in alltags if x.endswith('TextLine')])
|
||||||
elif config_params['use_case']=='word':
|
elif config_params['use_case']=='word':
|
||||||
|
|
@ -879,6 +911,8 @@ def get_images_of_ground_truth(gt_list, dir_in, output_dir, output_type, config_
|
||||||
|
|
||||||
cv2.imwrite(os.path.join(dir_out_images, org_image_name), img_org)
|
cv2.imwrite(os.path.join(dir_out_images, org_image_name), img_org)
|
||||||
|
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
if config_file and config_params['use_case']=='layout':
|
if config_file and config_params['use_case']=='layout':
|
||||||
keys = list(config_params.keys())
|
keys = list(config_params.keys())
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue