From 947a0e06f8cdaaeeec2400b0d2afe420cc8d05d5 Mon Sep 17 00:00:00 2001 From: vahidrezanezhad Date: Thu, 23 May 2024 15:43:31 +0200 Subject: [PATCH] missing text types are added --- custom_config_page2label.json | 12 ++++----- pagexml2label.py | 48 ++++++++++++++++++++++++++++++++++- 2 files changed, 53 insertions(+), 7 deletions(-) diff --git a/custom_config_page2label.json b/custom_config_page2label.json index 85b5d7e..254f4df 100644 --- a/custom_config_page2label.json +++ b/custom_config_page2label.json @@ -1,8 +1,8 @@ { -"textregions":{"paragraph":1, "heading": 2, "header":2,"drop-capital": 3, "marginalia":4 ,"page-number":1 , "catch-word":1 }, -"imageregion":5, -"separatorregion":6, -"graphicregions" :{"handwritten-annotation":7, "decoration": 8, "signature": 9, "stamp": 10}, -"artificial_class_on_boundry": ["paragraph","header", "heading", "marginalia", "page-number", "catch-word", "drop-capital"], -"artificial_class_label":11 +"textregions":{"paragraph":1, "heading": 1, "header":1,"drop-capital": 1, "marginalia":1 ,"page-number":1 , "catch-word":1 ,"footnote": 1, "footnote-continued": 1}, +"imageregion":2, +"separatorregion":3, +"graphicregions" :{"handwritten-annotation":2, "decoration": 2, "signature": 2, "stamp": 2}, +"artificial_class_on_boundry": ["paragraph","header", "heading", "marginalia", "page-number", "catch-word", "drop-capital","footnote", "footnote-continued"], +"artificial_class_label":4 } diff --git a/pagexml2label.py b/pagexml2label.py index 5311c24..63b7acf 100644 --- a/pagexml2label.py +++ b/pagexml2label.py @@ -113,6 +113,7 @@ class pagexml2word: """ Reading the page xml files and write the ground truth images into given output directory. """ + ## to do: add footnote to text regions for index in tqdm(range(len(self.gt_list))): #try: tree1 = ET.parse(self.dir+'/'+self.gt_list[index]) @@ -144,11 +145,13 @@ class pagexml2word: types_graphic_label = list(types_graphic_dict.values()) - labels_rgb_color = [ (0,0,0), (255,0,0), (255,125,0), (255,0,125), (125,255,125), (125,125,0), (0,125,255), (0,125,0), (125,125,125), (255,0,255), (125,0,125), (0,255,0),(0,0,255), (0,255,255), (255,125,125), (0,125,125), (0,255,125)] + labels_rgb_color = [ (0,0,0), (255,0,0), (255,125,0), (255,0,125), (125,255,125), (125,125,0), (0,125,255), (0,125,0), (125,125,125), (255,0,255), (125,0,125), (0,255,0),(0,0,255), (0,255,255), (255,125,125), (0,125,125), (0,255,125), (255,125,255), (125,255,0)] region_tags=np.unique([x for x in alltags if x.endswith('Region')]) co_text_paragraph=[] + co_text_footnote=[] + co_text_footnote_con=[] co_text_drop=[] co_text_heading=[] co_text_header=[] @@ -177,6 +180,8 @@ class pagexml2word: c_t_in_signature_mark=[] c_t_in_catch=[] c_t_in_marginalia=[] + c_t_in_footnote=[] + c_t_in_footnote_con=[] sumi=0 for vv in nn.iter(): # check the format of coords @@ -190,6 +195,14 @@ class pagexml2word: if "drop-capital" in types_text: if "type" in nn.attrib and nn.attrib['type']=='drop-capital': c_t_in_drop.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + if "footnote" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='footnote': + c_t_in_footnote.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) + + if "footnote-continued" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='footnote-continued': + c_t_in_footnote_con.append( np.array( [ [ int(x.split(',')[0]) , int(x.split(',')[1]) ] for x in p_h] ) ) if "heading" in types_text: if "type" in nn.attrib and nn.attrib['type']=='heading': @@ -231,6 +244,16 @@ class pagexml2word: c_t_in_drop.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) sumi+=1 + if "footnote" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='footnote': + c_t_in_footnote.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + + if "footnote-continued" in types_text: + if "type" in nn.attrib and nn.attrib['type']=='footnote-continued': + c_t_in_footnote_con.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) + sumi+=1 + if "heading" in types_text: if "type" in nn.attrib and nn.attrib['type']=='heading': c_t_in_heading.append([ int(np.float(vv.attrib['x'])) , int(np.float(vv.attrib['y'])) ]) @@ -272,6 +295,10 @@ class pagexml2word: if len(c_t_in_drop)>0: co_text_drop.append(np.array(c_t_in_drop)) + if len(c_t_in_footnote_con)>0: + co_text_footnote_con.append(np.array(c_t_in_footnote_con)) + if len(c_t_in_footnote)>0: + co_text_footnote.append(np.array(c_t_in_footnote)) if len(c_t_in_paragraph)>0: co_text_paragraph.append(np.array(c_t_in_paragraph)) if len(c_t_in_heading)>0: @@ -497,6 +524,15 @@ class pagexml2word: erosion_rate = 2 dilation_rate = 4 co_text_marginalia, img_boundary = self.update_region_contours(co_text_marginalia, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + if "footnote" in elements_with_artificial_class: + erosion_rate = 2 + dilation_rate = 4 + co_text_footnote, img_boundary = self.update_region_contours(co_text_footnote, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + if "footnote-continued" in elements_with_artificial_class: + erosion_rate = 2 + dilation_rate = 4 + co_text_footnote_con, img_boundary = self.update_region_contours(co_text_footnote_con, img_boundary, erosion_rate, dilation_rate, y_len, x_len ) + img = np.zeros( (y_len,x_len,3) ) @@ -525,6 +561,10 @@ class pagexml2word: if 'textregions' in keys: if "paragraph" in types_text: img_poly=cv2.fillPoly(img, pts =co_text_paragraph, color=labels_rgb_color[ config_params['textregions']['paragraph']]) + if "footnote" in types_text: + img_poly=cv2.fillPoly(img, pts =co_text_footnote, color=labels_rgb_color[ config_params['textregions']['footnote']]) + if "footnote-continued" in types_text: + img_poly=cv2.fillPoly(img, pts =co_text_footnote_con, color=labels_rgb_color[ config_params['textregions']['footnote-continued']]) if "heading" in types_text: img_poly=cv2.fillPoly(img, pts =co_text_heading, color=labels_rgb_color[ config_params['textregions']['heading']]) if "header" in types_text: @@ -580,6 +620,12 @@ class pagexml2word: if "paragraph" in types_text: color_label = config_params['textregions']['paragraph'] img_poly=cv2.fillPoly(img, pts =co_text_paragraph, color=(color_label,color_label,color_label)) + if "footnote" in types_text: + color_label = config_params['textregions']['footnote'] + img_poly=cv2.fillPoly(img, pts =co_text_footnote, color=(color_label,color_label,color_label)) + if "footnote-continued" in types_text: + color_label = config_params['textregions']['footnote-continued'] + img_poly=cv2.fillPoly(img, pts =co_text_footnote_con, color=(color_label,color_label,color_label)) if "heading" in types_text: color_label = config_params['textregions']['heading'] img_poly=cv2.fillPoly(img, pts =co_text_heading, color=(color_label,color_label,color_label))