Character normalization based on Aletheia mapping

pull/17/head
Kai Labusch 9 months ago
parent eac71b3e40
commit 06c8b382db

@@ -215,9 +215,48 @@ def alto2tsv(alto_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint,
     except requests.HTTPError as e:
         print(e)


+def unicode_normalize(text, normalization_map=None, use_combining_characters=True):
+
+    # Without a map, the best we can do is drop Unicode private-use characters
+    # (general category "Co") entirely.
+    if normalization_map is None:
+        ret = "".join([c if unicodedata.category(c) != "Co" else '' for c in text])
+
+        if ret != text:
+            print("Warning: Due to unicode normalization possible loss of information: "
+                  "{} => {} (normalization file missing?)".format(text, ret))
+    elif use_combining_characters:
+        # Replace each mapped private-use character by its base character plus,
+        # if present, the combining character given as a hex code point string.
+        ret = "".join([c if unicodedata.category(c) != "Co" else
+                       "{}{}".format(normalization_map.loc[ord(c)].base,
+                                     chr(int(normalization_map.loc[ord(c)].combining_character, base=16))
+                                     if normalization_map.loc[ord(c)].combining_character != '' else '')
+                       if ord(c) in normalization_map.index else '' for c in text])
+
+        # Do it again, since the normalization map may itself map to private-use characters.
+        ret = "".join([c if unicodedata.category(c) != "Co" else '' for c in ret])
+
+        if ret != text:
+            print("Warning: Due to unicode normalization possible loss of information: "
+                  "{} => {}".format(text, ret))
+    else:
+        # Replace each mapped private-use character by its base character only.
+        ret = "".join([c if unicodedata.category(c) != "Co" else
+                       normalization_map.loc[ord(c)].base
+                       if ord(c) in normalization_map.index else ''
+                       for c in text])
+
+        # Do it again, since the normalization map may itself map to private-use characters.
+        ret = "".join([c if unicodedata.category(c) != "Co" else '' for c in ret])
+
+        if ret != text:
+            print("Warning: Due to unicode normalization possible loss of information: "
+                  "{} => {}".format(text, ret))
+
+    return unicodedata.normalize('NFC', ret)
+
+
 def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint, ned_rest_endpoint,
-             noproxy, scale_factor, ned_threshold, min_confidence, max_confidence, ned_priority):
+             noproxy, scale_factor, ned_threshold, min_confidence, max_confidence, ned_priority, normalization_file):

     print("page2tsv - processing file: {}".format(page_xml_file))

     if purpose == "NERD":
         out_columns = ['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'ID', 'url_id', 'left', 'right', 'top', 'bottom', 'conf']
     elif purpose == "OCR":
@@ -241,6 +280,17 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint,
     tsv = []
     line_info = []

+    # By default, normalize without a map, i.e. only strip private-use characters.
+    _unicode_normalize = unicode_normalize
+
+    if normalization_file is not None:
+        # The normalization file is a pickled pandas DataFrame, indexed here by
+        # the decimal code point of each private-use character.
+        normalization_map = pd.read_pickle(normalization_file)
+        normalization_map = normalization_map.set_index('decimal')
+
+        _unicode_normalize = lambda s: unicode_normalize(s, normalization_map=normalization_map)
+
     for region_idx, region in enumerate(pcgts.get_Page().get_AllRegions(classes=['Text'], order='reading-order')):
         for text_line in region.get_TextLine():
             left, top, right, bottom = [int(scale_factor * x) for x in bbox_from_points(text_line.get_Coords().points)]
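The commit ships no normalization file itself. As a sketch of what the code above expects (column names 'decimal', 'base', 'combining_character' taken from the diff; the rows and the file name are placeholders), such a file could be produced like this:

    import pandas as pd

    # Each row maps one private-use code point (a decimal integer) to a base
    # character and an optional combining character given as a hex code point.
    rows = [
        {'decimal': 0xE72B, 'base': 'o', 'combining_character': '364'},  # hypothetical
        {'decimal': 0xE72C, 'base': 'u', 'combining_character': ''},     # hypothetical
    ]

    pd.DataFrame(rows, columns=['decimal', 'base', 'combining_character']) \
        .to_pickle('normalization.pkl')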
@@ -259,8 +309,13 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint,
                     # transform OCR coordinates using `scale_factor` to derive
                     # correct coordinates for the web presentation image
                     left, top, right, bottom = [int(scale_factor * x) for x in bbox_from_points(text_line.get_Coords().points)]
-                    tsv.append((region_idx, len(line_info) - 1, left + (right - left) / 2.0,
-                                text_equiv.get_Unicode(), len(urls), left, right, top, bottom, text_line.id))
+
+                    text = text_equiv.get_Unicode()
+
+                    # Split the line text into tokens and normalize each one.
+                    for text_part in text.split(" "):
+                        tsv.append((region_idx, len(line_info) - 1, left + (right - left) / 2.0,
+                                    _unicode_normalize(text_part), len(urls), left, right, top, bottom, text_line.id))
             else:
                 for word in words:
                     # XXX TODO make this configurable
@@ -272,7 +327,7 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint,
                     # correct coordinates for the web presentation image
                     left, top, right, bottom = [int(scale_factor * x) for x in bbox_from_points(word.get_Coords().points)]

                     tsv.append((region_idx, len(line_info) - 1, left + (right - left) / 2.0,
-                                textequiv, len(urls), left, right, top, bottom, text_line.id))
+                                _unicode_normalize(textequiv), len(urls), left, right, top, bottom, text_line.id))

     line_info = pd.DataFrame(line_info, columns=['url_id', 'left', 'right', 'top', 'bottom', 'conf', 'line_id'])
@@ -312,12 +367,17 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint,
     tsv = tsv.merge(line_info, left_on='line', right_index=True)
     tsv = tsv[out_columns].reset_index(drop=True)

     try:
         if purpose == 'NERD' and ner_rest_endpoint is not None:
             tsv, ner_result = ner(tsv, ner_rest_endpoint)

             if ned_rest_endpoint is not None:
                 tsv, _ = ned(tsv, ner_result, ned_rest_endpoint, threshold=ned_threshold, priority=ned_priority)

-        tsv.to_csv(tsv_out_file, sep="\t", quoting=3, index=False, mode='a', header=False)
+        # Write UTF-8 explicitly so the normalized characters survive on
+        # platforms whose default encoding differs.
+        tsv.to_csv(tsv_out_file, sep="\t", quoting=3, index=False, mode='a', header=False, encoding='utf-8')
     except requests.HTTPError as e:
         print(e)
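For downstream use, note that quoting=3 is csv.QUOTE_NONE; a sketch of reading such a TSV back with matching settings (the file name is a placeholder, and the header row is assumed to have been written earlier by page2tsv):

    import csv
    import pandas as pd

    assert csv.QUOTE_NONE == 3  # the magic number passed as quoting= above

    df = pd.read_csv('out.tsv', sep='\t', quoting=csv.QUOTE_NONE, encoding='utf-8')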
@@ -406,10 +466,11 @@ def make_page2tsv_commands(xls_file, directory, purpose):
 @click.option('--min-confidence', type=float, default=None)
 @click.option('--max-confidence', type=float, default=None)
 @click.option('--ned-priority', type=int, default=1)
+@click.option('--normalization-file', type=click.Path(exists=True), default=None)
 def page2tsv_cli(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint, ned_rest_endpoint,
-                noproxy, scale_factor, ned_threshold, min_confidence, max_confidence, ned_priority):
+                noproxy, scale_factor, ned_threshold, min_confidence, max_confidence, ned_priority, normalization_file):

     return page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint, ned_rest_endpoint,
-                    noproxy, scale_factor, ned_threshold, min_confidence, max_confidence, ned_priority)
+                    noproxy, scale_factor, ned_threshold, min_confidence, max_confidence, ned_priority, normalization_file)


 @click.command()
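With the new option in place, a hypothetical invocation might look as follows (file names and URL are placeholders; this assumes page_xml_file and tsv_out_file are positional arguments and the remaining parameters are options, as the signature suggests):

    page2tsv PAGE0001.xml out.tsv --purpose OCR \
        --image-url http://example.org/image.jpg \
        --normalization-file normalization.pkl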
