diff --git a/tsvtools/cli.py b/tsvtools/cli.py index b5fa77c..d179916 100644 --- a/tsvtools/cli.py +++ b/tsvtools/cli.py @@ -75,9 +75,9 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint, ned_threshold, min_confidence, max_confidence): if purpose == "NERD": - out_columns = ['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'ID', 'url_id', 'left', 'right', 'top', 'bottom'] + out_columns = ['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'ID', 'url_id', 'left', 'right', 'top', 'bottom', 'conf'] elif purpose == "OCR": - out_columns = ['TEXT', 'url_id', 'left', 'right', 'top', 'bottom'] + out_columns = ['TEXT', 'url_id', 'left', 'right', 'top', 'bottom', 'conf'] if min_confidence is not None and max_confidence is not None: out_columns += ['ocrconf'] @@ -168,6 +168,7 @@ def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint, tsv['NE-TAG'] = 'O' tsv['NE-EMB'] = 'O' tsv['ID'] = '-' + tsv['conf'] = '-' tsv = tsv.rename(columns={'TEXT': 'TOKEN'}) elif purpose == 'OCR': diff --git a/tsvtools/ned.py b/tsvtools/ned.py index abcd927..d35a18f 100644 --- a/tsvtools/ned.py +++ b/tsvtools/ned.py @@ -32,6 +32,7 @@ def ned(tsv, ner_result, ned_rest_endpoint, json_file=None, threshold=None): entity = "" entity_type = None tsv['ID'] = '-' + tsv['conf'] = '-' def check_entity(tag): nonlocal entity, entity_type, rids @@ -44,19 +45,26 @@ def ned(tsv, ner_result, ned_rest_endpoint, json_file=None, threshold=None): if 'ranking' in ned_result[eid]: ranking = ned_result[eid]['ranking'] - #tsv.loc[rids, 'ID'] = ranking[0][1]['wikidata'] if threshold is None or ranking[0][1]['proba_1'] >= threshold else '' + # tsv.loc[rids, 'ID'] = ranking[0][1]['wikidata'] + # if threshold is None or ranking[0][1]['proba_1'] >= threshold else '' tmp = "|".join([ranking[i][1]['wikidata'] for i in range(len(ranking)) if threshold is None or ranking[i][1]['proba_1'] >= threshold]) tsv.loc[rids, 'ID'] = tmp if len(tmp) > 0 else '-' + tmp = ",".join([str(ranking[i][1]['proba_1']) + for i in range(len(ranking)) + if threshold is None or ranking[i][1]['proba_1'] >= threshold]) + + tsv.loc[rids, 'conf'] = tmp if len(tmp) > 0 else '-' + rids = [] entity = "" entity_type = None ner_tmp = tsv.copy() - ner_tmp.loc[~ner_tmp['NE-TAG'].isin(['O', 'B-PER', 'B-LOC','B-ORG', 'I-PER', 'I-LOC', 'I-ORG']), 'NE-TAG'] = 'O' + ner_tmp.loc[~ner_tmp['NE-TAG'].isin(['O', 'B-PER', 'B-LOC', 'B-ORG', 'I-PER', 'I-LOC', 'I-ORG']), 'NE-TAG'] = 'O' for rid, row in ner_tmp.iterrows():