1
0
Fork 0
mirror of https://github.com/qurator-spk/page2tsv.git synced 2025-07-11 19:19:52 +02:00

remove ner/ned code from page2tsv package

This commit is contained in:
Kai 2021-12-15 15:51:00 +01:00
parent ed90193c45
commit 568e1cd104
6 changed files with 2 additions and 283 deletions

View file

@ -1,4 +1,3 @@
import json
import glob
import re
import os
@ -14,12 +13,9 @@ from lxml import etree as ET
from ocrd_models.ocrd_page import parse
from ocrd_utils import bbox_from_points
from .ned import ned
from .ner import ner
from .tsv import read_tsv, write_tsv, extract_doc_links
from qurator.utils.tsv import read_tsv, write_tsv, extract_doc_links
from .ocr import get_conf_color
@click.command()
@click.argument('tsv-file', type=click.Path(exists=True), required=True, nargs=1)
@click.argument('url-file', type=click.Path(exists=False), required=True, nargs=1)
@ -218,59 +214,6 @@ def tsv2page(output_filename, keep_words, page_file, tsv_file):
f.write(ET.tostring(tree, pretty_print=True).decode('utf-8'))
@click.command()
@click.argument('tsv-file', type=click.Path(exists=True), required=True, nargs=1)
@click.argument('tsv-out-file', type=click.Path(), required=True, nargs=1)
@click.option('--ner-rest-endpoint', type=str, default=None,
              help="REST endpoint of sbb_ner service. See https://github.com/qurator-spk/sbb_ner for details.")
@click.option('--ned-rest-endpoint', type=str, default=None,
              help="REST endpoint of sbb_ned service. See https://github.com/qurator-spk/sbb_ned for details.")
@click.option('--ned-json-file', type=str, default=None)
@click.option('--noproxy', type=bool, is_flag=True, help='disable proxy. default: proxy is enabled.')
@click.option('--ned-threshold', type=float, default=None)
@click.option('--ned-priority', type=int, default=1)
def find_entities(tsv_file, tsv_out_file, ner_rest_endpoint, ned_rest_endpoint, ned_json_file, noproxy, ned_threshold,
                  ned_priority):
    """Add NER and, optionally, NED annotations to TSV_FILE and write the result to TSV_OUT_FILE.

    NER results are obtained from --ner-rest-endpoint if it is given; otherwise
    the NE-TAG column already present in TSV_FILE is used. If
    --ned-rest-endpoint is given, entity disambiguation is run on the NER
    results, and the raw NED result is stored as JSON in --ned-json-file when
    that path is given and does not exist yet.

    An HTTP error raised while talking to a service is printed and ends the
    command without writing TSV_OUT_FILE. A RuntimeError is raised if there is
    neither a NER endpoint nor a NE-TAG column to work with.
    """
    if noproxy:
        # Setting no_proxy to '*' for this process is what the --noproxy flag does.
        os.environ['no_proxy'] = '*'

    # NOTE(review): tsv is used like a pandas DataFrame below (copy/loc/groupby) - confirm against read_tsv.
    tsv, urls = read_tsv(tsv_file)

    try:
        if ner_rest_endpoint is not None:
            tsv, ner_result = ner(tsv, ner_rest_endpoint)
        elif 'NE-TAG' in tsv.columns:
            # read_tsv has already succeeded at this point, so testing for the file's existence would
            # always pass; check for the column that is actually required instead, so that the
            # RuntimeError below is reachable.
            print('Using NER information that is already contained in file: {}'.format(tsv_file))

            tmp = tsv.copy()

            # Number the sentences: a running count that increases at every row whose 'No.' is 0.
            tmp['sen'] = (tmp['No.'] == 0).cumsum()

            # Any tag outside the supported set is reset to 'O'.
            known_tags = ['O', 'B-PER', 'B-LOC', 'B-ORG', 'I-PER', 'I-LOC', 'I-ORG']
            tmp.loc[~tmp['NE-TAG'].isin(known_tags), 'NE-TAG'] = 'O'

            # One list per sentence, one {'word', 'prediction'} dict per token row.
            ner_result = [[{'word': str(row.TOKEN), 'prediction': row['NE-TAG']} for _, row in sen.iterrows()]
                          for _, sen in tmp.groupby('sen')]
        else:
            raise RuntimeError("Either NER rest endpoint or NER-TAG information within tsv_file required.")

        if ned_rest_endpoint is not None:
            tsv, ned_result = ned(tsv, ner_result, ned_rest_endpoint, json_file=ned_json_file,
                                  threshold=ned_threshold, priority=ned_priority)

            # Only write the JSON file if it does not exist yet; an existing file is never overwritten here.
            if ned_json_file is not None and not os.path.exists(ned_json_file):
                with open(ned_json_file, "w") as fp_json:
                    json.dump(ned_result, fp_json, indent=2, separators=(',', ': '))

        write_tsv(tsv, urls, tsv_out_file)

    except requests.HTTPError as e:
        # Best effort: report the service failure instead of crashing with a traceback.
        print(e)
@click.command()
@click.option('--xls-file', type=click.Path(exists=True), default=None,
help="Read parameters from xls-file. Expected columns: Filename, iiif_url, scale_factor.")