mirror of
https://github.com/qurator-spk/page2tsv.git
synced 2025-07-11 19:19:52 +02:00
remove ner/ned code from page2tsv package
This commit is contained in:
parent
ed90193c45
commit
568e1cd104
6 changed files with 2 additions and 283 deletions
|
@@ -1,4 +1,3 @@
|
|||
import json
|
||||
import glob
|
||||
import re
|
||||
import os
|
||||
|
@@ -14,12 +13,9 @@ from lxml import etree as ET
|
|||
from ocrd_models.ocrd_page import parse
|
||||
from ocrd_utils import bbox_from_points
|
||||
|
||||
from .ned import ned
|
||||
from .ner import ner
|
||||
from .tsv import read_tsv, write_tsv, extract_doc_links
|
||||
from qurator.utils.tsv import read_tsv, write_tsv, extract_doc_links
|
||||
from .ocr import get_conf_color
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.argument('tsv-file', type=click.Path(exists=True), required=True, nargs=1)
|
||||
@click.argument('url-file', type=click.Path(exists=False), required=True, nargs=1)
|
||||
|
@@ -218,59 +214,6 @@ def tsv2page(output_filename, keep_words, page_file, tsv_file):
|
|||
f.write(ET.tostring(tree, pretty_print=True).decode('utf-8'))
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.argument('tsv-file', type=click.Path(exists=True), required=True, nargs=1)
|
||||
@click.argument('tsv-out-file', type=click.Path(), required=True, nargs=1)
|
||||
@click.option('--ner-rest-endpoint', type=str, default=None,
|
||||
help="REST endpoint of sbb_ner service. See https://github.com/qurator-spk/sbb_ner for details.")
|
||||
@click.option('--ned-rest-endpoint', type=str, default=None,
|
||||
help="REST endpoint of sbb_ned service. See https://github.com/qurator-spk/sbb_ned for details.")
|
||||
@click.option('--ned-json-file', type=str, default=None)
|
||||
@click.option('--noproxy', type=bool, is_flag=True, help='disable proxy. default: proxy is enabled.')
|
||||
@click.option('--ned-threshold', type=float, default=None)
|
||||
@click.option('--ned-priority', type=int, default=1)
|
||||
def find_entities(tsv_file, tsv_out_file, ner_rest_endpoint, ned_rest_endpoint, ned_json_file, noproxy, ned_threshold,
|
||||
ned_priority):
|
||||
|
||||
if noproxy:
|
||||
os.environ['no_proxy'] = '*'
|
||||
|
||||
tsv, urls = read_tsv(tsv_file)
|
||||
|
||||
try:
|
||||
if ner_rest_endpoint is not None:
|
||||
|
||||
tsv, ner_result = ner(tsv, ner_rest_endpoint)
|
||||
|
||||
elif os.path.exists(tsv_file):
|
||||
|
||||
print('Using NER information that is already contained in file: {}'.format(tsv_file))
|
||||
|
||||
tmp = tsv.copy()
|
||||
tmp['sen'] = (tmp['No.'] == 0).cumsum()
|
||||
tmp.loc[~tmp['NE-TAG'].isin(['O', 'B-PER', 'B-LOC', 'B-ORG', 'I-PER', 'I-LOC', 'I-ORG']), 'NE-TAG'] = 'O'
|
||||
|
||||
ner_result = [[{'word': str(row.TOKEN), 'prediction': row['NE-TAG']} for _, row in sen.iterrows()]
|
||||
for _, sen in tmp.groupby('sen')]
|
||||
else:
|
||||
raise RuntimeError("Either NER rest endpoint or NER-TAG information within tsv_file required.")
|
||||
|
||||
if ned_rest_endpoint is not None:
|
||||
|
||||
tsv, ned_result = ned(tsv, ner_result, ned_rest_endpoint, json_file=ned_json_file, threshold=ned_threshold,
|
||||
priority=ned_priority)
|
||||
|
||||
if ned_json_file is not None and not os.path.exists(ned_json_file):
|
||||
|
||||
with open(ned_json_file, "w") as fp_json:
|
||||
json.dump(ned_result, fp_json, indent=2, separators=(',', ': '))
|
||||
|
||||
write_tsv(tsv, urls, tsv_out_file)
|
||||
|
||||
except requests.HTTPError as e:
|
||||
print(e)
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--xls-file', type=click.Path(exists=True), default=None,
|
||||
help="Read parameters from xls-file. Expected columns: Filename, iiif_url, scale_factor.")
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue