From aabcc4866db9a5993ef553280ac4f05de30bf8bc Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 21 Feb 2022 18:50:09 +0100 Subject: [PATCH] remove obsolete tsv.py (now in qurator-sbb-tools) --- tsvtools/cli.py | 2 +- tsvtools/tsv.py | 87 ------------------------------------------------- 2 files changed, 1 insertion(+), 88 deletions(-) delete mode 100644 tsvtools/tsv.py diff --git a/tsvtools/cli.py b/tsvtools/cli.py index 8632174..cfb0a69 100644 --- a/tsvtools/cli.py +++ b/tsvtools/cli.py @@ -13,7 +13,7 @@ from lxml import etree as ET from ocrd_models.ocrd_page import parse from ocrd_utils import bbox_from_points -from .tsv import read_tsv, write_tsv, extract_doc_links +from qurator.utils.tsv import read_tsv, write_tsv, extract_doc_links from .ocr import get_conf_color from qurator.utils.ner import ner diff --git a/tsvtools/tsv.py b/tsvtools/tsv.py deleted file mode 100644 index aeafb8a..0000000 --- a/tsvtools/tsv.py +++ /dev/null @@ -1,87 +0,0 @@ -import pandas as pd -import re - - -def read_tsv(tsv_file): - - tsv = pd.read_csv(tsv_file, sep='\t', comment='#', quoting=3).rename(columns={'GND-ID': 'ID'}) - - parts = extract_doc_links(tsv_file) - - urls = [part['url'] for part in parts] - - return tsv, urls - - -def write_tsv(tsv, urls, tsv_out_file): - - if 'conf' in tsv.columns: - out_columns = ['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'ID', 'url_id', 'left', 'right', 'top', 'bottom', 'conf'] - else: - out_columns = ['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'ID', 'url_id', 'left', 'right', 'top', 'bottom'] - - if len(urls) == 0: - print('Writing to {}...'.format(tsv_out_file)) - - tsv.to_csv(tsv_out_file, sep="\t", quoting=3, index=False) - else: - pd.DataFrame([], columns=out_columns).to_csv(tsv_out_file, sep="\t", quoting=3, index=False) - - for url_id, part in tsv.groupby('url_id'): - with open(tsv_out_file, 'a') as f: - f.write('# ' + urls[int(url_id)] + '\n') - - part.to_csv(tsv_out_file, sep="\t", quoting=3, index=False, mode='a', header=False) - - -def 
extract_doc_links(tsv_file): - parts = [] - - header = None - - with open(tsv_file, 'r') as f: - - text = [] - url = None - - for line in f: - - if header is None: - header = "\t".join(line.split()) + '\n' - continue - - urls = [url for url in - re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', line)] - - if len(urls) > 0: - if url is not None: - parts.append({"url": url, 'header': header, 'text': "".join(text)}) - text = [] - - url = urls[-1] - else: - if url is None: - continue - - line = '\t'.join(line.split()) - - if line.count('\t') == 2: - line = "\t" + line - - if line.count('\t') >= 3: - text.append(line + '\n') - - continue - - if line.startswith('#'): - continue - - if len(line) == 0: - continue - - print('Line error: |', line, '|Number of Tabs: ', line.count('\t')) - - if url is not None: - parts.append({"url": url, 'header': header, 'text': "".join(text)}) - - return parts