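"""
Read the fulltext from a CSV or SQLITE3 file (see also altotool) and write it
to one big text file that can be used by bert-pregenerate-trainingdata.
"""
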
import re
import pandas as pd
from tqdm import tqdm
import click
import codecs
import os
import sqlite3

from qurator.utils.parallel import run as prun


class ChunkTask:

    selection = None

    def __init__(self, chunk, min_line_len):
        self._chunk = chunk
        self._min_line_len = min_line_len

    def __call__(self, *args, **kwargs):
        return ChunkTask.reformat_chunk(self._chunk, self._min_line_len)

    @staticmethod
    def reformat_chunk(chunk, min_line_len):
        """
        Process a chunk of documents.

        :param chunk: pandas DataFrame that contains one document per row.
        :param min_line_len: Break the document text up into lines that have this minimum length.
        :return: One big text where the documents are separated by an empty line.
        """
        text = ''

        for i, r in chunk.iterrows():

            if not isinstance(r.text, str):
                continue

            ppn = r.ppn if str(r.ppn).startswith('PPN') else 'PPN' + r.ppn

            filename = str(r['file name'])

            # Skip pages that have not been marked as selected.
            if not ChunkTask.selection.loc[(ppn, filename)].selected.iloc[0]:
                continue

            for se in sentence_split(str(r.text), min_line_len):
                text += se

            text += '\n\n'

        return text

    @staticmethod
    def initialize(selection_file):

        ChunkTask.selection = \
            pd.read_pickle(selection_file).\
            reset_index().\
            set_index(['ppn', 'filename']).\
            sort_index()
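

# Illustrative sketch (not part of the original script): a minimal selection
# DataFrame that matches how initialize() and reformat_chunk() access it, i.e.
# one that yields a (ppn, filename) index with a boolean 'selected' column.
# The concrete values are made up:
#
#   pd.DataFrame({'ppn': ['PPN12345'],
#                 'filename': ['00000001.xml'],
#                 'selected': [True]}).to_pickle('selection.pkl')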


def get_csv_chunks(alto_csv_file, chunksize):

    for ch in tqdm(pd.read_csv(alto_csv_file, chunksize=chunksize)):
        yield ch


def get_sqlite_chunks(alto_sqlite_file, chunksize):

    # Yield an empty DataFrame first; empty chunks are skipped in get_chunk_tasks.
    yield pd.DataFrame()

    with sqlite3.connect(alto_sqlite_file) as conn:

        conn.execute('pragma journal_mode=wal')

        total = int(conn.execute('select count(*) from text;').fetchone()[0] / chunksize)

        for ch in tqdm(pd.read_sql('select * from text', conn, chunksize=chunksize), total=total):
            yield ch
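

# Note (inferred from usage above, not stated in the original source): the
# 'text' table is expected to provide at least the columns accessed in
# reformat_chunk, i.e. 'ppn', 'file name' and 'text'.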


def get_chunk_tasks(chunks, min_line_len):

    for chunk in chunks:

        if len(chunk) == 0:
            continue

        yield ChunkTask(chunk, min_line_len)


def sentence_split(s, min_len):
    """
    Reformat the text of an entire document such that each line has at least
    length min_len.

    :param s: str
    :param min_len: minimum line length
    :return: reformatted text
    """
    parts = s.split(' ')

    se = ''
    for p in parts:
        se += ' ' + p

        # Start a new line once it is long enough and the current token looks
        # like the end of a sentence: longer than two characters and ending in
        # '.' that is not preceded by a digit (so enumerations and dates such
        # as '1.' do not break the line).
        if len(se) > min_len and len(p) > 2 and re.match(r'.*([^0-9])[.]$', p):
            yield se + '\n'
            se = ''

    yield se + '\n'
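

# Illustrative example (not part of the original script):
#
#   list(sentence_split('First sentence. Second sentence.', 10))
#
# yields [' First sentence.\n', ' Second sentence.\n', '\n'], where the final
# '\n' comes from the last yield for the (here empty) remainder.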


@click.command()
@click.argument('fulltext-file', type=click.Path(exists=True), required=True, nargs=1)
@click.argument('selection-file', type=click.Path(exists=True), required=True, nargs=1)
@click.argument('corpus-file', type=click.Path(), required=True, nargs=1)
@click.option('--chunksize', default=10**4, help="Process the corpus in chunks of <chunksize>. default: 10**4")
@click.option('--processes', default=6, help="Number of parallel processes. default: 6")
@click.option('--min-line-len', default=80, help="Lower bound of line length in output file. default: 80")
def collect(fulltext_file, selection_file, corpus_file, chunksize, processes, min_line_len):
    """
    Read the fulltext from a CSV or SQLITE3 file (see also altotool) and write
    it to one big text file.

    FULLTEXT_FILE: The CSV or SQLITE3 file to read from.

    SELECTION_FILE: Consider only the subset of all pages that is defined by
    the DataFrame that is stored in <selection_file>.

    CORPUS_FILE: The output file that can be used by
    bert-pregenerate-trainingdata.
    """
    dirname = os.path.dirname(corpus_file)
    if dirname:
        os.makedirs(dirname, exist_ok=True)

    print('Open {}.'.format(corpus_file))
    corpus_fh = codecs.open(corpus_file, 'w+', 'utf-8')
    corpus_fh.write(u'\ufeff')  # write a BOM so that consumers can detect the UTF-8 encoding

    if fulltext_file.endswith('.csv'):
        chunks = get_csv_chunks(fulltext_file, chunksize)
    elif fulltext_file.endswith('.sqlite3'):
        chunks = get_sqlite_chunks(fulltext_file, chunksize)
    else:
        raise RuntimeError('Unsupported input file format.')

    for text in prun(get_chunk_tasks(chunks, min_line_len), processes=processes,
                     initializer=ChunkTask.initialize, initargs=(selection_file,)):
        corpus_fh.write(text)

    corpus_fh.close()
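

# Example invocation (module and file names are hypothetical):
#
#   python corpus.py fulltext.sqlite3 selection.pkl data/corpus.txt \
#       --chunksize 10000 --processes 8 --min-line-len 80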


if __name__ == '__main__':
    collect()