# sbb_ner/qurator/sbb_ner/models/corpus.py
import re
import pandas as pd
from tqdm import tqdm
import click
import codecs
import os
import sqlite3
from qurator.utils.parallel import run as prun


class ChunkTask:

    selection = None

    def __init__(self, chunk, min_line_len):
        self._chunk = chunk
        self._min_line_len = min_line_len

def __call__(self, *args, **kwargs):
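        # Called by the parallel runner (prun in collect() below), one task per chunk.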
return ChunkTask.reformat_chunk(self._chunk, self._min_line_len)

    @staticmethod
def reformat_chunk(chunk, min_line_len):
"""
Process a chunk of documents.
:param chunk: pandas DataFrame that contains one document per row.
:param min_line_len: Break the document text up in lines that have this minimum length.
:return: One big text where the documents are separated by an empty line.
"""
text = ''
for i, r in chunk.iterrows():
            if not isinstance(r.text, str):
                continue
ppn = r.ppn if str(r.ppn).startswith('PPN') else 'PPN' + r.ppn
filename = str(r['file name'])
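            # Skip pages that are not marked as selected in the selection
            # DataFrame (indexed by (ppn, filename), see initialize below).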
if not ChunkTask.selection.loc[(ppn, filename)].selected.iloc[0]:
continue
for se in sentence_split(str(r.text), min_line_len):
text += se
text += '\n\n'
return text

    @staticmethod
def initialize(selection_file):
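        # Assumption: <selection_file> contains a pickled pandas DataFrame with
        # 'ppn' and 'filename' columns and a boolean 'selected' column, as
        # implied by the lookup in reformat_chunk above.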
ChunkTask.selection = \
pd.read_pickle(selection_file).\
reset_index().\
set_index(['ppn', 'filename']).\
sort_index()


def get_csv_chunks(alto_csv_file, chunksize):
for ch in tqdm(pd.read_csv(alto_csv_file, chunksize=chunksize)):
yield ch


def get_sqlite_chunks(alto_sqlite_file, chunksize):
    # Yield an empty frame up front; empty chunks are skipped in
    # get_chunk_tasks, so this does not affect the output.
    yield pd.DataFrame()
with sqlite3.connect(alto_sqlite_file) as conn:
conn.execute('pragma journal_mode=wal')
total = int(conn.execute('select count(*) from text;').fetchone()[0] / chunksize)
for ch in tqdm(pd.read_sql('select * from text', conn, chunksize=chunksize), total=total):
yield ch


def get_chunk_tasks(chunks, min_line_len):
    for chunk in chunks:
        if len(chunk) == 0:
            continue
        yield ChunkTask(chunk, min_line_len)


def sentence_split(s, min_len):
    """
    Reformat the text of an entire document such that each line has at least length min_len.

    :param s: the document text as one str
    :param min_len: minimum line length
    :return: yields the reformatted text line by line
    """
    parts = s.split(' ')

    se = ''
    for p in parts:
        se += ' ' + p

        # Heuristic sentence boundary: the line is long enough and the current
        # token ends in a period that is not preceded by a digit; tokens of at
        # most two characters (e.g. abbreviations like "S.") never end a line.
        if len(se) > min_len and len(p) > 2 and re.match(r'.*([^0-9])[.]$', p):
            yield se + '\n'
            se = ''

    yield se + '\n'
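
# Example: list(sentence_split('Dies ist ein Satz. Und noch einer.', 10))
# yields [' Dies ist ein Satz.\n', ' Und noch einer.\n', '\n'].
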
@click.command()
@click.argument('fulltext-file', type=click.Path(exists=True), required=True, nargs=1)
@click.argument('selection-file', type=click.Path(exists=True), required=True, nargs=1)
@click.argument('corpus-file', type=click.Path(), required=True, nargs=1)
@click.option('--chunksize', default=10**4, help="Process the corpus in chunks of <chunksize>. default: 10**4")
@click.option('--processes', default=6, help="Number of parallel processes. default: 6")
@click.option('--min-line-len', default=80, help="Lower bound of line length in the output file. default: 80")
def collect(fulltext_file, selection_file, corpus_file, chunksize, processes, min_line_len):
"""
Reads the fulltext from a CSV or SQLITE3 file (see also altotool) and write it to one big text file.
FULLTEXT_FILE: The CSV or SQLITE3 file to read from.
SELECTION_FILE: Consider only a subset of all pages that is defined by the DataFrame
that is stored in <selection_file>.
CORPUS_FILE: The output file that can be used by bert-pregenerate-trainingdata.
"""
    out_dir = os.path.dirname(corpus_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    print('Open {}.'.format(corpus_file))

    with codecs.open(corpus_file, 'w+', 'utf-8') as corpus_fh:
        # Write a BOM so that consumers can detect the UTF-8 encoding.
        corpus_fh.write(u'\ufeff')

        if fulltext_file.endswith('.csv'):
            chunks = get_csv_chunks(fulltext_file, chunksize)
        elif fulltext_file.endswith('.sqlite3'):
            chunks = get_sqlite_chunks(fulltext_file, chunksize)
        else:
            raise RuntimeError('Unsupported input file format.')

        # Each worker process loads the selection DataFrame once
        # (ChunkTask.initialize) and then reformats one chunk per task.
        for text in prun(get_chunk_tasks(chunks, min_line_len), processes=processes,
                         initializer=ChunkTask.initialize, initargs=(selection_file,)):
            corpus_fh.write(text)


if __name__ == '__main__':
    collect()
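
# Example invocation (hypothetical file names; module path assumed from the
# repository layout):
#
#   python -m qurator.sbb_ner.models.corpus fulltext.sqlite3 selection.pkl corpus.txt \
#       --chunksize 10000 --processes 6 --min-line-len 80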