commit 16e63b4673: re-structure repo
@ -0,0 +1,20 @@
FROM nvidia/cuda:10.0-base

ARG http_proxy
ENV http_proxy=$http_proxy
ENV https_proxy=$http_proxy

RUN apt-get update && \
    apt-get -y install build-essential && \
    apt-get -y install python3-pip && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

COPY requirements.txt /tmp
RUN pip3 --no-cache-dir install -r /tmp/requirements.txt

COPY . /usr/src/qurator-mono-repo

RUN pip3 --no-cache-dir install -e /usr/src/qurator-mono-repo

WORKDIR /usr/src/qurator-mono-repo
CMD export LANG=C.UTF-8; env FLASK_APP=qurator/qurator_sbb_ner/webapp/app.py env FLASK_ENV=development env USE_CUDA=True flask run --host=0.0.0.0
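A quick usage sketch (not part of the commit) for building and running the CUDA image. The image tag, Dockerfile name, proxy value and port mapping are placeholders, since the diff does not show file names; `flask run` listens on port 5000 by default, and `--runtime=nvidia` assumes an nvidia-docker2 setup.

```
# hypothetical names/paths
docker build -t sbb-ner-gpu --build-arg http_proxy="$http_proxy" -f Dockerfile.cuda .
docker run --runtime=nvidia -p 5000:5000 sbb-ner-gpu
```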
@ -0,0 +1,19 @@
FROM python:3.6-slim-stretch

ARG http_proxy
ENV http_proxy=$http_proxy
ENV https_proxy=$http_proxy

RUN apt-get update && \
    apt-get -y install build-essential && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

COPY requirements.txt /tmp
RUN pip3 --no-cache-dir install -r /tmp/requirements.txt

COPY . /usr/src/qurator-mono-repo

RUN pip3 --no-cache-dir install -e /usr/src/qurator-mono-repo

WORKDIR /usr/src/qurator-mono-repo
CMD env FLASK_APP=qurator/qurator_sbb_ner/webapp/app.py env FLASK_ENV=development env USE_CUDA=False flask run --host=0.0.0.0
@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "{}"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright 2019 qurator

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
@ -0,0 +1,66 @@
***
# Preprocessing of NER ground truth

## compile_conll

Reads the CoNLL 2003 NER ground-truth files from a directory and writes the
parsed data to a pandas DataFrame that is stored as a pickle.

### Usage

```
compile_conll --help
```

## compile_germ_eval

Reads the GermEval .tsv files from a directory and writes the parsed data to a
pandas DataFrame that is stored as a pickle.

### Usage

```
compile_germ_eval --help
```

## compile_europeana_historic

Reads the Europeana historic NER ground-truth .bio files from a directory and
writes the parsed data to a pandas DataFrame that is stored as a pickle.

### Usage

```
compile_europeana_historic --help
```

## compile_wikiner

Reads the WikiNER files from a directory and writes the parsed data to a
pandas DataFrame that is stored as a pickle.

### Usage

```
compile_wikiner --help
```
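For example, a preprocessing run over all four corpora could look as follows. This is only a sketch: the corpus directories and output paths are placeholders, while the command names are the console scripts documented above.

```
compile_conll /data/conll2003 gt/conll-gt.pkl
compile_germ_eval /data/germeval2014 gt/germ-eval-gt.pkl
compile_europeana_historic /data/europeana-ner gt/europeana-gt.pkl
compile_wikiner /data/wikiner gt/wikiner-gt.pkl
```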
***
# Train BERT NER model

## bert-ner

Performs supervised training of BERT for NER, plus testing and cross-validation.

### Usage

```
bert-ner --help
```
@ -0,0 +1 @@
__import__('pkg_resources').declare_namespace(__name__)
@ -0,0 +1 @@
__import__('pkg_resources').declare_namespace(__name__)
@ -0,0 +1 @@
__import__('pkg_resources').declare_namespace(__name__)
@ -0,0 +1,77 @@
import pandas as pd
import click
import codecs
import os


def read_gt(files, datasets):
    sentence_number = 300000
    gt_data = list()

    for filename, dataset in zip(files, datasets):
        gt_lines = [l.strip() for l in codecs.open(filename, 'r', 'latin-1')]

        word_number = 0

        for li in gt_lines:

            if li == '':

                if word_number > 0:
                    sentence_number += 1
                    word_number = 0

                continue

            if li.startswith('-DOCSTART-'):
                continue

            parts = li.split()

            if len(parts) == 5:
                word, _, _, _, tag = li.split()
            else:
                word, _, _, tag = li.split()

            tag = tag.upper()
            tag = tag.replace('_', '-')
            tag = tag.replace('.', '-')

            if tag not in {'B-LOC', 'B-PER', 'I-PER', 'I-ORG', 'B-ORG', 'I-LOC'}:
                tag = 'O'

            gt_data.append((sentence_number, word_number, word, tag, dataset))

            word_number += 1

    return pd.DataFrame(gt_data, columns=['nsentence', 'nword', 'word', 'tag', 'dataset'])


@click.command()
@click.argument('path-to-conll', type=click.Path(exists=True), required=True, nargs=1)
@click.argument('conll-ground-truth-file', type=click.Path(), required=True, nargs=1)
def main(path_to_conll, conll_ground_truth_file):
    """
    Read CoNLL 2003 NER ground-truth files from directory <path-to-conll> and
    write the outcome of the data parsing to a pandas DataFrame
    that is stored as pickle in file <conll-ground-truth-file>.
    """

    os.makedirs(os.path.dirname(conll_ground_truth_file), exist_ok=True)

    gt_all = read_gt(['{}/deu.dev'.format(path_to_conll),
                      '{}/deu.testa'.format(path_to_conll),
                      '{}/deu.testb'.format(path_to_conll),
                      '{}/deu.train'.format(path_to_conll),
                      '{}/eng.testa'.format(path_to_conll),
                      '{}/eng.testb'.format(path_to_conll),
                      '{}/eng.train'.format(path_to_conll)],
                     ['DE-CONLL-DEV', 'DE-CONLL-TESTA', 'DE-CONLL-TESTB', 'DE-CONLL-TRAIN',
                      'EN-CONLL-TESTA', 'EN-CONLL-TESTB', 'EN-CONLL-TRAIN'])

    gt_all.to_pickle(conll_ground_truth_file)


if __name__ == '__main__':
    main()
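The pickled ground truth produced by this script (and by the other compile tools) is a flat pandas DataFrame. A minimal sketch of inspecting it, with a placeholder output path:

```
import pandas as pd

# <conll-ground-truth-file> as passed to compile_conll; the path is a placeholder.
gt = pd.read_pickle("gt/conll-gt.pkl")

print(gt.columns.tolist())  # ['nsentence', 'nword', 'word', 'tag', 'dataset']
print(gt.dataset.unique())  # e.g. ['DE-CONLL-DEV', 'DE-CONLL-TESTA', ..., 'EN-CONLL-TRAIN']
```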
@ -0,0 +1,435 @@
from __future__ import absolute_import, division, print_function

import os
import json

import numpy as np
import pandas as pd

import torch

from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset, Dataset)
from torch.utils.data.distributed import DistributedSampler


class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs an InputExample.

        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
                sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
                Only must be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
                specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id, tokens):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id
        self.tokens = tokens


class WikipediaDataset(Dataset):

    def __init__(self, set_file, gt_file, data_epochs, epoch_size,
                 label_map, tokenizer, max_seq_length,
                 queue_size=1000, no_entity_fraction=0.0, seed=23,
                 min_sen_len=10, min_article_len=20):

        self._set_file = set_file
        self._subset = pd.read_pickle(set_file)
        self._gt_file = gt_file
        self._data_epochs = data_epochs
        self._epoch_size = epoch_size
        self._label_map = label_map
        self._tokenizer = tokenizer
        self._max_seq_length = max_seq_length
        self._queue_size = queue_size
        self._no_entity_fraction = no_entity_fraction
        self._seed = seed
        self._min_sen_len = min_sen_len
        self._min_article_len = min_article_len

        self._queue = None
        self._data_sequence = None
        self._counter = None
        # noinspection PyUnresolvedReferences
        self._random_state = np.random.RandomState(seed=self._seed)

        self._reset()

        return

    def _next_sample_should_have_entities(self):

        if self._no_entity_fraction <= 0.0:
            return True

        return int(self._counter) % int(1.0 / self._no_entity_fraction) != 0

    def __getitem__(self, index):

        del index

        if self._counter > self._data_epochs * self._epoch_size:
            self._reset()

        while True:

            # get next random sentence
            sen_words, sen_tags = self._queue_next()

            if len(sen_words) < self._min_sen_len:  # Skip all sentences that are too short.
                continue

            if self._has_entities(sen_tags):

                if not self._next_sample_should_have_entities():
                    # Skip sample if the next sample is supposed to be a no-entity sample.
                    continue
            else:
                if self._next_sample_should_have_entities():
                    # Skip sample if the next sample is supposed to be an entity sample.
                    continue
            break

        sample = InputExample(guid="%s-%s" % (self._set_file, self._counter),
                              text_a=sen_words, text_b=None, label=sen_tags)

        features = convert_examples_to_features(sample, self._label_map, self._max_seq_length, self._tokenizer)

        self._counter += 1

        return torch.tensor(features.input_ids, dtype=torch.long), \
            torch.tensor(features.input_mask, dtype=torch.long), \
            torch.tensor(features.segment_ids, dtype=torch.long), \
            torch.tensor(features.label_id, dtype=torch.long)

    def __len__(self):

        return int(self._epoch_size)

    def _reset(self):

        self._queue = list()
        self._data_sequence = self._sequence()
        self._counter = 0

        for _ in range(0, self._queue_size):
            self._queue.append(list())

    def _sequence(self):

        while True:

            for row in pd.read_csv(self._gt_file, chunksize=1, sep=';'):

                page_id = row.page_id.iloc[0]
                text = row.text.iloc[0]
                tags = row.tags.iloc[0]

                if page_id not in self._subset.index:
                    continue

                sentences = [(sen_text, sen_tag) for sen_text, sen_tag in zip(json.loads(text), json.loads(tags))]

                if len(sentences) < self._min_article_len:  # Skip very short articles.
                    continue

                print(page_id)

                yield sentences

    def _queue_next(self):

        nqueue = self._random_state.randint(len(self._queue))

        while len(self._queue[nqueue]) <= 0:
            self._queue[nqueue] = next(self._data_sequence)

        return self._queue[nqueue].pop()

    @staticmethod
    def _has_entities(sen_tags):

        for t in sen_tags:

            if t != 'O':
                return True

        return False


class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, batch_size, local_rank):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, batch_size, local_rank):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    def get_evaluation_file(self):
        raise NotImplementedError()


class WikipediaNerProcessor(DataProcessor):

    def __init__(self, train_sets, dev_sets, test_sets, gt_file, max_seq_length, tokenizer,
                 data_epochs, epoch_size, **kwargs):
        del kwargs

        self._max_seq_length = max_seq_length
        self._tokenizer = tokenizer
        self._train_set_file = train_sets
        self._dev_set_file = dev_sets
        self._test_set_file = test_sets
        self._gt_file = gt_file
        self._data_epochs = data_epochs
        self._epoch_size = epoch_size

    def get_train_examples(self, batch_size, local_rank):
        """See base class."""

        return self._make_data_loader(self._train_set_file, batch_size, local_rank)

    def get_dev_examples(self, batch_size, local_rank):
        """See base class."""

        return self._make_data_loader(self._dev_set_file, batch_size, local_rank)

    def get_labels(self):
        """See base class."""

        labels = ["O", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-ORG", "I-ORG", "X", "[CLS]", "[SEP]"]

        return {label: i for i, label in enumerate(labels)}

    def get_evaluation_file(self):
        dev_set_name = os.path.splitext(os.path.basename(self._dev_set_file))[0]

        return "eval_results-{}.pkl".format(dev_set_name)

    def _make_data_loader(self, set_file, batch_size, local_rank):
        del local_rank

        data = WikipediaDataset(set_file=set_file, gt_file=self._gt_file,
                                data_epochs=self._data_epochs, epoch_size=self._epoch_size,
                                label_map=self.get_labels(), tokenizer=self._tokenizer,
                                max_seq_length=self._max_seq_length)

        sampler = SequentialSampler(data)

        return DataLoader(data, sampler=sampler, batch_size=batch_size)


class NerProcessor(DataProcessor):

    def __init__(self, train_sets, dev_sets, test_sets, max_seq_length, tokenizer,
                 label_map=None, gt=None, gt_file=None, **kwargs):

        del kwargs

        self._max_seg_length = max_seq_length
        self._tokenizer = tokenizer
        self._train_sets = set(train_sets.split('|')) if train_sets is not None else set()
        self._dev_sets = set(dev_sets.split('|')) if dev_sets is not None else set()
        self._test_sets = set(test_sets.split('|')) if test_sets is not None else set()

        self._gt = gt

        if self._gt is None:
            self._gt = pd.read_pickle(gt_file)

        self._label_map = label_map

        print('TRAIN SETS: ', train_sets)
        print('DEV SETS: ', dev_sets)
        print('TEST SETS: ', test_sets)

    def get_train_examples(self, batch_size, local_rank):
        """See base class."""

        return self.make_data_loader(
            self.create_examples(self._read_lines(self._train_sets), "train"), batch_size, local_rank,
            self.get_labels(), self._max_seg_length, self._tokenizer)

    def get_dev_examples(self, batch_size, local_rank):
        """See base class."""
        return self.make_data_loader(
            self.create_examples(self._read_lines(self._dev_sets), "dev"), batch_size, local_rank,
            self.get_labels(), self._max_seg_length, self._tokenizer)

    def get_labels(self):
        """See base class."""

        if self._label_map is not None:
            return self._label_map

        gt = self._gt
        gt = gt.loc[gt.dataset.isin(self._train_sets.union(self._dev_sets).union(self._test_sets))]

        labels = sorted(gt.tag.unique().tolist()) + ["X", "[CLS]", "[SEP]"]

        self._label_map = {label: i for i, label in enumerate(labels, 1)}

        self._label_map['UNK'] = 0

        return self._label_map

    def get_evaluation_file(self):

        return "eval_results-{}.pkl".format("-".join(sorted(self._dev_sets)))

    @staticmethod
    def create_examples(lines, set_type):

        for i, (sentence, label) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            text_a = sentence
            text_b = None

            yield InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)

    @staticmethod
    def make_data_loader(examples, batch_size, local_rank, label_map, max_seq_length, tokenizer, features=None,
                         sequential=False):

        if features is None:
            features = [convert_examples_to_features(ex, label_map, max_seq_length, tokenizer)
                        for ex in examples]

        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)

        data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

        if local_rank == -1:
            if sequential:
                train_sampler = SequentialSampler(data)
            else:
                train_sampler = RandomSampler(data)
        else:
            if sequential:
                train_sampler = SequentialSampler(data)
            else:
                train_sampler = DistributedSampler(data)

        return DataLoader(data, sampler=train_sampler, batch_size=batch_size)

    def _read_lines(self, sets):

        gt = self._gt
        gt = gt.loc[gt.dataset.isin(sets)]

        data = list()
        for i, sent in gt.groupby('nsentence'):

            sent = sent.sort_values('nword', ascending=True)

            data.append((sent.word.tolist(), sent.tag.tolist()))

        return data


def convert_examples_to_features(example, label_map, max_seq_length, tokenizer):
    """
    :param example: instance of InputExample whose text_a is a list of words and label a list of tags
    :param label_map: mapping from tag strings to integer label ids
    :param max_seq_length: fixed length to which the token sequence is truncated/padded
    :param tokenizer: tokenizer providing tokenize() and convert_tokens_to_ids()
    :return: InputFeatures
    """

    words = example.text_a
    word_labels = example.label
    tokens = []
    labels = []

    for i, word in enumerate(words):

        token = tokenizer.tokenize(word)
        tokens.extend(token)

        label_1 = word_labels[i] if i < len(word_labels) else 'O'

        for m in range(len(token)):
            if m == 0:
                labels.append(label_1)
            else:
                labels.append("X")

    if len(tokens) >= max_seq_length - 1:
        tokens = tokens[0:(max_seq_length - 2)]
        labels = labels[0:(max_seq_length - 2)]

    n_tokens = []
    segment_ids = []
    label_ids = []
    n_tokens.append("[CLS]")
    segment_ids.append(0)
    label_ids.append(label_map["[CLS]"])
    for i, token in enumerate(tokens):
        n_tokens.append(token)
        segment_ids.append(0)
        label_ids.append(label_map[labels[i]])
    n_tokens.append("[SEP]")
    segment_ids.append(0)
    label_ids.append(label_map["[SEP]"])
    input_ids = tokenizer.convert_tokens_to_ids(n_tokens)
    input_mask = [1] * len(input_ids)

    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        label_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_ids) == max_seq_length

    return InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label_ids,
                         tokens=n_tokens)
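A minimal sketch (not part of the commit) of what `convert_examples_to_features` does with word-level BIO tags: the first sub-token of each word keeps the word's tag, every further sub-token is labelled `X`, and the sequence is wrapped in `[CLS]`/`[SEP]` and zero-padded to `max_seq_length`. The toy tokenizer below is only a stand-in for the `BertTokenizer` used elsewhere in the repo, and the import path assumes the module location referenced by the training script.

```
from qurator.sbb_ner.ground_truth.data_processor import InputExample, convert_examples_to_features


class ToySubwordTokenizer:
    # Splits longer words into two pieces to mimic WordPiece sub-tokens.
    def tokenize(self, word):
        return [word] if len(word) < 4 else [word[:2], word[2:]]

    def convert_tokens_to_ids(self, tokens):
        # Deterministic toy ids; a real tokenizer uses its vocabulary.
        return [sum(map(ord, t)) % 1000 for t in tokens]


labels = ['UNK', 'B-LOC', 'B-ORG', 'B-PER', 'I-LOC', 'I-ORG', 'I-PER', 'O', 'X', '[CLS]', '[SEP]']
label_map = {label: i for i, label in enumerate(labels)}

example = InputExample(guid='demo-0', text_a=['Berlin', 'ist', 'gross'],
                       text_b=None, label=['B-LOC', 'O', 'O'])

features = convert_examples_to_features(example, label_map, max_seq_length=16,
                                         tokenizer=ToySubwordTokenizer())

print(features.tokens)        # ['[CLS]', 'Be', 'rlin', 'ist', 'gr', 'oss', '[SEP]']
print(features.label_id[:7])  # ids of ['[CLS]', 'B-LOC', 'X', 'O', 'O', 'X', '[SEP]']
```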
@ -0,0 +1,70 @@
import pandas as pd
import re
import click
import os


def read_gt(files, datasets):
    sentence_number = 100000
    sentence = ''
    gt_data = list()

    for filename, dataset in zip(files, datasets):
        gt_lines = [l.strip() for l in open(filename) if not l.startswith('<--')]

        word_number = 0

        for l in gt_lines:

            try:
                word, tag = l.split(' ')
            except ValueError:
                word = l.replace(' ', '_')
                tag = 'O'

            tag = tag.upper()

            tag = tag.replace('_', '-')
            tag = tag.replace('.', '-')

            if tag not in {'B-LOC', 'B-PER', 'I-PER', 'I-ORG', 'B-ORG', 'I-LOC'}:
                tag = 'O'

            gt_data.append((sentence_number, word_number, word, tag, dataset))

            if re.match(r'.*[.|?|!]$', word) \
                    and not re.match(r'[0-9]+[.]$', word) \
                    and not re.match(r'.*[0-9]+\s*$', sentence) \
                    and not re.match(r'.*\s+[\S]{1,2}$', sentence):

                sentence_number += 1
                sentence = ''
                word_number = 0
            else:
                word_number += 1
                sentence += ' ' + word

    return pd.DataFrame(gt_data, columns=['nsentence', 'nword', 'word', 'tag', 'dataset'])


@click.command()
@click.argument('path-to-ner-corpora', type=click.Path(exists=True), required=True, nargs=1)
@click.argument('ner-ground-truth-file', type=click.Path(), required=True, nargs=1)
def main(path_to_ner_corpora, ner_ground_truth_file):
    """
    Read Europeana historic NER ground-truth .bio files from directory <path-to-ner-corpora> and
    write the outcome of the data parsing to a pandas DataFrame
    that is stored as pickle in file <ner-ground-truth-file>.
    """

    os.makedirs(os.path.dirname(ner_ground_truth_file), exist_ok=True)

    gt_all = read_gt(['{}/enp_DE.sbb.bio/enp_DE.sbb.bio'.format(path_to_ner_corpora),
                      '{}/enp_DE.onb.bio/enp_DE.onb.bio'.format(path_to_ner_corpora),
                      '{}/enp_DE.lft.bio/enp_DE.lft.bio'.format(path_to_ner_corpora)], ['SBB', 'ONB', 'LFT'])

    gt_all.to_pickle(ner_ground_truth_file)


if __name__ == '__main__':
    main()
@ -0,0 +1,68 @@
import pandas as pd
import click
import os


def read_gt(files, datasets):
    sentence_number = 200000
    gt_data = list()

    for filename, dataset in zip(files, datasets):
        gt_lines = [l.strip() for l in open(filename)]

        word_number = 0

        for li in gt_lines:

            if li == '':

                if word_number > 0:
                    sentence_number += 1
                    word_number = 0

                continue

            if li.startswith('#'):
                continue

            _, word, tag, _ = li.split()

            tag = tag.upper()
            tag = tag.replace('_', '-')
            tag = tag.replace('.', '-')

            if len(tag) > 5:
                tag = tag[0:5]

            if tag not in {'B-LOC', 'B-PER', 'I-PER', 'I-ORG', 'B-ORG', 'I-LOC'}:
                tag = 'O'

            gt_data.append((sentence_number, word_number, word, tag, dataset))

            word_number += 1

    return pd.DataFrame(gt_data, columns=['nsentence', 'nword', 'word', 'tag', 'dataset'])


@click.command()
@click.argument('path-to-germ-eval', type=click.Path(exists=True), required=True, nargs=1)
@click.argument('germ-eval-ground-truth-file', type=click.Path(), required=True, nargs=1)
def main(path_to_germ_eval, germ_eval_ground_truth_file):
    """
    Read GermEval .tsv files from directory <path-to-germ-eval> and
    write the outcome of the data parsing to a pandas DataFrame
    that is stored as pickle in file <germ-eval-ground-truth-file>.
    """

    os.makedirs(os.path.dirname(germ_eval_ground_truth_file), exist_ok=True)

    gt_all = read_gt(['{}/NER-de-dev.tsv'.format(path_to_germ_eval),
                      '{}/NER-de-test.tsv'.format(path_to_germ_eval),
                      '{}/NER-de-train.tsv'.format(path_to_germ_eval)],
                     ['GERM-EVAL-DEV', 'GERM-EVAL-TEST', 'GERM-EVAL-TRAIN'])

    gt_all.to_pickle(germ_eval_ground_truth_file)


if __name__ == '__main__':
    main()
@ -0,0 +1,29 @@
import pandas as pd
import click
import os


@click.command()
@click.argument('files', nargs=-1, type=click.Path())
def main(files):
    """
    Join multiple pandas DataFrame pickles of NER ground-truth into one big file.
    """

    assert len(files) > 1

    gt = list()

    for filename in files[:-1]:

        gt.append(pd.read_pickle(filename))

    gt = pd.concat(gt, axis=0)

    os.makedirs(os.path.dirname(files[-1]), exist_ok=True)

    gt.to_pickle(files[-1])


if __name__ == '__main__':
    main()
@ -0,0 +1,68 @@
import pandas as pd
import click
import os


def read_gt(files, datasets):

    sentence_number = 1000000
    gt_data = list()

    for filename, dataset in zip(files, datasets):

        for li in open(filename, encoding='iso-8859-1'):

            li = li.strip()

            parts = li.split(' ')

            prev_tag = 'O'
            for word_number, pa in enumerate(parts):

                if len(pa) == 0:
                    continue

                word, pos, tag = pa.split('|')

                tag = tag.upper()
                tag = tag.replace('_', '-')
                tag = tag.replace('.', '-')

                if len(tag) > 5:
                    tag = tag[0:5]

                if tag not in {'B-LOC', 'B-PER', 'I-PER', 'I-ORG', 'B-ORG', 'I-LOC'}:
                    tag = 'O'

                if tag.startswith('I') and prev_tag == 'O':
                    tag = 'B' + tag[1:]

                prev_tag = tag
                gt_data.append((sentence_number, word_number, word, tag, dataset))

            sentence_number += 1

    return pd.DataFrame(gt_data, columns=['nsentence', 'nword', 'word', 'tag', 'dataset'])


@click.command()
@click.argument('path-to-wikiner', type=click.Path(exists=True), required=True, nargs=1)
@click.argument('wikiner-ground-truth-file', type=click.Path(), required=True, nargs=1)
def main(path_to_wikiner, wikiner_ground_truth_file):
    """
    Read WikiNER files from directory <path-to-wikiner> and
    write the outcome of the data parsing to a pandas DataFrame
    that is stored as pickle in file <wikiner-ground-truth-file>.
    """

    os.makedirs(os.path.dirname(wikiner_ground_truth_file), exist_ok=True)

    gt_all = read_gt(['{}/aij-wikiner-de-wp2'.format(path_to_wikiner),
                      '{}/aij-wikiner-de-wp3'.format(path_to_wikiner)],
                     ['WIKINER-WP2', 'WIKINER-WP3'])

    gt_all.to_pickle(wikiner_ground_truth_file)


if __name__ == '__main__':
    main()
@ -0,0 +1 @@
__import__('pkg_resources').declare_namespace(__name__)
@ -0,0 +1,693 @@
|
|||||||
|
from __future__ import absolute_import, division, print_function
|
||||||
|
# from inspect import currentframe
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
import json
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.nn.functional as F
|
||||||
|
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
|
||||||
|
from pytorch_pretrained_bert.modeling import (CONFIG_NAME, # WEIGHTS_NAME,
|
||||||
|
BertConfig,
|
||||||
|
BertForTokenClassification)
|
||||||
|
from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
|
||||||
|
from pytorch_pretrained_bert.tokenization import BertTokenizer
|
||||||
|
|
||||||
|
|
||||||
|
from conlleval import evaluate as conll_eval
|
||||||
|
|
||||||
|
from tqdm import tqdm, trange
|
||||||
|
|
||||||
|
from qurator.sbb_ner.ground_truth.data_processor import NerProcessor, WikipediaNerProcessor
|
||||||
|
|
||||||
|
from sklearn.model_selection import GroupKFold
|
||||||
|
|
||||||
|
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||||
|
datefmt='%m/%d/%Y %H:%M:%S',
|
||||||
|
level=logging.INFO)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def model_train(bert_model, max_seq_length, do_lower_case,
|
||||||
|
num_train_epochs, train_batch_size, gradient_accumulation_steps,
|
||||||
|
learning_rate, weight_decay, loss_scale, warmup_proportion,
|
||||||
|
processor, device, n_gpu, fp16, cache_dir, local_rank,
|
||||||
|
dry_run, no_cuda, output_dir=None):
|
||||||
|
|
||||||
|
label_map = processor.get_labels()
|
||||||
|
|
||||||
|
if gradient_accumulation_steps < 1:
|
||||||
|
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
|
||||||
|
gradient_accumulation_steps))
|
||||||
|
|
||||||
|
train_batch_size = train_batch_size // gradient_accumulation_steps
|
||||||
|
|
||||||
|
train_dataloader = processor.get_train_examples(train_batch_size, local_rank)
|
||||||
|
|
||||||
|
# Batch sampler divides by batch_size!
|
||||||
|
num_train_optimization_steps = int(len(train_dataloader)*num_train_epochs/gradient_accumulation_steps)
|
||||||
|
|
||||||
|
if local_rank != -1:
|
||||||
|
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
|
||||||
|
|
||||||
|
# Prepare model
|
||||||
|
cache_dir = cache_dir if cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE),
|
||||||
|
'distributed_{}'.format(local_rank))
|
||||||
|
|
||||||
|
model = BertForTokenClassification.from_pretrained(bert_model, cache_dir=cache_dir, num_labels=len(label_map))
|
||||||
|
|
||||||
|
if fp16:
|
||||||
|
model.half()
|
||||||
|
|
||||||
|
model.to(device)
|
||||||
|
|
||||||
|
if local_rank != -1:
|
||||||
|
try:
|
||||||
|
from apex.parallel import DistributedDataParallel as DDP
|
||||||
|
except ImportError:
|
||||||
|
raise ImportError(
|
||||||
|
"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
|
||||||
|
|
||||||
|
model = DDP(model)
|
||||||
|
elif n_gpu > 1:
|
||||||
|
model = torch.nn.DataParallel(model)
|
||||||
|
|
||||||
|
param_optimizer = list(model.named_parameters())
|
||||||
|
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
|
||||||
|
optimizer_grouped_parameters = [
|
||||||
|
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
|
||||||
|
'weight_decay': weight_decay},
|
||||||
|
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||||
|
]
|
||||||
|
|
||||||
|
if fp16:
|
||||||
|
try:
|
||||||
|
from apex.optimizers import FP16_Optimizer
|
||||||
|
from apex.optimizers import FusedAdam
|
||||||
|
except ImportError:
|
||||||
|
raise ImportError(
|
||||||
|
"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
|
||||||
|
|
||||||
|
optimizer = FusedAdam(optimizer_grouped_parameters,
|
||||||
|
lr=learning_rate,
|
||||||
|
bias_correction=False,
|
||||||
|
max_grad_norm=1.0)
|
||||||
|
if loss_scale == 0:
|
||||||
|
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
|
||||||
|
else:
|
||||||
|
optimizer = FP16_Optimizer(optimizer, static_loss_scale=loss_scale)
|
||||||
|
|
||||||
|
warmup_linear = WarmupLinearSchedule(warmup=warmup_proportion, t_total=num_train_optimization_steps)
|
||||||
|
else:
|
||||||
|
optimizer = BertAdam(optimizer_grouped_parameters, lr=learning_rate, warmup=warmup_proportion,
|
||||||
|
t_total=num_train_optimization_steps)
|
||||||
|
warmup_linear = None
|
||||||
|
|
||||||
|
global_step = 0
|
||||||
|
logger.info("***** Running training *****")
|
||||||
|
logger.info(" Num examples = %d", len(train_dataloader))
|
||||||
|
logger.info(" Batch size = %d", train_batch_size)
|
||||||
|
logger.info(" Num steps = %d", num_train_optimization_steps)
|
||||||
|
logger.info(" Num epochs = %d", num_train_epochs)
|
||||||
|
|
||||||
|
model_config = {"bert_model": bert_model, "do_lower": do_lower_case,
|
||||||
|
"max_seq_length": max_seq_length, "label_map": label_map}
|
||||||
|
|
||||||
|
def save_model(lh):
|
||||||
|
|
||||||
|
if output_dir is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
output_model_file = os.path.join(output_dir, "pytorch_model_ep{}.bin".format(ep))
|
||||||
|
|
||||||
|
# Save a trained model and the associated configuration
|
||||||
|
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self
|
||||||
|
|
||||||
|
torch.save(model_to_save.state_dict(), output_model_file)
|
||||||
|
|
||||||
|
output_config_file = os.path.join(output_dir, CONFIG_NAME)
|
||||||
|
with open(output_config_file, 'w') as f:
|
||||||
|
f.write(model_to_save.config.to_json_string())
|
||||||
|
|
||||||
|
json.dump(model_config, open(os.path.join(output_dir, "model_config.json"), "w"))
|
||||||
|
|
||||||
|
lh = pd.DataFrame(lh, columns=['global_step', 'loss'])
|
||||||
|
|
||||||
|
loss_history_file = os.path.join(output_dir, "loss_ep{}.pkl".format(ep))
|
||||||
|
|
||||||
|
lh.to_pickle(loss_history_file)
|
||||||
|
|
||||||
|
def load_model(epoch):
|
||||||
|
|
||||||
|
if output_dir is None:
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
output_model_file = os.path.join(output_dir, "pytorch_model_ep{}.bin".format(epoch))
|
||||||
|
|
||||||
|
if not os.path.exists(output_model_file):
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
logger.info("Loading epoch {} from disk...".format(epoch))
|
||||||
|
model.load_state_dict(torch.load(output_model_file,
|
||||||
|
map_location=lambda storage, loc: storage if no_cuda else None))
|
||||||
|
return True
|
||||||
|
|
||||||
|
model.train()
|
||||||
|
for ep in trange(1, int(num_train_epochs) + 1, desc="Epoch"):
|
||||||
|
|
||||||
|
if dry_run and ep > 1:
|
||||||
|
logger.info("Dry run. Stop.")
|
||||||
|
break
|
||||||
|
|
||||||
|
if load_model(ep):
|
||||||
|
global_step += len(train_dataloader) // gradient_accumulation_steps
|
||||||
|
continue
|
||||||
|
|
||||||
|
loss_history = list()
|
||||||
|
tr_loss = 0
|
||||||
|
nb_tr_examples, nb_tr_steps = 0, 0
|
||||||
|
with tqdm(total=len(train_dataloader), desc=f"Epoch {ep}") as pbar:
|
||||||
|
|
||||||
|
for step, batch in enumerate(train_dataloader):
|
||||||
|
|
||||||
|
batch = tuple(t.to(device) for t in batch)
|
||||||
|
|
||||||
|
input_ids, input_mask, segment_ids, label_ids = batch
|
||||||
|
|
||||||
|
loss = model(input_ids, segment_ids, input_mask, label_ids)
|
||||||
|
|
||||||
|
if n_gpu > 1:
|
||||||
|
loss = loss.mean() # mean() to average on multi-gpu.
|
||||||
|
if gradient_accumulation_steps > 1:
|
||||||
|
loss = loss / gradient_accumulation_steps
|
||||||
|
|
||||||
|
if fp16:
|
||||||
|
optimizer.backward(loss)
|
||||||
|
else:
|
||||||
|
loss.backward()
|
||||||
|
|
||||||
|
loss_history.append((global_step, loss.item()))
|
||||||
|
|
||||||
|
tr_loss += loss.item()
|
||||||
|
nb_tr_examples += input_ids.size(0)
|
||||||
|
nb_tr_steps += 1
|
||||||
|
pbar.update(1)
|
||||||
|
mean_loss = tr_loss * gradient_accumulation_steps / nb_tr_steps
|
||||||
|
pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
|
||||||
|
|
||||||
|
if dry_run and len(loss_history) > 2:
|
||||||
|
logger.info("Dry run. Stop.")
|
||||||
|
break
|
||||||
|
|
||||||
|
if (step + 1) % gradient_accumulation_steps == 0:
|
||||||
|
if fp16:
|
||||||
|
# modify learning rate with special warm up BERT uses
|
||||||
|
# if args.fp16 is False, BertAdam is used that handles this automatically
|
||||||
|
lr_this_step = learning_rate * warmup_linear.get_lr(global_step, warmup_proportion)
|
||||||
|
|
||||||
|
for param_group in optimizer.param_groups:
|
||||||
|
param_group['lr'] = lr_this_step
|
||||||
|
|
||||||
|
optimizer.step()
|
||||||
|
optimizer.zero_grad()
|
||||||
|
global_step += 1
|
||||||
|
|
||||||
|
save_model(loss_history)
|
||||||
|
|
||||||
|
return model, model_config
|
||||||
|
|
||||||
|
|
||||||
|
def model_eval(batch_size, label_map, processor, device, num_train_epochs=1, output_dir=None, model=None,
|
||||||
|
local_rank=-1, no_cuda=False, dry_run=False):
|
||||||
|
|
||||||
|
output_eval_file = None
|
||||||
|
if output_dir is not None:
|
||||||
|
output_eval_file = os.path.join(output_dir, processor.get_evaluation_file())
|
||||||
|
logger.info('Write evaluation results to: {}'.format(output_eval_file))
|
||||||
|
|
||||||
|
dataloader = processor.get_dev_examples(batch_size, local_rank)
|
||||||
|
|
||||||
|
logger.info("***** Running evaluation *****")
|
||||||
|
logger.info(" Num examples = %d", len(dataloader))
|
||||||
|
logger.info(" Batch size = %d", batch_size)
|
||||||
|
|
||||||
|
results = list()
|
||||||
|
|
||||||
|
output_config_file = None
|
||||||
|
if output_dir is not None:
|
||||||
|
output_config_file = os.path.join(output_dir, CONFIG_NAME)
|
||||||
|
|
||||||
|
for ep in trange(1, int(num_train_epochs) + 1, desc="Epoch"):
|
||||||
|
|
||||||
|
if dry_run and ep > 1:
|
||||||
|
logger.info("Dry run. Stop.")
|
||||||
|
break
|
||||||
|
|
||||||
|
if output_config_file is not None:
|
||||||
|
# Load a trained model and config that you have fine-tuned
|
||||||
|
output_model_file = os.path.join(output_dir, "pytorch_model_ep{}.bin".format(ep))
|
||||||
|
|
||||||
|
if not os.path.exists(output_model_file):
|
||||||
|
logger.info("Stopping at epoch {} since model file is missing.".format(ep))
|
||||||
|
break
|
||||||
|
|
||||||
|
config = BertConfig(output_config_file)
|
||||||
|
model = BertForTokenClassification(config, num_labels=len(label_map))
|
||||||
|
model.load_state_dict(torch.load(output_model_file,
|
||||||
|
map_location=lambda storage, loc: storage if no_cuda else None))
|
||||||
|
model.to(device)
|
||||||
|
|
||||||
|
if model is None:
|
||||||
|
raise ValueError('Model required for evaluation.')
|
||||||
|
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
y_pred, y_true = model_predict_compare(dataloader, device, label_map, model, dry_run)
|
||||||
|
|
||||||
|
lines = ['empty ' + 'XXX ' + v + ' ' + p for yt, yp in zip(y_true, y_pred) for v, p in zip(yt, yp)]
|
||||||
|
|
||||||
|
res = conll_eval(lines)
|
||||||
|
|
||||||
|
# print(res)
|
||||||
|
|
||||||
|
evals = \
|
||||||
|
pd.concat([pd.DataFrame.from_dict(res['overall']['evals'], orient='index', columns=['ALL']),
|
||||||
|
pd.DataFrame.from_dict(res['slots']['LOC']['evals'], orient='index', columns=['LOC']),
|
||||||
|
pd.DataFrame.from_dict(res['slots']['PER']['evals'], orient='index', columns=['PER']),
|
||||||
|
pd.DataFrame.from_dict(res['slots']['ORG']['evals'], orient='index', columns=['ORG']),
|
||||||
|
], axis=1).T
|
||||||
|
|
||||||
|
stats = \
|
||||||
|
pd.concat(
|
||||||
|
[pd.DataFrame.from_dict(res['overall']['stats'], orient='index', columns=['ALL']),
|
||||||
|
pd.DataFrame.from_dict(res['slots']['LOC']['stats'], orient='index', columns=['LOC']),
|
||||||
|
pd.DataFrame.from_dict(res['slots']['PER']['stats'], orient='index', columns=['PER']),
|
||||||
|
pd.DataFrame.from_dict(res['slots']['ORG']['stats'], orient='index', columns=['ORG'])],
|
||||||
|
axis=1, sort=True).T
|
||||||
|
|
||||||
|
evals['epoch'] = ep
|
||||||
|
stats['epoch'] = ep
|
||||||
|
|
||||||
|
results.append(pd.concat([evals.reset_index().set_index(['index', 'epoch']),
|
||||||
|
stats.reset_index().set_index(['index', 'epoch'])], axis=1))
|
||||||
|
|
||||||
|
if output_eval_file is not None:
|
||||||
|
pd.concat(results).to_pickle(output_eval_file)
|
||||||
|
|
||||||
|
results = pd.concat(results)
|
||||||
|
print(results)
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
def model_predict_compare(dataloader, device, label_map, model, dry_run=False):
|
||||||
|
|
||||||
|
y_true = []
|
||||||
|
y_pred = []
|
||||||
|
covered = set()
|
||||||
|
for input_ids, input_mask, segment_ids, label_ids in tqdm(dataloader, desc="Evaluating"):
|
||||||
|
input_ids = input_ids.to(device)
|
||||||
|
input_mask = input_mask.to(device)
|
||||||
|
segment_ids = segment_ids.to(device)
|
||||||
|
label_ids = label_ids.to(device)
|
||||||
|
|
||||||
|
with torch.no_grad():
|
||||||
|
logits = model(input_ids, segment_ids, input_mask)
|
||||||
|
|
||||||
|
logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
|
||||||
|
logits = logits.detach().cpu().numpy()
|
||||||
|
label_ids = label_ids.to('cpu').numpy()
|
||||||
|
input_mask = input_mask.to('cpu').numpy()
|
||||||
|
|
||||||
|
for i, mask in enumerate(input_mask):
|
||||||
|
temp_1 = []
|
||||||
|
temp_2 = []
|
||||||
|
for j, m in enumerate(mask):
|
||||||
|
if j == 0:  # skip the first token since it's [CLS]
|
||||||
|
continue
|
||||||
|
if m:
|
||||||
|
if label_map[label_ids[i][j]] != "X":
|
||||||
|
temp_1.append(label_map[label_ids[i][j]])
|
||||||
|
temp_2.append(label_map[logits[i][j]])
|
||||||
|
else:
|
||||||
|
temp_1.pop()
|
||||||
|
temp_2.pop()
|
||||||
|
y_true.append(temp_1)
|
||||||
|
y_pred.append(temp_2)
|
||||||
|
|
||||||
|
covered = covered.union(set(temp_1))
|
||||||
|
break
|
||||||
|
|
||||||
|
if dry_run:
|
||||||
|
|
||||||
|
if 'I-LOC' not in covered:
|
||||||
|
continue
|
||||||
|
if 'I-ORG' not in covered:
|
||||||
|
continue
|
||||||
|
if 'I-PER' not in covered:
|
||||||
|
continue
|
||||||
|
|
||||||
|
break
|
||||||
|
return y_pred, y_true
|
||||||
|
|
||||||
|
|
||||||
|
def model_predict(dataloader, device, label_map, model):
|
||||||
|
|
||||||
|
y_pred = []
|
||||||
|
for input_ids, input_mask, segment_ids, label_ids in dataloader:
|
||||||
|
input_ids = input_ids.to(device)
|
||||||
|
input_mask = input_mask.to(device)
|
||||||
|
segment_ids = segment_ids.to(device)
|
||||||
|
|
||||||
|
with torch.no_grad():
|
||||||
|
logits = model(input_ids, segment_ids, input_mask)
|
||||||
|
|
||||||
|
logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
|
||||||
|
logits = logits.detach().cpu().numpy()
|
||||||
|
input_mask = input_mask.to('cpu').numpy()
|
||||||
|
|
||||||
|
for i, mask in enumerate(input_mask):
|
||||||
|
temp_2 = []
|
||||||
|
for j, m in enumerate(mask):
|
||||||
|
if j == 0:  # skip the first token since it's [CLS]
|
||||||
|
continue
|
||||||
|
if m:
|
||||||
|
temp_2.append(label_map[logits[i][j]])
|
||||||
|
else:
|
||||||
|
temp_2.pop()  # drop the last appended token since it's [SEP]
|
||||||
|
y_pred.append(temp_2)
|
||||||
|
break
|
||||||
|
|
||||||
|
return y_pred
|
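# Hedged illustration (example values are assumed, not taken from the original code):
# for a dataloader built from the sentence "Angela Merkel besucht Paris", model_predict
# would return something like [['B-PER', 'X', 'I-PER', 'O', 'B-LOC']] -- one label list
# per sentence, aligned to the unmasked WordPiece positions between [CLS] and [SEP],
# with 'X' marking subword continuations.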
||||||
|
|
||||||
|
|
||||||
|
def get_device(local_rank=-1, no_cuda=False):
|
||||||
|
if local_rank == -1 or no_cuda:
|
||||||
|
device = torch.device("cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
|
||||||
|
n_gpu = torch.cuda.device_count()
|
||||||
|
else:
|
||||||
|
torch.cuda.set_device(local_rank)
|
||||||
|
device = torch.device("cuda", local_rank)
|
||||||
|
n_gpu = 1
|
||||||
|
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
|
||||||
|
torch.distributed.init_process_group(backend='nccl')
|
||||||
|
return device, n_gpu
|
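# Hedged usage sketch (assumes a single-node setup; the distributed branch requires
# the NCCL backend): for a plain single-GPU or CPU run one would typically call
#
#     device, n_gpu = get_device(local_rank=-1, no_cuda=False)
#     model.to(device)
#
# while distributed training passes the process-local rank instead of -1.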
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
|
||||||
|
parser = get_arg_parser()
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
do_eval = len(args.dev_sets) > 0 and not args.do_cross_validation
|
||||||
|
do_train = len(args.train_sets) > 0 and not args.do_cross_validation
|
||||||
|
|
||||||
|
device, n_gpu = get_device(args.local_rank, args.no_cuda)
|
||||||
|
|
||||||
|
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
|
||||||
|
device, n_gpu, bool(args.local_rank != -1), args.fp16))
|
||||||
|
|
||||||
|
random.seed(args.seed)
|
||||||
|
np.random.seed(args.seed)
|
||||||
|
torch.manual_seed(args.seed)
|
||||||
|
|
||||||
|
if not do_train and not do_eval and not args.do_cross_validation:
|
||||||
|
raise ValueError("At least one of `do_train` or `do_eval` must be True.")
|
||||||
|
|
||||||
|
if not os.path.exists(args.output_dir):
|
||||||
|
os.makedirs(args.output_dir)
|
||||||
|
|
||||||
|
task_name = args.task_name.lower()
|
||||||
|
|
||||||
|
processors = {"ner": NerProcessor, "wikipedia-ner": WikipediaNerProcessor}
|
||||||
|
|
||||||
|
if task_name not in processors:
|
||||||
|
raise ValueError("Task not found: %s" % task_name)
|
||||||
|
|
||||||
|
if args.do_cross_validation:
|
||||||
|
|
||||||
|
cross_val_result_file = "cross_validation_results.pkl"
|
||||||
|
|
||||||
|
cross_val_result_file = os.path.join(args.output_dir, cross_val_result_file)
|
||||||
|
|
||||||
|
sets = set(args.train_sets.split('|')) if args.train_sets is not None else set()
|
||||||
|
|
||||||
|
gt = pd.read_pickle(args.gt_file)
|
||||||
|
|
||||||
|
gt = gt.loc[gt.dataset.isin(sets)]
|
||||||
|
|
||||||
|
k_fold = GroupKFold(n_splits=args.n_splits)
|
||||||
|
|
||||||
|
eval_results = list()
|
||||||
|
|
||||||
|
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
|
||||||
|
|
||||||
|
for ep in range(1, int(args.num_train_epochs) + 1):
|
||||||
|
|
||||||
|
for sp, (train, test) in enumerate(k_fold.split(X=gt, groups=gt.nsentence)):
|
||||||
|
|
||||||
|
tr = gt.iloc[train].copy()
|
||||||
|
te = gt.iloc[test].copy()
|
||||||
|
|
||||||
|
tr['dataset'] = 'TRAIN'
|
||||||
|
te['dataset'] = 'TEST'
|
||||||
|
|
||||||
|
gt_tmp = pd.concat([tr, te])
|
||||||
|
|
||||||
|
processor = \
|
||||||
|
processors[task_name](train_sets='TRAIN', dev_sets='TEST', test_sets='TEST',
|
||||||
|
gt=gt_tmp, max_seq_length=args.max_seq_length,
|
||||||
|
tokenizer=tokenizer, data_epochs=args.num_data_epochs,
|
||||||
|
epoch_size=args.epoch_size)
|
||||||
|
|
||||||
|
model, model_config = \
|
||||||
|
model_train(bert_model=args.bert_model, max_seq_length=args.max_seq_length,
|
||||||
|
do_lower_case=args.do_lower_case, num_train_epochs=ep,
|
||||||
|
train_batch_size=args.train_batch_size,
|
||||||
|
gradient_accumulation_steps=args.gradient_accumulation_steps,
|
||||||
|
learning_rate=args.learning_rate, weight_decay=args.weight_decay,
|
||||||
|
loss_scale=args.loss_scale, warmup_proportion=args.warmup_proportion,
|
||||||
|
processor=processor, device=device, n_gpu=n_gpu, fp16=args.fp16,
|
||||||
|
cache_dir=args.cache_dir, local_rank=args.local_rank, dry_run=args.dry_run,
|
||||||
|
no_cuda=args.no_cuda)
|
||||||
|
|
||||||
|
label_map = {v: k for k, v in model_config['label_map'].items()}
|
||||||
|
|
||||||
|
eval_result =\
|
||||||
|
model_eval(model=model, label_map=label_map, processor=processor, device=device,
|
||||||
|
batch_size=args.eval_batch_size, local_rank=args.local_rank,
|
||||||
|
no_cuda=args.no_cuda, dry_run=args.dry_run).reset_index()
|
||||||
|
|
||||||
|
eval_result['split'] = sp
|
||||||
|
eval_result['epoch'] = ep
|
||||||
|
eval_results.append(eval_result)
|
||||||
|
|
||||||
|
del model # release CUDA memory
|
||||||
|
|
||||||
|
pd.concat(eval_results).to_pickle(cross_val_result_file)
|
||||||
|
|
||||||
|
if do_train:
|
||||||
|
|
||||||
|
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
|
||||||
|
|
||||||
|
processor = \
|
||||||
|
processors[task_name](train_sets=args.train_sets, dev_sets=args.dev_sets, test_sets=args.test_sets,
|
||||||
|
gt_file=args.gt_file, max_seq_length=args.max_seq_length,
|
||||||
|
tokenizer=tokenizer, data_epochs=args.num_data_epochs,
|
||||||
|
epoch_size=args.epoch_size)
|
||||||
|
|
||||||
|
model_train(bert_model=args.bert_model, output_dir=args.output_dir, max_seq_length=args.max_seq_length,
|
||||||
|
do_lower_case=args.do_lower_case, num_train_epochs=args.num_train_epochs,
|
||||||
|
train_batch_size=args.train_batch_size,
|
||||||
|
gradient_accumulation_steps=args.gradient_accumulation_steps,
|
||||||
|
learning_rate=args.learning_rate, weight_decay=args.weight_decay, loss_scale=args.loss_scale,
|
||||||
|
warmup_proportion=args.warmup_proportion, processor=processor, device=device, n_gpu=n_gpu,
|
||||||
|
fp16=args.fp16, cache_dir=args.cache_dir, local_rank=args.local_rank, dry_run=args.dry_run,
|
||||||
|
no_cuda=args.no_cuda)
|
||||||
|
|
||||||
|
if do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
|
||||||
|
|
||||||
|
model_config = json.load(open(os.path.join(args.output_dir, "model_config.json"), "r"))
|
||||||
|
|
||||||
|
label_to_id = model_config['label_map']
|
||||||
|
|
||||||
|
label_map = {v: k for k, v in model_config['label_map'].items()}
|
||||||
|
|
||||||
|
tokenizer = BertTokenizer.from_pretrained(model_config['bert_model'],
|
||||||
|
do_lower_case=model_config['do_lower'])
|
||||||
|
|
||||||
|
processor = \
|
||||||
|
processors[task_name](train_sets=None, dev_sets=args.dev_sets, test_sets=args.test_sets,
|
||||||
|
gt_file=args.gt_file, max_seq_length=model_config['max_seq_length'],
|
||||||
|
tokenizer=tokenizer, data_epochs=args.num_data_epochs,
|
||||||
|
epoch_size=args.epoch_size, label_map=label_to_id)
|
||||||
|
|
||||||
|
model_eval(label_map=label_map, processor=processor, device=device, num_train_epochs=args.num_train_epochs,
|
||||||
|
output_dir=args.output_dir, batch_size=args.eval_batch_size, local_rank=args.local_rank,
|
||||||
|
no_cuda=args.no_cuda, dry_run=args.dry_run)
|
||||||
|
|
||||||
|
|
||||||
|
def get_arg_parser():
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
|
||||||
|
|
||||||
|
parser.add_argument("--gt_file",
|
||||||
|
default=None,
|
||||||
|
type=str,
|
||||||
|
required=True,
|
||||||
|
help="The pickle file that contains all NER ground truth as pandas DataFrame."
|
||||||
|
" Required columns: ['nsentence', 'nword', 'word', 'tag', 'dataset]."
|
||||||
|
" The selection of training, test and dev set is performed on the 'dataset' column.")
|
||||||
|
|
||||||
|
parser.add_argument("--train_sets",
|
||||||
|
default='',
|
||||||
|
type=str,
|
||||||
|
required=False,
|
||||||
|
help="Specifiy one or more tags from the dataset column in order to mark samples"
|
||||||
|
" that belong to the training set. Example: 'GERM-EVAL-TRAIN|DE-CONLL-TRAIN'. ")
|
||||||
|
|
||||||
|
parser.add_argument("--dev_sets",
|
||||||
|
default='',
|
||||||
|
type=str,
|
||||||
|
required=False,
|
||||||
|
help="Specifiy one or more tags from the dataset column in order to mark samples"
|
||||||
|
" that belong to the dev set. Example: 'GERM-EVAL-DEV|DE-CONLL-TESTA'. ")
|
||||||
|
|
||||||
|
parser.add_argument("--test_sets",
|
||||||
|
default='',
|
||||||
|
type=str,
|
||||||
|
required=False,
|
||||||
|
help="Specifiy one or more tags from the dataset column in order to mark samples"
|
||||||
|
" that belong to the test set. Example: 'GERM-EVAL-TEST|DE-CONLL-TESTB'. ")
|
||||||
|
|
||||||
|
parser.add_argument("--bert_model", default=None, type=str, required=False,
|
||||||
|
help="Bert pre-trained model selected in the list: bert-base-uncased, "
|
||||||
|
"bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
|
||||||
|
"bert-base-multilingual-cased, bert-base-chinese.")
|
||||||
|
|
||||||
|
parser.add_argument("--task_name",
|
||||||
|
default=None,
|
||||||
|
type=str,
|
||||||
|
required=True,
|
||||||
|
help="The name of the task to train.")
|
||||||
|
|
||||||
|
parser.add_argument("--output_dir",
|
||||||
|
default=None,
|
||||||
|
type=str,
|
||||||
|
required=False,
|
||||||
|
help="The output directory where the model predictions and checkpoints will be written.")
|
||||||
|
|
||||||
|
# Other parameters
|
||||||
|
parser.add_argument("--cache_dir",
|
||||||
|
default="",
|
||||||
|
type=str,
|
||||||
|
help="Where do you want to store the pre-trained models downloaded from s3")
|
||||||
|
|
||||||
|
parser.add_argument("--max_seq_length",
|
||||||
|
default=128,
|
||||||
|
type=int,
|
||||||
|
help="The maximum total input sequence length after WordPiece tokenization. \n"
|
||||||
|
"Sequences longer than this will be truncated, and sequences shorter \n"
|
||||||
|
"than this will be padded.")
|
||||||
|
|
||||||
|
parser.add_argument("--do_lower_case",
|
||||||
|
action='store_true',
|
||||||
|
help="Set this flag if you are using an uncased model.")
|
||||||
|
|
||||||
|
parser.add_argument("--train_batch_size",
|
||||||
|
default=32,
|
||||||
|
type=int,
|
||||||
|
help="Total batch size for training.")
|
||||||
|
|
||||||
|
parser.add_argument("--eval_batch_size",
|
||||||
|
default=8,
|
||||||
|
type=int,
|
||||||
|
help="Total batch size for eval.")
|
||||||
|
|
||||||
|
parser.add_argument("--learning_rate",
|
||||||
|
default=3e-5,
|
||||||
|
type=float,
|
||||||
|
help="The initial learning rate for Adam.")
|
||||||
|
|
||||||
|
parser.add_argument("--weight_decay",
|
||||||
|
default=0.01,
|
||||||
|
type=float,
|
||||||
|
help="Weight decay for Adam.")
|
||||||
|
|
||||||
|
parser.add_argument("--num_train_epochs",
|
||||||
|
default=3.0,
|
||||||
|
type=float,
|
||||||
|
help="Total number of training epochs to perform/evaluate.")
|
||||||
|
|
||||||
|
parser.add_argument("--num_data_epochs",
|
||||||
|
default=1.0,
|
||||||
|
type=float,
|
||||||
|
help="Re-cycle data after num_data_epochs.")
|
||||||
|
|
||||||
|
parser.add_argument("--epoch_size",
|
||||||
|
default=10000,
|
||||||
|
type=float,
|
||||||
|
help="Size of one epoch.")
|
||||||
|
|
||||||
|
parser.add_argument("--do_cross_validation",
|
||||||
|
action='store_true',
|
||||||
|
help="Do cross-validation.")
|
||||||
|
|
||||||
|
parser.add_argument("--n_splits",
|
||||||
|
default=5,
|
||||||
|
type=int,
|
||||||
|
help="Number of folds in cross_validation.")
|
||||||
|
|
||||||
|
parser.add_argument("--warmup_proportion",
|
||||||
|
default=0.1,
|
||||||
|
type=float,
|
||||||
|
help="Proportion of training to perform linear learning rate warmup for. "
|
||||||
|
"E.g., 0.1 = 10%% of training.")
|
||||||
|
|
||||||
|
parser.add_argument("--no_cuda",
|
||||||
|
action='store_true',
|
||||||
|
help="Whether not to use CUDA when available")
|
||||||
|
|
||||||
|
parser.add_argument("--dry_run",
|
||||||
|
action='store_true',
|
||||||
|
help="Test mode.")
|
||||||
|
|
||||||
|
parser.add_argument("--local_rank",
|
||||||
|
type=int,
|
||||||
|
default=-1,
|
||||||
|
help="local_rank for distributed training on gpus")
|
||||||
|
|
||||||
|
parser.add_argument('--seed',
|
||||||
|
type=int,
|
||||||
|
default=42,
|
||||||
|
help="random seed for initialization")
|
||||||
|
|
||||||
|
parser.add_argument('--gradient_accumulation_steps',
|
||||||
|
type=int,
|
||||||
|
default=1,
|
||||||
|
help="Number of updates steps to accumulate before performing a backward/update pass.")
|
||||||
|
|
||||||
|
parser.add_argument('--fp16',
|
||||||
|
action='store_true',
|
||||||
|
help="Whether to use 16-bit float precision instead of 32-bit")
|
||||||
|
|
||||||
|
parser.add_argument('--loss_scale',
|
||||||
|
type=float, default=0,
|
||||||
|
help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
|
||||||
|
"0 (default value): dynamic loss scaling.\n"
|
||||||
|
"Positive power of 2: static loss scaling value.\n")
|
||||||
|
return parser
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
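# Hedged example invocation of the bert-ner entry point declared in setup.py
# (the file name and output directory below are placeholders; the flags and the
# dataset tags are the ones defined in get_arg_parser above):
#
#   bert-ner --task_name ner --gt_file gt.pkl \
#            --train_sets DE-CONLL-TRAIN --dev_sets DE-CONLL-TESTA \
#            --bert_model bert-base-multilingual-cased \
#            --output_dir my-model-dir --num_train_epochs 3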
@ -0,0 +1 @@
|
|||||||
|
__import__('pkg_resources').declare_namespace(__name__)
|
@ -0,0 +1,353 @@
|
|||||||
|
import os
|
||||||
|
from flask import Flask, send_from_directory, redirect, jsonify, request
|
||||||
|
import pandas as pd
|
||||||
|
from sqlite3 import Error
|
||||||
|
import sqlite3
|
||||||
|
import html
|
||||||
|
import json
|
||||||
|
import torch
|
||||||
|
from somajo import Tokenizer, SentenceSplitter
|
||||||
|
|
||||||
|
from qurator.sbb_ner.models.bert import get_device, model_predict
|
||||||
|
from qurator.sbb_ner.ground_truth.data_processor import NerProcessor, convert_examples_to_features
|
||||||
|
from pytorch_pretrained_bert.tokenization import BertTokenizer
|
||||||
|
from pytorch_pretrained_bert.modeling import (CONFIG_NAME,
|
||||||
|
BertConfig,
|
||||||
|
BertForTokenClassification)
|
||||||
|
|
||||||
|
app = Flask(__name__)
|
||||||
|
|
||||||
|
app.config.from_json('config.json')
|
||||||
|
|
||||||
|
|
||||||
|
class Digisam:
|
||||||
|
|
||||||
|
_conn = None
|
||||||
|
|
||||||
|
def __init__(self, data_path):
|
||||||
|
|
||||||
|
self._data_path = data_path
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def create_connection(db_file):
|
||||||
|
try:
|
||||||
|
conn = sqlite3.connect(db_file, check_same_thread=False)
|
||||||
|
|
||||||
|
conn.execute('pragma journal_mode=wal')
|
||||||
|
|
||||||
|
return conn
|
||||||
|
except Error as e:
|
||||||
|
print(e)
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
def get(self, ppn):
|
||||||
|
|
||||||
|
if Digisam._conn is None:
|
||||||
|
Digisam._conn = self.create_connection(self._data_path)
|
||||||
|
|
||||||
|
df = pd.read_sql_query("select file_name, text from text where ppn=?;", Digisam._conn, params=(ppn,)). \
|
||||||
|
sort_values('file_name')
|
||||||
|
|
||||||
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
class NERPredictor:
|
||||||
|
|
||||||
|
def __init__(self, model_dir, batch_size, epoch, max_seq_length=128, local_rank=-1, no_cuda=False):
|
||||||
|
|
||||||
|
self._batch_size = batch_size
|
||||||
|
self._local_rank = local_rank
|
||||||
|
self._max_seq_length = max_seq_length
|
||||||
|
|
||||||
|
self._device, self._n_gpu = get_device(no_cuda=no_cuda)
|
||||||
|
|
||||||
|
self._model_config = json.load(open(os.path.join(model_dir, "model_config.json"), "r"))
|
||||||
|
|
||||||
|
self._label_to_id = self._model_config['label_map']
|
||||||
|
|
||||||
|
self._label_map = {v: k for k, v in self._model_config['label_map'].items()}
|
||||||
|
|
||||||
|
self._bert_tokenizer = \
|
||||||
|
BertTokenizer.from_pretrained(model_dir,
|
||||||
|
do_lower_case=self._model_config['do_lower'])
|
||||||
|
|
||||||
|
output_config_file = os.path.join(model_dir, CONFIG_NAME)
|
||||||
|
|
||||||
|
output_model_file = os.path.join(model_dir, "pytorch_model_ep{}.bin".format(epoch))
|
||||||
|
|
||||||
|
config = BertConfig(output_config_file)
|
||||||
|
|
||||||
|
self._model = BertForTokenClassification(config, num_labels=len(self._label_map))
|
||||||
|
self._model.load_state_dict(torch.load(output_model_file,
|
||||||
|
map_location=lambda storage, loc: storage if no_cuda else None))
|
||||||
|
self._model.to(self._device)
|
||||||
|
self._model.eval()
|
||||||
|
|
||||||
|
return
|
||||||
|
|
||||||
|
def classify_text(self, sentences):
|
||||||
|
|
||||||
|
examples = NerProcessor.create_examples(sentences, 'test')
|
||||||
|
|
||||||
|
features = [convert_examples_to_features(ex, self._label_to_id, self._max_seq_length, self._bert_tokenizer)
|
||||||
|
for ex in examples]
|
||||||
|
|
||||||
|
data_loader = NerProcessor.make_data_loader(None, self._batch_size, self._local_rank, self._label_to_id,
|
||||||
|
self._max_seq_length, self._bert_tokenizer, features=features,
|
||||||
|
sequential=True)
|
||||||
|
|
||||||
|
prediction_tmp = model_predict(data_loader, self._device, self._label_map, self._model)
|
||||||
|
|
||||||
|
prediction = []
|
||||||
|
for fe, pr in zip(features, prediction_tmp):
|
||||||
|
prediction.append((fe.tokens[1:-1], pr))
|
||||||
|
|
||||||
|
return prediction
|
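# Hedged note on the return format (inferred from the code above; example values
# are made up): classify_text yields one (tokens, labels) pair per sentence, where
# tokens are the WordPiece tokens without [CLS]/[SEP], e.g.
#   (['An', '##gela', 'Merkel'], ['B-PER', 'X', 'I-PER'])
# with 'X' marking subword continuations.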
||||||
|
|
||||||
|
|
||||||
|
class NERTokenizer:
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
|
||||||
|
self._word_tokenizer = Tokenizer(split_camel_case=True, token_classes=False, extra_info=False)
|
||||||
|
|
||||||
|
self._sentence_splitter = SentenceSplitter()
|
||||||
|
|
||||||
|
def parse_text(self, text):
|
||||||
|
tokens = self._word_tokenizer.tokenize_paragraph(text)
|
||||||
|
|
||||||
|
sentences_tokenized = self._sentence_splitter.split(tokens)
|
||||||
|
|
||||||
|
sentences = []
|
||||||
|
for sen in sentences_tokenized:
|
||||||
|
sentences.append((sen, []))
|
||||||
|
|
||||||
|
return sentences
|
||||||
|
|
||||||
|
|
||||||
|
class PredictorStore:
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
|
||||||
|
self._predictor = None
|
||||||
|
self._model_id = None
|
||||||
|
|
||||||
|
def get(self, model_id):
|
||||||
|
|
||||||
|
model = next((m for m in app.config['MODELS'] if m['id'] == int(model_id)))
|
||||||
|
|
||||||
|
if self._model_id != model_id:
|
||||||
|
|
||||||
|
self._predictor = NERPredictor(model_dir=model['model_dir'],
|
||||||
|
epoch=app.config['EPOCH'],
|
||||||
|
batch_size=app.config['BATCH_SIZE'],
|
||||||
|
# use CUDA by default; only disable it if USE_CUDA is explicitly set to 'false'
no_cuda=False if not os.environ.get('USE_CUDA') else
|
||||||
|
os.environ.get('USE_CUDA').lower() == 'false')
|
||||||
|
self._model_id = model_id
|
||||||
|
|
||||||
|
return self._predictor
|
||||||
|
|
||||||
|
|
||||||
|
digisam = Digisam(app.config['DATA_PATH'])
|
||||||
|
|
||||||
|
predictor_store = PredictorStore()
|
||||||
|
|
||||||
|
tokenizer = NERTokenizer()
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/')
|
||||||
|
def entry():
|
||||||
|
return redirect("/index.html", code=302)
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/models')
|
||||||
|
def get_models():
|
||||||
|
return jsonify(app.config['MODELS'])
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/ppnexamples')
|
||||||
|
def get_ppnexamples():
|
||||||
|
return jsonify(app.config['PPN_EXAMPLES'])
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/digisam-fulltext/<ppn>')
|
||||||
|
def fulltext(ppn):
|
||||||
|
|
||||||
|
df = digisam.get(ppn)
|
||||||
|
|
||||||
|
if len(df) == 0:
|
||||||
|
return 'bad request!', 400
|
||||||
|
|
||||||
|
text = ''
|
||||||
|
for row_index, row_data in df.iterrows():
|
||||||
|
|
||||||
|
if row_data.text is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
text += html.escape(str(row_data.text)) + '<br><br><br>'
|
||||||
|
|
||||||
|
ret = {'text': text, 'ppn': ppn}
|
||||||
|
|
||||||
|
return jsonify(ret)
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/digisam-tokenized/<ppn>')
|
||||||
|
def tokenized(ppn):
|
||||||
|
|
||||||
|
df = digisam.get(ppn)
|
||||||
|
|
||||||
|
if len(df) == 0:
|
||||||
|
return 'bad request!', 400
|
||||||
|
|
||||||
|
text = ''
|
||||||
|
for row_index, row_data in df.iterrows():
|
||||||
|
|
||||||
|
if row_data.text is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
sentences = tokenizer.parse_text(row_data.text)
|
||||||
|
|
||||||
|
for sen, _ in sentences:
|
||||||
|
|
||||||
|
text += html.escape(str(sen)) + '<br>'
|
||||||
|
|
||||||
|
text += '<br><br><br>'
|
||||||
|
|
||||||
|
ret = {'text': text, 'ppn': ppn}
|
||||||
|
|
||||||
|
return jsonify(ret)
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/ner-bert-tokens/<model_id>/<ppn>')
|
||||||
|
def ner_bert_tokens(model_id, ppn):
|
||||||
|
|
||||||
|
df = digisam.get(ppn)
|
||||||
|
|
||||||
|
if len(df) == 0:
|
||||||
|
return 'bad request!', 400
|
||||||
|
|
||||||
|
text = ''
|
||||||
|
for row_index, row_data in df.iterrows():
|
||||||
|
|
||||||
|
if row_data.text is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
sentences = tokenizer.parse_text(row_data.text)
|
||||||
|
|
||||||
|
prediction = predictor_store.get(model_id).classify_text(sentences)
|
||||||
|
|
||||||
|
for tokens, word_predictions in prediction:
|
||||||
|
|
||||||
|
for token, word_pred in zip(tokens, word_predictions):
|
||||||
|
|
||||||
|
text += html.escape("{}({})".format(token, word_pred))
|
||||||
|
|
||||||
|
text += '<br>'
|
||||||
|
|
||||||
|
text += '<br><br><br>'
|
||||||
|
|
||||||
|
ret = {'text': text, 'ppn': ppn}
|
||||||
|
|
||||||
|
return jsonify(ret)
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/digisam-ner/<model_id>/<ppn>')
|
||||||
|
def digisam_ner(model_id, ppn):
|
||||||
|
|
||||||
|
df = digisam.get(ppn)
|
||||||
|
|
||||||
|
if len(df) == 0:
|
||||||
|
return 'bad request!', 400
|
||||||
|
|
||||||
|
text = ''
|
||||||
|
|
||||||
|
for row_index, row_data in df.iterrows():
|
||||||
|
|
||||||
|
if row_data.text is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
sentences = tokenizer.parse_text(row_data.text)
|
||||||
|
|
||||||
|
prediction = predictor_store.get(model_id).classify_text(sentences)
|
||||||
|
|
||||||
|
for tokens, word_predictions in prediction:
|
||||||
|
|
||||||
|
last_prediction = 'O'
|
||||||
|
|
||||||
|
for token, word_pred in zip(tokens, word_predictions):
|
||||||
|
|
||||||
|
if token == '[UNK]':
|
||||||
|
continue
|
||||||
|
|
||||||
|
if not token.startswith('##'):
|
||||||
|
text += ' '
|
||||||
|
|
||||||
|
token = token[2:] if token.startswith('##') else token
|
||||||
|
|
||||||
|
if word_pred != 'X':
|
||||||
|
last_prediction = word_pred
|
||||||
|
|
||||||
|
if last_prediction == 'O':
|
||||||
|
text += html.escape(token)
|
||||||
|
elif last_prediction.endswith('PER'):
|
||||||
|
text += '<font color="red">' + html.escape(token) + '</font>'
|
||||||
|
elif last_prediction.endswith('LOC'):
|
||||||
|
text += '<font color="green">' + html.escape(token) + '</font>'
|
||||||
|
elif last_prediction.endswith('ORG'):
|
||||||
|
text += '<font color="blue">' + html.escape(token) + '</font>'
|
||||||
|
|
||||||
|
text += '<br>'
|
||||||
|
|
||||||
|
text += '<br><br><br>'
|
||||||
|
|
||||||
|
ret = {'text': text, 'ppn': ppn}
|
||||||
|
|
||||||
|
return jsonify(ret)
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/ner/<model_id>', methods=['GET', 'POST'])
|
||||||
|
def ner(model_id):
|
||||||
|
|
||||||
|
raw_text = request.json['text']
|
||||||
|
|
||||||
|
sentences = tokenizer.parse_text(raw_text)
|
||||||
|
|
||||||
|
prediction = predictor_store.get(model_id).classify_text(sentences)
|
||||||
|
|
||||||
|
output = []
|
||||||
|
|
||||||
|
word = None
|
||||||
|
last_prediction = 'O'
|
||||||
|
|
||||||
|
for tokens, word_predictions in prediction:
|
||||||
|
|
||||||
|
last_prediction = 'O'
|
||||||
|
|
||||||
|
for token, word_pred in zip(tokens, word_predictions):
|
||||||
|
|
||||||
|
if token == '[UNK]':
|
||||||
|
continue
|
||||||
|
|
||||||
|
if not token.startswith('##'):
|
||||||
|
if word is not None:
|
||||||
|
output.append({'word': word, 'prediction': last_prediction})
|
||||||
|
|
||||||
|
word = ''
|
||||||
|
|
||||||
|
token = token[2:] if token.startswith('##') else token
|
||||||
|
|
||||||
|
word += token
|
||||||
|
|
||||||
|
if word_pred != 'X':
|
||||||
|
last_prediction = word_pred
|
||||||
|
|
||||||
|
if word is not None and len(word) > 0:
|
||||||
|
output.append({'word': word, 'prediction': last_prediction})
|
||||||
|
|
||||||
|
return jsonify(output)
|
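# Hedged usage sketch for the /ner/<model_id> endpoint (assumes the default Flask
# port 5000 and model id 1 from config.json):
#
#   curl -X POST -H "Content-Type: application/json" \
#        -d '{"text": "Berlin liegt an der Spree."}' \
#        http://localhost:5000/ner/1
#
# The response is a JSON list of {"word": ..., "prediction": ...} objects.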
||||||
|
|
||||||
|
|
||||||
|
@app.route('/<path:path>')
|
||||||
|
def send_js(path):
|
||||||
|
return send_from_directory('static', path)
|
@ -0,0 +1,77 @@
|
|||||||
|
{
|
||||||
|
"DATA_PATH": "data/digisam/fulltext.sqlite3",
|
||||||
|
"EPOCH": 7,
|
||||||
|
"BATCH_SIZE": 256,
|
||||||
|
"MODELS": [
|
||||||
|
{
|
||||||
|
"name": "DC-SBB + CONLL + GERMEVAL",
|
||||||
|
"id": 1,
|
||||||
|
"model_dir": "data/konvens2019/build-wd_0.03/bert-all-german-de-finetuned",
|
||||||
|
"default": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "DC-SBB + CONLL + GERMEVAL + SBB",
|
||||||
|
"id": 2,
|
||||||
|
"model_dir": "data/konvens2019/build-on-all-german-de-finetuned/bert-sbb-de-finetuned",
|
||||||
|
"default": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "DC-SBB + SBB",
|
||||||
|
"id": 3,
|
||||||
|
"model_dir": "data/konvens2019/build-wd_0.03/bert-sbb-de-finetuned",
|
||||||
|
"default": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "CONLL + GERMEVAL",
|
||||||
|
"id": 4,
|
||||||
|
"model_dir": "data/konvens2019/build-wd_0.03/bert-all-german-baseline",
|
||||||
|
"default": false
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"PPN_EXAMPLES": [
|
||||||
|
{
|
||||||
|
"ppn": "633609536",
|
||||||
|
"name": "Der achtzehnte Brumaire des Louis Bonaparte"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ppn": "778819027",
|
||||||
|
"name": "Der zerbrochene Krug"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ppn": "71807789X",
|
||||||
|
"name": "Praktischer Kommentar zu den Gebühren-Taxen für Notare und Rechtsanwälte"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ppn": "719153085",
|
||||||
|
"name": "Der Weltkrieg im Rechenunterricht"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ppn": "719961289",
|
||||||
|
"name": "Das Kriegs-Schaubuch des XVIII. A.K."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ppn": "720942748",
|
||||||
|
"name": "Ein Gebot der Stunde"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ppn": "819155217",
|
||||||
|
"name": "Der Zirkel, 1883"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ppn": "847022595",
|
||||||
|
"name": "Mecklenburgisches Logenblatt"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ppn": "756689090",
|
||||||
|
"name": "Das Buch wunderbarer Erfindungen"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ppn": "865468370",
|
||||||
|
"name": "Carl Robert Lessings Bücher- und Handschriftensammlung"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ppn": "818985976",
|
||||||
|
"name": "\nDie älteste Berliner Zeitung\nOCR\n\nDie älteste Berliner Zeitung : Fragmente der Berliner Wochenzeitung von 1626 aus dem Besitz der Preußischen Staatsbibliothek"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
File diff suppressed because one or more lines are too long
@ -0,0 +1,77 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<!-- Required meta tags -->
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
||||||
|
|
||||||
|
<!-- Bootstrap CSS -->
|
||||||
|
<link rel="stylesheet" href="css/bootstrap.min.css"
|
||||||
|
integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
|
||||||
|
|
||||||
|
<title>NER auf den digitalen Sammlungen</title>
|
||||||
|
<script src="js/jquery-3.4.1.js"></script>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div class="container-fluid" style="height: 95vh;">
|
||||||
|
|
||||||
|
<div class="row" style="margin-top: 5vh">
|
||||||
|
|
||||||
|
<div class="col-2">
|
||||||
|
</div>
|
||||||
|
<div class="col-10">
|
||||||
|
<div class="row">
|
||||||
|
<div class="col-9 text-center">
|
||||||
|
<h1>NER auf den digitalen Sammlungen</h1>
|
||||||
|
</div>
|
||||||
|
<div class="col">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="row" style="margin-top: 2vh">
|
||||||
|
<div class="col-9">
|
||||||
|
<div class="card">
|
||||||
|
<div class="card-block">
|
||||||
|
<form class="mt-3 mb-3" role="form" id="nerform">
|
||||||
|
<div class="form-group row ml-2">
|
||||||
|
<label for="task" class="col-sm-2 col-form-label">Task:</label>
|
||||||
|
<select id="task" class="selectpicker col-md-auto" onchange="task_select()">
|
||||||
|
<option value="1">OCR-Text aus ALTO Datei</option>
|
||||||
|
<option value="2">Wort- und Satztokenisierung</option>
|
||||||
|
<option value="3" selected>Named Entity Recognition</option>
|
||||||
|
<option value="4">BERT Tokens</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
<div class="form-group row ml-2" id="model_select">
|
||||||
|
<label for="model" class="col-sm-2 col-form-label">Model:</label>
|
||||||
|
<select id="model" class="selectpicker col-md-auto">
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="form-group row ml-2">
|
||||||
|
<label for="ppn" class="col-sm-2 col-form-label">PPN:</label>
|
||||||
|
<input id="ppn" list="ppnexamples" class="col-sm-8" type="text"/>
|
||||||
|
<datalist id="ppnexamples">
|
||||||
|
</datalist>
|
||||||
|
<button class="btn btn-primary" type="submit">Go</button>
|
||||||
|
</div>
|
||||||
|
</form>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="col">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="row mt-5">
|
||||||
|
<div class="col-9" id="resultregion">
|
||||||
|
</div>
|
||||||
|
<div class="col" id="legende">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
<script src="js/ner.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
File diff suppressed because it is too large
@ -0,0 +1,155 @@
|
|||||||
|
|
||||||
|
$(document).ready(function(){
|
||||||
|
|
||||||
|
$('#nerform').submit(
|
||||||
|
function(e){
|
||||||
|
e.preventDefault();
|
||||||
|
load_ppn();
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
$.get( "/models")
|
||||||
|
.done(
|
||||||
|
function( data ) {
|
||||||
|
var tmp="";
|
||||||
|
$.each(data,
|
||||||
|
function(index, item){
|
||||||
|
|
||||||
|
var selected = "";
|
||||||
|
if (item.default) {
|
||||||
|
selected = "selected"
|
||||||
|
}
|
||||||
|
|
||||||
|
tmp += '<option value="' + item.id + '" ' + selected + ' >' + item.name + '</option>'
|
||||||
|
});
|
||||||
|
$('#model').html(tmp);
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
$.get( "/ppnexamples")
|
||||||
|
.done(
|
||||||
|
function( data ) {
|
||||||
|
var tmp="";
|
||||||
|
$.each(data,
|
||||||
|
function(index, item){
|
||||||
|
|
||||||
|
tmp += '<option value="' + item.ppn + '">' + item.name + '</option>'
|
||||||
|
});
|
||||||
|
$('#ppnexamples').html(tmp);
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
task_select()
|
||||||
|
});
|
||||||
|
|
||||||
|
function task_select() {
|
||||||
|
|
||||||
|
var task = $('#task').val();
|
||||||
|
|
||||||
|
if (task < 3) {
|
||||||
|
$('#model_select').hide()
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
$('#model_select').show()
|
||||||
|
}
|
||||||
|
|
||||||
|
$("#resultregion").html("");
|
||||||
|
$("#legende").html("");
|
||||||
|
}
|
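// Note (inferred from index.html and the handlers below): task values 1-4 map to
// OCR full text, tokenization, named entity recognition and BERT tokens; only
// tasks 3 and 4 need a model, which is why the model selector is hidden for task < 3.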
||||||
|
|
||||||
|
|
||||||
|
function load_ppn() {
|
||||||
|
|
||||||
|
var ppn = $('#ppn').val()
|
||||||
|
|
||||||
|
var text_region_html =
|
||||||
|
`<div class="card">
|
||||||
|
<div class="card-header">
|
||||||
|
Ergebnis:
|
||||||
|
</div>
|
||||||
|
<div class="card-block">
|
||||||
|
<div id="textregion" style="overflow-y:scroll;height: 65vh;"></div>
|
||||||
|
</div>
|
||||||
|
</div>`;
|
||||||
|
|
||||||
|
var legende_html =
|
||||||
|
`<div class="card">
|
||||||
|
<div class="card-header">
|
||||||
|
Legende:
|
||||||
|
<div class="ml-2" >[<font color="red">Person</font>]</div>
|
||||||
|
<div class="ml-2" >[<font color="green">Ort</font>]</div>
|
||||||
|
<div class="ml-2" >[<font color="blue">Organisation</font>]</div>
|
||||||
|
<div class="ml-2" >[keine Named Entity]</div>
|
||||||
|
</div>
|
||||||
|
</div>`;
|
||||||
|
|
||||||
|
var spinner_html =
|
||||||
|
`<div class="d-flex justify-content-center">
|
||||||
|
<div class="spinner-border align-center" role="status">
|
||||||
|
<span class="sr-only">Loading...</span>
|
||||||
|
</div>
|
||||||
|
</div>`;
|
||||||
|
|
||||||
|
$("#legende").html("");
|
||||||
|
|
||||||
|
var task = $('#task').val();
|
||||||
|
var model_id = $('#model').val();
|
||||||
|
|
||||||
|
console.log("Task: " + task);
|
||||||
|
|
||||||
|
if (task == 1) {
|
||||||
|
$("#resultregion").html(spinner_html);
|
||||||
|
|
||||||
|
$.get( "/digisam-fulltext/" + ppn)
|
||||||
|
.done(function( data ) {
|
||||||
|
$("#resultregion").html(text_region_html)
|
||||||
|
$("#textregion").html(data.text)
|
||||||
|
})
|
||||||
|
.fail(
|
||||||
|
function() {
|
||||||
|
console.log('Failed.');
|
||||||
|
$("#resultregion").html('Failed.');
|
||||||
|
});
|
||||||
|
}
|
||||||
|
else if (task == 2) {
|
||||||
|
$("#resultregion").html(spinner_html);
|
||||||
|
|
||||||
|
$.get( "/digisam-tokenized/" + ppn,
|
||||||
|
function( data ) {
|
||||||
|
$("#resultregion").html(text_region_html)
|
||||||
|
$("#textregion").html(data.text)
|
||||||
|
}).fail(
|
||||||
|
function() {
|
||||||
|
console.log('Failed.')
|
||||||
|
$("#resultregion").html('Failed.')
|
||||||
|
});
|
||||||
|
}
|
||||||
|
else if (task == 3) {
|
||||||
|
|
||||||
|
$("#resultregion").html(spinner_html);
|
||||||
|
|
||||||
|
$.get( "/digisam-ner/" + model_id + "/" + ppn,
|
||||||
|
function( data ) {
|
||||||
|
$("#resultregion").html(text_region_html)
|
||||||
|
$("#textregion").html(data.text)
|
||||||
|
$("#legende").html(legende_html)
|
||||||
|
}).fail(
|
||||||
|
function(a,b,c) {
|
||||||
|
console.log('Failed.')
|
||||||
|
$("#resultregion").html('Failed.')
|
||||||
|
});
|
||||||
|
}
|
||||||
|
else if (task == 4) {
|
||||||
|
$("#resultregion").html(spinner_html);
|
||||||
|
|
||||||
|
$.get( "/digisam-ner-bert-tokens/" + model_id + "/" + ppn,
|
||||||
|
function( data ) {
|
||||||
|
$("#resultregion").html(text_region_html)
|
||||||
|
$("#textregion").html(data.text)
|
||||||
|
}).fail(
|
||||||
|
function(a,b,c) {
|
||||||
|
console.log('Failed.')
|
||||||
|
$("#resultregion").html('Failed.')
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,16 @@
|
|||||||
|
numpy
|
||||||
|
pandas
|
||||||
|
dask==1.1.4
|
||||||
|
pyarrow==0.12.1
|
||||||
|
tqdm
|
||||||
|
pytorch-pretrained-bert==0.6.2
|
||||||
|
click
|
||||||
|
langid
|
||||||
|
seqeval
|
||||||
|
conlleval
|
||||||
|
toolz
|
||||||
|
cloudpickle
|
||||||
|
pytest
|
||||||
|
pytest-cov
|
||||||
|
flask
|
||||||
|
somajo
|
@ -0,0 +1,38 @@
|
|||||||
|
from io import open
|
||||||
|
from setuptools import find_packages, setup
|
||||||
|
|
||||||
|
with open('requirements.txt') as fp:
|
||||||
|
install_requires = fp.read()
|
||||||
|
|
||||||
|
setup(
|
||||||
|
name="qurator-sbb-ner",
|
||||||
|
version="0.0.1",
|
||||||
|
author="The Qurator Team",
|
||||||
|
author_email="qurator@sbb.spk-berlin.de",
|
||||||
|
description="Qurator",
|
||||||
|
long_description=open("README.md", "r", encoding='utf-8').read(),
|
||||||
|
long_description_content_type="text/markdown",
|
||||||
|
keywords='qurator',
|
||||||
|
license='Apache',
|
||||||
|
url="https://qurator.ai",
|
||||||
|
packages=find_packages(exclude=["*.tests", "*.tests.*",
|
||||||
|
"tests.*", "tests"]),
|
||||||
|
install_requires=install_requires,
|
||||||
|
entry_points={
|
||||||
|
'console_scripts': [
|
||||||
|
"compile_europeana_historic=qurator.sbb_ner.ground_truth.europeana_historic:main",
|
||||||
|
"compile_germ_eval=qurator.sbb_ner.ground_truth.germeval:main",
|
||||||
|
"compile_conll=qurator.sbb_ner.ground_truth.conll:main",
|
||||||
|
"compile_wikiner=qurator.sbb_ner.ground_truth.wikiner:main",
|
||||||
|
"bert-ner=qurator.sbb_ner.models.bert:main"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
python_requires='>=3.6.0',
|
||||||
|
tests_require=['pytest'],
|
||||||
|
classifiers=[
|
||||||
|
'Intended Audience :: Science/Research',
|
||||||
|
'License :: OSI Approved :: Apache Software License',
|
||||||
|
'Programming Language :: Python :: 3',
|
||||||
|
'Topic :: Scientific/Engineering :: Artificial Intelligence',
|
||||||
|
],
|
||||||
|
)
|
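# Hedged usage note: installing the package (e.g. `pip install -e .`) exposes the
# console scripts listed above, such as `bert-ner` and `compile_conll`.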