mirror of
https://github.com/qurator-spk/sbb_ner.git
synced 2025-06-08 11:50:16 +02:00
de-couple SBB related stuff and general NER functionality
This commit is contained in:
parent
57cd9227f9
commit
181cbb9f53
6 changed files with 5 additions and 371 deletions
|
@ -1,9 +1,6 @@
|
|||
import os
|
||||
import logging
|
||||
from flask import Flask, send_from_directory, redirect, jsonify, request, send_file
|
||||
import pandas as pd
|
||||
from sqlite3 import Error
|
||||
import sqlite3
|
||||
from flask import Flask, send_from_directory, redirect, jsonify, request
|
||||
import html
|
||||
import json
|
||||
import torch
|
||||
|
@ -15,12 +12,6 @@ from qurator.sbb_ner.models.tokenization import BertTokenizer
|
|||
from pytorch_pretrained_bert.modeling import (CONFIG_NAME,
|
||||
BertConfig,
|
||||
BertForTokenClassification)
|
||||
|
||||
from qurator.sbb.xml import get_entity_coordinates
|
||||
|
||||
import io
|
||||
from PIL import Image, ImageDraw
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
app.config.from_json('config.json' if not os.environ.get('CONFIG') else os.environ.get('CONFIG'))
|
||||
|
@ -28,40 +19,6 @@ app.config.from_json('config.json' if not os.environ.get('CONFIG') else os.envir
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Digisam:
|
||||
|
||||
_conn = None
|
||||
|
||||
def __init__(self, data_path):
|
||||
|
||||
self._data_path = data_path
|
||||
|
||||
@staticmethod
|
||||
def create_connection(db_file):
|
||||
try:
|
||||
logger.debug('Connection to database: {}'.format(db_file))
|
||||
|
||||
conn = sqlite3.connect(db_file, check_same_thread=False)
|
||||
|
||||
conn.execute('pragma journal_mode=wal')
|
||||
|
||||
return conn
|
||||
except Error as e:
|
||||
logger.error(e)
|
||||
|
||||
return None
|
||||
|
||||
def get(self, ppn):
|
||||
|
||||
if Digisam._conn is None:
|
||||
Digisam._conn = self.create_connection(self._data_path)
|
||||
|
||||
df = pd.read_sql_query("select file_name, text from text where ppn=?;", Digisam._conn, params=(ppn,)). \
|
||||
sort_values('file_name')
|
||||
|
||||
return df
|
||||
|
||||
|
||||
class NERPredictor:
|
||||
|
||||
def __init__(self, model_dir, batch_size, epoch, max_seq_length=128, local_rank=-1, no_cuda=False):
|
||||
|
@ -184,8 +141,6 @@ class PredictorStore:
|
|||
return self._predictor
|
||||
|
||||
|
||||
digisam = Digisam(app.config['DATA_PATH'])
|
||||
|
||||
predictor_store = PredictorStore()
|
||||
|
||||
tokenizer = NERTokenizer()
|
||||
|
@ -201,42 +156,6 @@ def get_models():
|
|||
return jsonify(app.config['MODELS'])
|
||||
|
||||
|
||||
@app.route('/ppnexamples')
|
||||
def get_ppnexamples():
|
||||
return jsonify(app.config['PPN_EXAMPLES'])
|
||||
|
||||
|
||||
@app.route('/digisam-fulltext/<ppn>')
|
||||
def fulltext(ppn):
|
||||
|
||||
df = digisam.get(ppn)
|
||||
|
||||
if len(df) == 0:
|
||||
|
||||
df = digisam.get('PPN' + ppn)
|
||||
|
||||
if len(df) == 0:
|
||||
|
||||
if ppn.startswith('PPN'):
|
||||
df = digisam.get(ppn[3:])
|
||||
|
||||
if len(df) == 0:
|
||||
|
||||
return 'bad request!', 400
|
||||
|
||||
text = ''
|
||||
for row_index, row_data in df.iterrows():
|
||||
|
||||
if row_data.text is None:
|
||||
continue
|
||||
|
||||
text += row_data.text + " "
|
||||
|
||||
ret = {'text': text, 'ppn': ppn}
|
||||
|
||||
return jsonify(ret)
|
||||
|
||||
|
||||
@app.route('/tokenized', methods=['GET', 'POST'])
|
||||
def tokenized():
|
||||
|
||||
|
@ -338,53 +257,6 @@ def ner(model_id):
|
|||
return jsonify(output)
|
||||
|
||||
|
||||
def find_file(path, ppn, page, ending):
|
||||
|
||||
file = (8 - len(str(page))) * '0' + page
|
||||
|
||||
if os.path.exists("{}/{}/{}{}".format(path, ppn, file, ending)):
|
||||
return "{}/{}/{}{}".format(path, ppn, file, ending)
|
||||
elif os.path.exists("{}/PPN{}/{}{}".format(path, ppn, file, ending)):
|
||||
return "{}/PPN{}/{}{}".format(path, ppn, file, ending)
|
||||
elif ppn.startswith('PPN') and os.path.exists("{}/{}/{}{}".format(path, ppn[3:], file, ending)):
|
||||
return "{}/{}/{}{}".format(path, ppn[3:], file, ending)
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
@app.route('/image/<ppn>/<page>')
|
||||
def get_image(ppn, page):
|
||||
|
||||
image_file = find_file(app.config['IMAGE_PATH'], ppn, page, '.tif')
|
||||
|
||||
if image_file is None:
|
||||
return 'bad request!', 400
|
||||
|
||||
img = Image.open(image_file)
|
||||
|
||||
img = img.convert('RGB')
|
||||
|
||||
alto_file = find_file(app.config['ALTO_PATH'], ppn, page, '.xml')
|
||||
|
||||
if alto_file is not None:
|
||||
|
||||
ner_coordinates, entity_map = get_entity_coordinates(alto_file, img)
|
||||
|
||||
draw = ImageDraw.Draw(img, 'RGBA')
|
||||
|
||||
for idx, row in ner_coordinates.iterrows():
|
||||
|
||||
draw.rectangle(xy=((row.x0, row.y0), (row.x1, row.y1)),
|
||||
fill=(255 if row.ner_id.startswith('PER') else 0,
|
||||
255 if row.ner_id.startswith('LOC') else 0,
|
||||
255 if row.ner_id.startswith('ORG') else 0, 50))
|
||||
buffer = io.BytesIO()
|
||||
img.save(buffer, "JPEG")
|
||||
buffer.seek(0)
|
||||
|
||||
return send_file(buffer, mimetype='image/jpeg')
|
||||
|
||||
|
||||
@app.route('/<path:path>')
|
||||
def send_js(path):
|
||||
return send_from_directory('static', path)
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
{
|
||||
"DATA_PATH": "data/digisam/fulltext.sqlite3",
|
||||
"BATCH_SIZE": 256,
|
||||
"MODELS": [
|
||||
{
|
||||
|
@ -30,51 +29,5 @@
|
|||
"epoch": 7,
|
||||
"default": false
|
||||
}
|
||||
],
|
||||
"PPN_EXAMPLES": [
|
||||
{
|
||||
"ppn": "633609536",
|
||||
"name": "Der achtzehnte Brumaire des Louis Bonaparte"
|
||||
},
|
||||
{
|
||||
"ppn": "778819027",
|
||||
"name": "Der zerbrochene Krug"
|
||||
},
|
||||
{
|
||||
"ppn": "71807789X",
|
||||
"name": "Praktischer Kommentar zu den Gebühren-Taxen für Notare und Rechtsanwälte"
|
||||
},
|
||||
{
|
||||
"ppn": "719153085",
|
||||
"name": "Der Weltkrieg im Rechenunterricht"
|
||||
},
|
||||
{
|
||||
"ppn": "719961289",
|
||||
"name": "Das Kriegs-Schaubuch des XVIII. A.K."
|
||||
},
|
||||
{
|
||||
"ppn": "720942748",
|
||||
"name": "Ein Gebot der Stunde"
|
||||
},
|
||||
{
|
||||
"ppn": "819155217",
|
||||
"name": "Der Zirkel, 1883"
|
||||
},
|
||||
{
|
||||
"ppn": "847022595",
|
||||
"name": "Mecklenburgisches Logenblatt"
|
||||
},
|
||||
{
|
||||
"ppn": "756689090",
|
||||
"name": "Das Buch wunderbarer Erfindungen"
|
||||
},
|
||||
{
|
||||
"ppn": "865468370",
|
||||
"name": "Carl Robert Lessings Bücher- und Handschriftensammlung"
|
||||
},
|
||||
{
|
||||
"ppn": "818985976",
|
||||
"name": "\nDie älteste Berliner Zeitung\nOCR\n\nDie älteste Berliner Zeitung : Fragmente der Berliner Wochenzeitung von 1626 aus dem Besitz der Preußischen Staatsbibliothek"
|
||||
}
|
||||
]
|
||||
}
|
|
@ -8,7 +8,7 @@ $(document).ready(function(){
|
|||
}
|
||||
);
|
||||
|
||||
$.get( "/models")
|
||||
$.get( "models")
|
||||
.done(
|
||||
function( data ) {
|
||||
var tmp="";
|
||||
|
|
|
@ -1,113 +0,0 @@
|
|||
|
||||
$(document).ready(function(){
|
||||
|
||||
$('#nerform').submit(
|
||||
function(e){
|
||||
e.preventDefault();
|
||||
|
||||
update();
|
||||
}
|
||||
);
|
||||
|
||||
$.get( "/models")
|
||||
.done(
|
||||
function( data ) {
|
||||
var tmp="";
|
||||
$.each(data,
|
||||
function(index, item){
|
||||
|
||||
selected=""
|
||||
if (item.default) {
|
||||
selected = "selected"
|
||||
}
|
||||
|
||||
tmp += '<option value="' + item.id + '" ' + selected + ' >' + item.name + '</option>'
|
||||
});
|
||||
$('#model').html(tmp);
|
||||
|
||||
var url_params = new URLSearchParams(window.location.search);
|
||||
|
||||
var do_update=false;
|
||||
|
||||
if (url_params.has('ppn')) {
|
||||
|
||||
var ppn = url_params.get('ppn')
|
||||
|
||||
$('#ppn').val(ppn);
|
||||
|
||||
do_update = true;
|
||||
}
|
||||
|
||||
if (url_params.has('model_id')) {
|
||||
|
||||
var model_id = url_params.get('model_id')
|
||||
|
||||
$('#model').val(model_id);
|
||||
|
||||
do_update = true;
|
||||
}
|
||||
|
||||
if (url_params.has('task')) {
|
||||
|
||||
var task = url_params.get('task')
|
||||
|
||||
$('#task').val(task);
|
||||
|
||||
do_update = true;
|
||||
}
|
||||
|
||||
task_select()
|
||||
|
||||
if (do_update) update();
|
||||
}
|
||||
);
|
||||
|
||||
$.get( "/ppnexamples")
|
||||
.done(
|
||||
function( data ) {
|
||||
var tmp="";
|
||||
$.each(data,
|
||||
function(index, item){
|
||||
|
||||
tmp += '<option value="' + item.ppn + '">' + item.name + '</option>'
|
||||
});
|
||||
$('#ppnexamples').html(tmp);
|
||||
}
|
||||
);
|
||||
});
|
||||
|
||||
|
||||
function update() {
|
||||
|
||||
var spinner_html =
|
||||
`<div class="d-flex justify-content-center">
|
||||
<div class="spinner-border align-center" role="status">
|
||||
<span class="sr-only">Loading...</span>
|
||||
</div>
|
||||
</div>`;
|
||||
|
||||
var task = $('#task').val();
|
||||
var model_id = $('#model').val();
|
||||
var ppn = $('#ppn').val();
|
||||
|
||||
var url_params = new URLSearchParams(window.location.search);
|
||||
|
||||
url_params.set('ppn', ppn)
|
||||
url_params.set('model_id', model_id)
|
||||
url_params.set('task', task)
|
||||
|
||||
window.history.replaceState({}, '', `${location.pathname}?${url_params}`);
|
||||
|
||||
$("#resultregion").html(spinner_html);
|
||||
|
||||
$.get( "/digisam-fulltext/" + ppn)
|
||||
.done(function( data ) {
|
||||
|
||||
do_task(task, model_id, data.text)
|
||||
})
|
||||
.fail(
|
||||
function() {
|
||||
console.log('Failed.');
|
||||
$("#resultregion").html('Failed.');
|
||||
});
|
||||
}
|
|
@ -58,7 +58,7 @@ function do_task(task, model_id, input_text) {
|
|||
|
||||
$.ajax(
|
||||
{
|
||||
url: "/tokenized",
|
||||
url: "tokenized",
|
||||
data: JSON.stringify(post_data),
|
||||
type: 'POST',
|
||||
contentType: "application/json",
|
||||
|
@ -89,7 +89,7 @@ function do_task(task, model_id, input_text) {
|
|||
$("#resultregion").html(spinner_html)
|
||||
|
||||
$.ajax({
|
||||
url: "/ner/" + model_id,
|
||||
url: "ner/" + model_id,
|
||||
data: JSON.stringify(post_data),
|
||||
type: 'POST',
|
||||
contentType: "application/json",
|
||||
|
@ -130,7 +130,7 @@ function do_task(task, model_id, input_text) {
|
|||
|
||||
$.ajax(
|
||||
{
|
||||
url: "/ner-bert-tokens/" + model_id,
|
||||
url: "ner-bert-tokens/" + model_id,
|
||||
data: JSON.stringify(post_data),
|
||||
type: 'POST',
|
||||
contentType: "application/json",
|
||||
|
|
|
@ -1,78 +0,0 @@
|
|||
<!doctype html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<!-- Required meta tags -->
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
||||
|
||||
<!-- Bootstrap CSS -->
|
||||
<link rel="stylesheet" href="css/bootstrap.min.css"
|
||||
integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
|
||||
|
||||
<title>NER auf den digitalen Sammlungen</title>
|
||||
<script src="js/jquery-3.4.1.js"></script>
|
||||
</head>
|
||||
<body>
|
||||
<div class="container-fluid" style="height: 95vh;">
|
||||
|
||||
<div class="row" style="margin-top: 5vh">
|
||||
|
||||
<div class="col-2">
|
||||
</div>
|
||||
<div class="col-10">
|
||||
<div class="row">
|
||||
<div class="col-9 text-center">
|
||||
<h1>NER auf den digitalen Sammlungen</h1>
|
||||
</div>
|
||||
<div class="col">
|
||||
</div>
|
||||
</div>
|
||||
<div class="row" style="margin-top: 2vh">
|
||||
<div class="col-9">
|
||||
<div class="card">
|
||||
<div class="card-block">
|
||||
<form class="mt-3 mb-3" role="form" id="nerform">
|
||||
<div class="form-group row ml-2">
|
||||
<label for="task" class="col-sm-2 col-form-label">Task:</label>
|
||||
<select id="task" class="selectpicker col-md-auto" onchange="task_select()">
|
||||
<option value="fulltext">OCR-Text aus ALTO Datei</option>
|
||||
<option value="tokenize">Wort- und Satztokenisierung</option>
|
||||
<option value="ner" selected>Named Entity Recognition</option>
|
||||
<option value="bert-tokens">BERT Tokens</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="form-group row ml-2" id="model_select">
|
||||
<label for="model" class="col-sm-2 col-form-label">Model:</label>
|
||||
<select id="model" class="selectpicker col-md-auto">
|
||||
</select>
|
||||
</div>
|
||||
|
||||
<div class="form-group row ml-2">
|
||||
<label for="ppn" class="col-sm-2 col-form-label">PPN:</label>
|
||||
<input id="ppn" list="ppnexamples" class="col-sm-8" type="text" required/>
|
||||
<datalist id="ppnexamples">
|
||||
</datalist>
|
||||
<button class="btn btn-primary" type="submit">Go</button>
|
||||
</div>
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row mt-5">
|
||||
<div class="col-9" id="resultregion">
|
||||
</div>
|
||||
<div class="col" id="legende">
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<script src="js/ner.js"></script>
|
||||
<script src="js/ner-ds-sbb.js"></script>
|
||||
</body>
|
||||
</html>
|
Loading…
Add table
Add a link
Reference in a new issue