Decouple SBB-specific code from the general NER functionality

pull/2/head
Kai Labusch 5 years ago
parent 57cd9227f9
commit 181cbb9f53

@ -1,9 +1,6 @@
import os
import logging
from flask import Flask, send_from_directory, redirect, jsonify, request, send_file
import pandas as pd
from sqlite3 import Error
import sqlite3
from flask import Flask, send_from_directory, redirect, jsonify, request
import html
import json
import torch
@ -15,12 +12,6 @@ from qurator.sbb_ner.models.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import (CONFIG_NAME,
BertConfig,
BertForTokenClassification)
from qurator.sbb.xml import get_entity_coordinates
import io
from PIL import Image, ImageDraw
app = Flask(__name__)
app.config.from_json('config.json' if not os.environ.get('CONFIG') else os.environ.get('CONFIG'))
@ -28,40 +19,6 @@ app.config.from_json('config.json' if not os.environ.get('CONFIG') else os.envir
logger = logging.getLogger(__name__)
class Digisam:
    """Read access to the ``text`` table of a Digisam full-text SQLite file.

    The database connection is opened lazily on the first call to
    :meth:`get` and then reused by this instance.
    """

    # Default for instances that have not connected yet. The connection itself
    # is stored per instance so that every instance honours its own data path
    # instead of silently reusing whichever database was opened first.
    _conn = None

    def __init__(self, data_path):
        """Remember the database location; no connection is opened here.

        Args:
            data_path: Path handed to ``sqlite3.connect`` on first use.
        """
        self._data_path = data_path

    @staticmethod
    def create_connection(db_file):
        """Open ``db_file`` and switch it to WAL journal mode.

        Args:
            db_file: Path of the SQLite database file.

        Returns:
            The open connection, or ``None`` if sqlite3 reported an error
            (the error is logged).
        """
        conn = None
        try:
            logger.debug('Connection to database: {}'.format(db_file))

            # check_same_thread=False lets the connection be used from
            # threads other than the one that created it.
            conn = sqlite3.connect(db_file, check_same_thread=False)

            conn.execute('pragma journal_mode=wal')

            return conn
        except Error as e:
            logger.error(e)

            # Do not leak a connection that opened but failed on the pragma.
            if conn is not None:
                conn.close()

        return None

    def get(self, ppn):
        """Return all rows stored for ``ppn``, ordered by file name.

        Args:
            ppn: Value matched against the ``ppn`` column.

        Returns:
            A DataFrame with the columns ``file_name`` and ``text``, sorted
            by ``file_name``; it is empty when nothing matches.

        Raises:
            sqlite3.Error: If the database connection cannot be established.
        """
        if self._conn is None:
            self._conn = self.create_connection(self._data_path)

        if self._conn is None:
            # Fail with a clear message instead of passing None to pandas.
            raise Error('Could not connect to database: {}'.format(self._data_path))

        df = pd.read_sql_query("select file_name, text from text where ppn=?;", self._conn, params=(ppn,))

        return df.sort_values('file_name')
class NERPredictor:
def __init__(self, model_dir, batch_size, epoch, max_seq_length=128, local_rank=-1, no_cuda=False):
@ -184,8 +141,6 @@ class PredictorStore:
return self._predictor
digisam = Digisam(app.config['DATA_PATH'])
predictor_store = PredictorStore()
tokenizer = NERTokenizer()
@ -201,42 +156,6 @@ def get_models():
return jsonify(app.config['MODELS'])
@app.route('/ppnexamples')
def get_ppnexamples():
    """Return the configured ``PPN_EXAMPLES`` entries as a JSON response."""
    examples = app.config['PPN_EXAMPLES']

    return jsonify(examples)
@app.route('/digisam-fulltext/<ppn>')
def fulltext(ppn):
    """Return the concatenated full text stored for a PPN as JSON.

    The PPN is looked up as given, then with a ``PPN`` prefix added and,
    if it already starts with ``PPN``, with that prefix removed. The first
    variant that yields rows wins.

    Args:
        ppn: Identifier taken from the URL.

    Returns:
        A JSON response ``{'text': ..., 'ppn': ...}`` where ``ppn`` is the
        identifier exactly as requested, or ``('bad request!', 400)`` when no
        variant matches any row.
    """
    candidates = [ppn, 'PPN' + ppn]
    if ppn.startswith('PPN'):
        candidates.append(ppn[3:])

    for candidate in candidates:
        df = digisam.get(candidate)
        if len(df) > 0:
            break
    else:
        # None of the spellings matched a row.
        return 'bad request!', 400

    # Rows without text are skipped; every remaining text is followed by a
    # single space, so the result keeps its trailing space.
    text = ''.join(page_text + " " for page_text in df['text'] if page_text is not None)

    return jsonify({'text': text, 'ppn': ppn})
@app.route('/tokenized', methods=['GET', 'POST'])
def tokenized():
@ -338,53 +257,6 @@ def ner(model_id):
return jsonify(output)
def find_file(path, ppn, page, ending):
    """Locate the file of one page of a document below ``path``.

    The file name is the page number left-padded with zeros to eight
    characters plus ``ending``. The document directory is tried as ``ppn``,
    then as ``'PPN' + ppn`` and, if ``ppn`` starts with ``PPN``, as ``ppn``
    without that prefix.

    Args:
        path: Base directory that contains one sub-directory per document.
        ppn: Document identifier, with or without the ``PPN`` prefix.
        page: Page number as a string or an integer.
        ending: File extension including the dot, e.g. ``'.tif'``.

    Returns:
        The first candidate path that exists, or ``None`` if none does.
    """
    # str() makes integer page numbers work; rjust pads exactly like the
    # manual '0' * (8 - len) prefix and leaves longer values untouched.
    file_name = str(page).rjust(8, '0') + ending

    directories = [ppn, 'PPN' + ppn]
    if ppn.startswith('PPN'):
        directories.append(ppn[3:])

    for directory in directories:
        candidate = "{}/{}/{}".format(path, directory, file_name)

        if os.path.exists(candidate):
            return candidate

    return None
@app.route('/image/<ppn>/<page>')
def get_image(ppn, page):
    """Serve a page image as JPEG with entity regions highlighted.

    The ``.tif`` scan is looked up below ``IMAGE_PATH``. If a matching
    ``.xml`` file exists below ``ALTO_PATH``, one translucent rectangle is
    drawn per row returned by ``get_entity_coordinates``: red for ``ner_id``
    values starting with ``PER``, green for ``LOC`` and blue for ``ORG``.

    Args:
        ppn: Document identifier taken from the URL.
        page: Page number taken from the URL.

    Returns:
        The JPEG response, or ``('bad request!', 400)`` if no image file is
        found.
    """
    image_file = find_file(app.config['IMAGE_PATH'], ppn, page, '.tif')

    if image_file is None:
        return 'bad request!', 400

    # convert() returns a new, fully loaded image, so the file handle of the
    # source image can be released right away instead of leaking.
    with Image.open(image_file) as source:
        img = source.convert('RGB')

    alto_file = find_file(app.config['ALTO_PATH'], ppn, page, '.xml')

    if alto_file is not None:
        # NOTE(review): ner_coordinates is used like a DataFrame with the
        # columns x0, y0, x1, y1 and ner_id; entity_map is not used here.
        ner_coordinates, entity_map = get_entity_coordinates(alto_file, img)

        # 'RGBA' drawing mode lets the alpha value of the fill blend with the page.
        draw = ImageDraw.Draw(img, 'RGBA')

        for idx, row in ner_coordinates.iterrows():
            draw.rectangle(xy=((row.x0, row.y0), (row.x1, row.y1)),
                           fill=(255 if row.ner_id.startswith('PER') else 0,
                                 255 if row.ner_id.startswith('LOC') else 0,
                                 255 if row.ner_id.startswith('ORG') else 0, 50))

    buffer = io.BytesIO()
    img.save(buffer, "JPEG")

    # Rewind so that send_file streams the image from its first byte.
    buffer.seek(0)

    return send_file(buffer, mimetype='image/jpeg')
@app.route('/<path:path>')
def send_js(path):
    """Serve the requested ``path`` from the ``static`` directory."""
    static_dir = 'static'

    return send_from_directory(static_dir, path)

@ -1,5 +1,4 @@
{
"DATA_PATH": "data/digisam/fulltext.sqlite3",
"BATCH_SIZE": 256,
"MODELS": [
{
@ -30,51 +29,5 @@
"epoch": 7,
"default": false
}
],
"PPN_EXAMPLES": [
{
"ppn": "633609536",
"name": "Der achtzehnte Brumaire des Louis Bonaparte"
},
{
"ppn": "778819027",
"name": "Der zerbrochene Krug"
},
{
"ppn": "71807789X",
"name": "Praktischer Kommentar zu den Gebühren-Taxen für Notare und Rechtsanwälte"
},
{
"ppn": "719153085",
"name": "Der Weltkrieg im Rechenunterricht"
},
{
"ppn": "719961289",
"name": "Das Kriegs-Schaubuch des XVIII. A.K."
},
{
"ppn": "720942748",
"name": "Ein Gebot der Stunde"
},
{
"ppn": "819155217",
"name": "Der Zirkel, 1883"
},
{
"ppn": "847022595",
"name": "Mecklenburgisches Logenblatt"
},
{
"ppn": "756689090",
"name": "Das Buch wunderbarer Erfindungen"
},
{
"ppn": "865468370",
"name": "Carl Robert Lessings Bücher- und Handschriftensammlung"
},
{
"ppn": "818985976",
"name": "Die älteste Berliner Zeitung : Fragmente der Berliner Wochenzeitung von 1626 aus dem Besitz der Preußischen Staatsbibliothek"
}
]
}

@ -8,7 +8,7 @@ $(document).ready(function(){
}
);
$.get( "/models")
$.get( "models")
.done(
function( data ) {
var tmp="";

@ -1,113 +0,0 @@
// Page setup: wires the form to update(), fills the model selector and the
// PPN example list from the server and restores the form state from the
// query string of the current URL.
$(document).ready(function(){

    // Run the selected task via update() instead of a regular form submit.
    $('#nerform').submit(
        function(e){
            e.preventDefault();

            update();
        }
    );

    $.get( "/models")
        .done(
            function( data ) {
                var model_select = $('#model').empty();

                $.each(data,
                    function(index, item){
                        // Build the element instead of concatenating markup so
                        // that id and name are inserted as plain text.
                        var option = $('<option>').attr('value', item.id).text(item.name);

                        if (item.default) {
                            option.attr('selected', 'selected');
                        }

                        model_select.append(option);
                    });

                var url_params = new URLSearchParams(window.location.search);

                // Becomes true as soon as any form field is preset by the URL.
                var do_update=false;

                if (url_params.has('ppn')) {
                    var ppn = url_params.get('ppn')

                    $('#ppn').val(ppn);

                    do_update = true;
                }

                if (url_params.has('model_id')) {
                    var model_id = url_params.get('model_id')

                    $('#model').val(model_id);

                    do_update = true;
                }

                if (url_params.has('task')) {
                    var task = url_params.get('task')

                    $('#task').val(task);

                    do_update = true;
                }

                task_select()

                if (do_update) update();
            }
        );

    $.get( "/ppnexamples")
        .done(
            function( data ) {
                var example_list = $('#ppnexamples').empty();

                $.each(data,
                    function(index, item){
                        example_list.append(
                            $('<option>').attr('value', item.ppn).text(item.name));
                    });
            }
        );
});
// Reads task, model and PPN from the form, mirrors them into the query string
// of the current URL, shows a spinner and fetches the full text of the PPN.
// On success the text is handed to do_task(); on failure 'Failed.' is shown
// in the result region.
function update() {

    var spinner_html =
        `<div class="d-flex justify-content-center">
            <div class="spinner-border align-center" role="status">
                <span class="sr-only">Loading...</span>
            </div>
         </div>`;

    var task = $('#task').val();
    var model_id = $('#model').val();
    var ppn = $('#ppn').val();

    var url_params = new URLSearchParams(window.location.search);

    url_params.set('ppn', ppn)
    url_params.set('model_id', model_id)
    url_params.set('task', task)

    // replaceState keeps the URL shareable without adding a history entry.
    window.history.replaceState({}, '', `${location.pathname}?${url_params}`);

    $("#resultregion").html(spinner_html);

    // The PPN is free user input that ends up in the URL path: encode it so
    // that characters such as '/', '?' or '#' cannot change the request target.
    $.get( "/digisam-fulltext/" + encodeURIComponent(ppn))
        .done(function( data ) {
            do_task(task, model_id, data.text)
        })
        .fail(
            function() {
                console.log('Failed.');
                $("#resultregion").html('Failed.');
            });
}

@ -58,7 +58,7 @@ function do_task(task, model_id, input_text) {
$.ajax(
{
url: "/tokenized",
url: "tokenized",
data: JSON.stringify(post_data),
type: 'POST',
contentType: "application/json",
@ -89,7 +89,7 @@ function do_task(task, model_id, input_text) {
$("#resultregion").html(spinner_html)
$.ajax({
url: "/ner/" + model_id,
url: "ner/" + model_id,
data: JSON.stringify(post_data),
type: 'POST',
contentType: "application/json",
@ -130,7 +130,7 @@ function do_task(task, model_id, input_text) {
$.ajax(
{
url: "/ner-bert-tokens/" + model_id,
url: "ner-bert-tokens/" + model_id,
data: JSON.stringify(post_data),
type: 'POST',
contentType: "application/json",

@ -1,78 +0,0 @@
<!doctype html>
<!-- Single-page front end: a form (task, model, PPN) whose results are
     rendered into #resultregion by js/ner.js and js/ner-ds-sbb.js. -->
<!-- The visible content of this page is German, hence lang="de". -->
<html lang="de">
<head>
    <!-- Required meta tags -->
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">

    <!-- Bootstrap CSS -->
    <link rel="stylesheet" href="css/bootstrap.min.css"
          integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">

    <title>NER auf den digitalen Sammlungen</title>

    <script src="js/jquery-3.4.1.js"></script>
</head>
<body>

<div class="container-fluid" style="height: 95vh;">
    <div class="row" style="margin-top: 5vh">
        <div class="col-2">
        </div>
        <div class="col-10">
            <div class="row">
                <div class="col-9 text-center">
                    <h1>NER auf den digitalen Sammlungen</h1>
                </div>
                <div class="col">
                </div>
            </div>
            <div class="row" style="margin-top: 2vh">
                <div class="col-9">
                    <div class="card">
                        <div class="card-block">
                            <!-- Submission is handled by a script via the form id. -->
                            <form class="mt-3 mb-3" role="form" id="nerform">

                                <div class="form-group row ml-2">
                                    <label for="task" class="col-sm-2 col-form-label">Task:</label>
                                    <select id="task" class="selectpicker col-md-auto" onchange="task_select()">
                                        <option value="fulltext">OCR-Text aus ALTO Datei</option>
                                        <option value="tokenize">Wort- und Satztokenisierung</option>
                                        <option value="ner" selected>Named Entity Recognition</option>
                                        <option value="bert-tokens">BERT Tokens</option>
                                    </select>
                                </div>

                                <!-- The options of #model are filled in by a script. -->
                                <div class="form-group row ml-2" id="model_select">
                                    <label for="model" class="col-sm-2 col-form-label">Model:</label>
                                    <select id="model" class="selectpicker col-md-auto">
                                    </select>
                                </div>

                                <!-- The #ppnexamples datalist provides suggestions for the PPN input. -->
                                <div class="form-group row ml-2">
                                    <label for="ppn" class="col-sm-2 col-form-label">PPN:</label>
                                    <input id="ppn" list="ppnexamples" class="col-sm-8" type="text" required/>
                                    <datalist id="ppnexamples">
                                    </datalist>
                                    <button class="btn btn-primary" type="submit">Go</button>
                                </div>
                            </form>
                        </div>
                    </div>
                </div>
                <div class="col">
                </div>
            </div>
            <div class="row mt-5">
                <div class="col-9" id="resultregion">
                </div>
                <div class="col" id="legende">
                </div>
            </div>
        </div>
    </div>
</div>

<script src="js/ner.js"></script>
<script src="js/ner-ds-sbb.js"></script>
</body>
</html>
Loading…
Cancel
Save