diff --git a/qurator/sbb_ner/webapp/app.py b/qurator/sbb_ner/webapp/app.py index a627e91..017c18f 100644 --- a/qurator/sbb_ner/webapp/app.py +++ b/qurator/sbb_ner/webapp/app.py @@ -1,9 +1,6 @@ import os import logging -from flask import Flask, send_from_directory, redirect, jsonify, request, send_file -import pandas as pd -from sqlite3 import Error -import sqlite3 +from flask import Flask, send_from_directory, redirect, jsonify, request import html import json import torch @@ -15,12 +12,6 @@ from qurator.sbb_ner.models.tokenization import BertTokenizer from pytorch_pretrained_bert.modeling import (CONFIG_NAME, BertConfig, BertForTokenClassification) - -from qurator.sbb.xml import get_entity_coordinates - -import io -from PIL import Image, ImageDraw - app = Flask(__name__) app.config.from_json('config.json' if not os.environ.get('CONFIG') else os.environ.get('CONFIG')) @@ -28,40 +19,6 @@ app.config.from_json('config.json' if not os.environ.get('CONFIG') else os.envir logger = logging.getLogger(__name__) -class Digisam: - - _conn = None - - def __init__(self, data_path): - - self._data_path = data_path - - @staticmethod - def create_connection(db_file): - try: - logger.debug('Connection to database: {}'.format(db_file)) - - conn = sqlite3.connect(db_file, check_same_thread=False) - - conn.execute('pragma journal_mode=wal') - - return conn - except Error as e: - logger.error(e) - - return None - - def get(self, ppn): - - if Digisam._conn is None: - Digisam._conn = self.create_connection(self._data_path) - - df = pd.read_sql_query("select file_name, text from text where ppn=?;", Digisam._conn, params=(ppn,)). \ - sort_values('file_name') - - return df - - class NERPredictor: def __init__(self, model_dir, batch_size, epoch, max_seq_length=128, local_rank=-1, no_cuda=False): @@ -184,8 +141,6 @@ class PredictorStore: return self._predictor -digisam = Digisam(app.config['DATA_PATH']) - predictor_store = PredictorStore() tokenizer = NERTokenizer() @@ -201,42 +156,6 @@ def get_models(): return jsonify(app.config['MODELS']) -@app.route('/ppnexamples') -def get_ppnexamples(): - return jsonify(app.config['PPN_EXAMPLES']) - - -@app.route('/digisam-fulltext/') -def fulltext(ppn): - - df = digisam.get(ppn) - - if len(df) == 0: - - df = digisam.get('PPN' + ppn) - - if len(df) == 0: - - if ppn.startswith('PPN'): - df = digisam.get(ppn[3:]) - - if len(df) == 0: - - return 'bad request!', 400 - - text = '' - for row_index, row_data in df.iterrows(): - - if row_data.text is None: - continue - - text += row_data.text + " " - - ret = {'text': text, 'ppn': ppn} - - return jsonify(ret) - - @app.route('/tokenized', methods=['GET', 'POST']) def tokenized(): @@ -338,53 +257,6 @@ def ner(model_id): return jsonify(output) -def find_file(path, ppn, page, ending): - - file = (8 - len(str(page))) * '0' + page - - if os.path.exists("{}/{}/{}{}".format(path, ppn, file, ending)): - return "{}/{}/{}{}".format(path, ppn, file, ending) - elif os.path.exists("{}/PPN{}/{}{}".format(path, ppn, file, ending)): - return "{}/PPN{}/{}{}".format(path, ppn, file, ending) - elif ppn.startswith('PPN') and os.path.exists("{}/{}/{}{}".format(path, ppn[3:], file, ending)): - return "{}/{}/{}{}".format(path, ppn[3:], file, ending) - else: - return None - - -@app.route('/image//') -def get_image(ppn, page): - - image_file = find_file(app.config['IMAGE_PATH'], ppn, page, '.tif') - - if image_file is None: - return 'bad request!', 400 - - img = Image.open(image_file) - - img = img.convert('RGB') - - alto_file = find_file(app.config['ALTO_PATH'], ppn, page, '.xml') - - if alto_file is not None: - - ner_coordinates, entity_map = get_entity_coordinates(alto_file, img) - - draw = ImageDraw.Draw(img, 'RGBA') - - for idx, row in ner_coordinates.iterrows(): - - draw.rectangle(xy=((row.x0, row.y0), (row.x1, row.y1)), - fill=(255 if row.ner_id.startswith('PER') else 0, - 255 if row.ner_id.startswith('LOC') else 0, - 255 if row.ner_id.startswith('ORG') else 0, 50)) - buffer = io.BytesIO() - img.save(buffer, "JPEG") - buffer.seek(0) - - return send_file(buffer, mimetype='image/jpeg') - - @app.route('/') def send_js(path): return send_from_directory('static', path) diff --git a/qurator/sbb_ner/webapp/config.json b/qurator/sbb_ner/webapp/config.json index 4601ac2..3bddc68 100644 --- a/qurator/sbb_ner/webapp/config.json +++ b/qurator/sbb_ner/webapp/config.json @@ -1,5 +1,4 @@ { - "DATA_PATH": "data/digisam/fulltext.sqlite3", "BATCH_SIZE": 256, "MODELS": [ { @@ -30,51 +29,5 @@ "epoch": 7, "default": false } - ], - "PPN_EXAMPLES": [ - { - "ppn": "633609536", - "name": "Der achtzehnte Brumaire des Louis Bonaparte" - }, - { - "ppn": "778819027", - "name": "Der zerbrochene Krug" - }, - { - "ppn": "71807789X", - "name": "Praktischer Kommentar zu den Gebühren-Taxen für Notare und Rechtsanwälte" - }, - { - "ppn": "719153085", - "name": "Der Weltkrieg im Rechenunterricht" - }, - { - "ppn": "719961289", - "name": "Das Kriegs-Schaubuch des XVIII. A.K." - }, - { - "ppn": "720942748", - "name": "Ein Gebot der Stunde" - }, - { - "ppn": "819155217", - "name": "Der Zirkel, 1883" - }, - { - "ppn": "847022595", - "name": "Mecklenburgisches Logenblatt" - }, - { - "ppn": "756689090", - "name": "Das Buch wunderbarer Erfindungen" - }, - { - "ppn": "865468370", - "name": "Carl Robert Lessings Bücher- und Handschriftensammlung" - }, - { - "ppn": "818985976", - "name": "\nDie älteste Berliner Zeitung\nOCR\n\nDie älteste Berliner Zeitung : Fragmente der Berliner Wochenzeitung von 1626 aus dem Besitz der Preußischen Staatsbibliothek" - } ] } \ No newline at end of file diff --git a/qurator/sbb_ner/webapp/static/js/ner-demo.js b/qurator/sbb_ner/webapp/static/js/ner-demo.js index a08f7e8..0f97604 100644 --- a/qurator/sbb_ner/webapp/static/js/ner-demo.js +++ b/qurator/sbb_ner/webapp/static/js/ner-demo.js @@ -8,7 +8,7 @@ $(document).ready(function(){ } ); - $.get( "/models") + $.get( "models") .done( function( data ) { var tmp=""; diff --git a/qurator/sbb_ner/webapp/static/js/ner-ds-sbb.js b/qurator/sbb_ner/webapp/static/js/ner-ds-sbb.js deleted file mode 100644 index 6115679..0000000 --- a/qurator/sbb_ner/webapp/static/js/ner-ds-sbb.js +++ /dev/null @@ -1,113 +0,0 @@ - -$(document).ready(function(){ - - $('#nerform').submit( - function(e){ - e.preventDefault(); - - update(); - } - ); - - $.get( "/models") - .done( - function( data ) { - var tmp=""; - $.each(data, - function(index, item){ - - selected="" - if (item.default) { - selected = "selected" - } - - tmp += '' - }); - $('#model').html(tmp); - - var url_params = new URLSearchParams(window.location.search); - - var do_update=false; - - if (url_params.has('ppn')) { - - var ppn = url_params.get('ppn') - - $('#ppn').val(ppn); - - do_update = true; - } - - if (url_params.has('model_id')) { - - var model_id = url_params.get('model_id') - - $('#model').val(model_id); - - do_update = true; - } - - if (url_params.has('task')) { - - var task = url_params.get('task') - - $('#task').val(task); - - do_update = true; - } - - task_select() - - if (do_update) update(); - } - ); - - $.get( "/ppnexamples") - .done( - function( data ) { - var tmp=""; - $.each(data, - function(index, item){ - - tmp += '' - }); - $('#ppnexamples').html(tmp); - } - ); -}); - - -function update() { - - var spinner_html = - `
-
- Loading... -
-
`; - - var task = $('#task').val(); - var model_id = $('#model').val(); - var ppn = $('#ppn').val(); - - var url_params = new URLSearchParams(window.location.search); - - url_params.set('ppn', ppn) - url_params.set('model_id', model_id) - url_params.set('task', task) - - window.history.replaceState({}, '', `${location.pathname}?${url_params}`); - - $("#resultregion").html(spinner_html); - - $.get( "/digisam-fulltext/" + ppn) - .done(function( data ) { - - do_task(task, model_id, data.text) - }) - .fail( - function() { - console.log('Failed.'); - $("#resultregion").html('Failed.'); - }); -} \ No newline at end of file diff --git a/qurator/sbb_ner/webapp/static/js/ner.js b/qurator/sbb_ner/webapp/static/js/ner.js index fa71a16..c74360b 100644 --- a/qurator/sbb_ner/webapp/static/js/ner.js +++ b/qurator/sbb_ner/webapp/static/js/ner.js @@ -58,7 +58,7 @@ function do_task(task, model_id, input_text) { $.ajax( { - url: "/tokenized", + url: "tokenized", data: JSON.stringify(post_data), type: 'POST', contentType: "application/json", @@ -89,7 +89,7 @@ function do_task(task, model_id, input_text) { $("#resultregion").html(spinner_html) $.ajax({ - url: "/ner/" + model_id, + url: "ner/" + model_id, data: JSON.stringify(post_data), type: 'POST', contentType: "application/json", @@ -130,7 +130,7 @@ function do_task(task, model_id, input_text) { $.ajax( { - url: "/ner-bert-tokens/" + model_id, + url: "ner-bert-tokens/" + model_id, data: JSON.stringify(post_data), type: 'POST', contentType: "application/json", diff --git a/qurator/sbb_ner/webapp/static/ner-ds-sbb.html b/qurator/sbb_ner/webapp/static/ner-ds-sbb.html deleted file mode 100644 index 5e3c0d9..0000000 --- a/qurator/sbb_ner/webapp/static/ner-ds-sbb.html +++ /dev/null @@ -1,78 +0,0 @@ - - - - - - - - - - - NER auf den digitalen Sammlungen - - - -
- -
- -
-
-
-
-
-

NER auf den digitalen Sammlungen

-
-
-
-
-
-
-
-
-
-
- - -
-
- - -
- -
- - - - - -
-
-
-
-
-
-
-
- -
-
-
-
-
-
-
-
- -
- - - - \ No newline at end of file