mirror of
https://github.com/qurator-spk/modstool.git
synced 2025-06-07 19:05:06 +02:00
🚧 Use temporary SQLite DB for alto4pandas, too
This commit is contained in:
parent
ca8f165955
commit
39f7d8646a
2 changed files with 27 additions and 28 deletions
|
@ -5,6 +5,8 @@ import os
|
||||||
import re
|
import re
|
||||||
import warnings
|
import warnings
|
||||||
import sys
|
import sys
|
||||||
|
import contextlib
|
||||||
|
import sqlite3
|
||||||
from xml.dom.expatbuilder import Namespaces
|
from xml.dom.expatbuilder import Namespaces
|
||||||
from lxml import etree as ET
|
from lxml import etree as ET
|
||||||
from itertools import groupby
|
from itertools import groupby
|
||||||
|
@ -17,7 +19,7 @@ import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from .lib import TagGroup, sorted_groupby, flatten, ns
|
from .lib import TagGroup, sorted_groupby, flatten, ns, insert_into_db
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger('alto4pandas')
|
logger = logging.getLogger('alto4pandas')
|
||||||
|
@ -121,18 +123,19 @@ def walk(m):
|
||||||
|
|
||||||
@click.command()
|
@click.command()
|
||||||
@click.argument('alto_files', type=click.Path(exists=True), required=True, nargs=-1)
|
@click.argument('alto_files', type=click.Path(exists=True), required=True, nargs=-1)
|
||||||
@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output pickle file',
|
@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output Parquet file',
|
||||||
default='alto_info_df.pkl', show_default=True)
|
default='alto_info_df.parquet', show_default=True)
|
||||||
@click.option('--output-csv', type=click.Path(), help='Output CSV file')
|
def process(alto_files: List[str], output_file: str):
|
||||||
@click.option('--output-xlsx', type=click.Path(), help='Output Excel .xlsx file')
|
|
||||||
def process(alto_files: List[str], output_file: str, output_csv: str, output_xlsx: str):
|
|
||||||
"""
|
"""
|
||||||
A tool to convert the ALTO metadata in INPUT to a pandas DataFrame.
|
A tool to convert the ALTO metadata in INPUT to a pandas DataFrame.
|
||||||
|
|
||||||
INPUT is assumed to be a ALTO document. INPUT may optionally be a directory. The tool then reads
|
INPUT is assumed to be a ALTO document. INPUT may optionally be a directory. The tool then reads
|
||||||
all files in the directory.
|
all files in the directory.
|
||||||
|
|
||||||
alto4pandas writes two output files: A pickled pandas DataFrame and a CSV file with all conversion warnings.
|
alto4pandas writes multiple output files:
|
||||||
|
- A Parquet DataFrame
|
||||||
|
- A SQLite database
|
||||||
|
- and a CSV file with all conversion warnings.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Extend file list if directories are given
|
# Extend file list if directories are given
|
||||||
|
@ -141,10 +144,19 @@ def process(alto_files: List[str], output_file: str, output_csv: str, output_xls
|
||||||
for x in walk(m):
|
for x in walk(m):
|
||||||
alto_files_real.append(x)
|
alto_files_real.append(x)
|
||||||
|
|
||||||
|
# Prepare output files
|
||||||
|
with contextlib.suppress(FileNotFoundError):
|
||||||
|
os.remove(output_file)
|
||||||
|
output_file_sqlite3 = output_file + ".sqlite3"
|
||||||
|
with contextlib.suppress(FileNotFoundError):
|
||||||
|
os.remove(output_file_sqlite3)
|
||||||
|
|
||||||
|
logger.info('Writing SQLite DB to {}'.format(output_file_sqlite3))
|
||||||
|
con = sqlite3.connect(output_file_sqlite3)
|
||||||
|
|
||||||
# Process ALTO files
|
# Process ALTO files
|
||||||
with open(output_file + '.warnings.csv', 'w') as csvfile:
|
with open(output_file + '.warnings.csv', 'w') as csvfile:
|
||||||
csvwriter = csv.writer(csvfile)
|
csvwriter = csv.writer(csvfile)
|
||||||
alto_info = []
|
|
||||||
logger.info('Processing ALTO files')
|
logger.info('Processing ALTO files')
|
||||||
for alto_file in tqdm(alto_files_real, leave=False):
|
for alto_file in tqdm(alto_files_real, leave=False):
|
||||||
try:
|
try:
|
||||||
|
@ -160,7 +172,9 @@ def process(alto_files: List[str], output_file: str, output_csv: str, output_xls
|
||||||
d['alto_file'] = alto_file
|
d['alto_file'] = alto_file
|
||||||
d['alto_xmlns'] = ET.QName(alto).namespace
|
d['alto_xmlns'] = ET.QName(alto).namespace
|
||||||
|
|
||||||
alto_info.append(d)
|
# Save
|
||||||
|
insert_into_db(con, "alto_info", d)
|
||||||
|
con.commit
|
||||||
|
|
||||||
if caught_warnings:
|
if caught_warnings:
|
||||||
# PyCharm thinks caught_warnings is not Iterable:
|
# PyCharm thinks caught_warnings is not Iterable:
|
||||||
|
@ -171,25 +185,10 @@ def process(alto_files: List[str], output_file: str, output_csv: str, output_xls
|
||||||
logger.error('Exception in {}: {}'.format(alto_file, e))
|
logger.error('Exception in {}: {}'.format(alto_file, e))
|
||||||
import traceback; traceback.print_exc()
|
import traceback; traceback.print_exc()
|
||||||
|
|
||||||
# Convert the alto_info List[Dict] to a pandas DataFrame
|
# Convert the alto_info SQL to a pandas DataFrame
|
||||||
columns = []
|
alto_info_df = pd.read_sql_query("SELECT * FROM alto_info", con, index_col="alto_file")
|
||||||
for m in alto_info:
|
|
||||||
for c in m.keys():
|
|
||||||
if c not in columns:
|
|
||||||
columns.append(c)
|
|
||||||
data = [[m.get(c) for c in columns] for m in alto_info]
|
|
||||||
index = [m['alto_file'] for m in alto_info] # TODO use ppn + page?
|
|
||||||
alto_info_df = pd.DataFrame(data=data, index=index, columns=columns)
|
|
||||||
|
|
||||||
# Pickle the DataFrame
|
|
||||||
logger.info('Writing DataFrame to {}'.format(output_file))
|
logger.info('Writing DataFrame to {}'.format(output_file))
|
||||||
alto_info_df.to_pickle(output_file)
|
alto_info_df.to_parquet(output_file)
|
||||||
if output_csv:
|
|
||||||
logger.info('Writing CSV to {}'.format(output_csv))
|
|
||||||
alto_info_df.to_csv(output_csv)
|
|
||||||
if output_xlsx:
|
|
||||||
logger.info('Writing Excel .xlsx to {}'.format(output_xlsx))
|
|
||||||
alto_info_df.to_excel(output_xlsx)
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
|
@ -332,7 +332,7 @@ def dicts_to_df(data_list: List[Dict], *, index_column) -> pd.DataFrame:
|
||||||
|
|
||||||
|
|
||||||
def valid_column_key(k):
    """
    Check whether ``k`` consists only of characters accepted in a column key.

    Accepted characters are ASCII letters, digits, the space and the
    characters ``_ @ / : [ ] -``. An empty string is rejected.

    NOTE: ``$`` also matches just before a single trailing newline, so a key
    such as ``"abc\\n"`` is accepted. This is unchanged from the original
    pattern.

    :param k: candidate column key (a string)
    :return: True if ``k`` matches the accepted character set, False otherwise
    """
    # Raw string literal: "\[" and "\]" are invalid escape sequences in a
    # normal string (DeprecationWarning, SyntaxWarning from Python 3.12 on).
    # The compiled pattern is the same as before.
    return re.match(r"^[a-zA-Z0-9 _@/:\[\]-]+$", k) is not None
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue