🚧 Use temporary SQLite DB for alto4pandas, too

fix/use-temp-sqlite3
Mike Gerber 4 weeks ago
parent ca8f165955
commit 39f7d8646a

@@ -5,6 +5,8 @@ import os
 import re
 import warnings
 import sys
+import contextlib
+import sqlite3
 from xml.dom.expatbuilder import Namespaces
 from lxml import etree as ET
 from itertools import groupby
@@ -17,7 +19,7 @@ import pandas as pd
 import numpy as np
 from tqdm import tqdm
-from .lib import TagGroup, sorted_groupby, flatten, ns
+from .lib import TagGroup, sorted_groupby, flatten, ns, insert_into_db


 logger = logging.getLogger('alto4pandas')
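Note: insert_into_db is provided by .lib and its implementation is not part of this diff. For orientation only, a minimal sketch of what such a helper has to do (create the table lazily, add unseen columns, insert one row per dict) could look like the following; insert_into_db_sketch and all of its details are assumptions, not the actual .lib code.

import sqlite3

def insert_into_db_sketch(con: sqlite3.Connection, table: str, d: dict):
    # Hypothetical stand-in for .lib's insert_into_db -- the real helper may differ.
    # Create the table on first use, then add any column not seen before.
    first_col = next(iter(d))
    con.execute(f'CREATE TABLE IF NOT EXISTS "{table}" ("{first_col}")')
    existing = {row[1] for row in con.execute(f'PRAGMA table_info("{table}")')}
    for col in d.keys() - existing:
        con.execute(f'ALTER TABLE "{table}" ADD COLUMN "{col}"')
    # Insert one row; positional placeholders sidestep unusual column names.
    columns = ", ".join(f'"{c}"' for c in d)
    placeholders = ", ".join("?" for _ in d)
    con.execute(f'INSERT INTO "{table}" ({columns}) VALUES ({placeholders})',
                list(d.values()))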
@@ -121,18 +123,19 @@ def walk(m):
 @click.command()
 @click.argument('alto_files', type=click.Path(exists=True), required=True, nargs=-1)
-@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output pickle file',
-              default='alto_info_df.pkl', show_default=True)
-@click.option('--output-csv', type=click.Path(), help='Output CSV file')
-@click.option('--output-xlsx', type=click.Path(), help='Output Excel .xlsx file')
-def process(alto_files: List[str], output_file: str, output_csv: str, output_xlsx: str):
+@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output Parquet file',
+              default='alto_info_df.parquet', show_default=True)
+def process(alto_files: List[str], output_file: str):
     """
     A tool to convert the ALTO metadata in INPUT to a pandas DataFrame.

     INPUT is assumed to be a ALTO document. INPUT may optionally be a directory. The tool then reads
     all files in the directory.

-    alto4pandas writes two output files: A pickled pandas DataFrame and a CSV file with all conversion warnings.
+    alto4pandas writes multiple output files:
+
+    - A Parquet DataFrame
+    - A SQLite database
+    - and a CSV file with all conversion warnings.
     """

     # Extend file list if directories are given
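For reference, the three outputs can be read back as follows. This is a sketch: the file names assume the default --output of alto_info_df.parquet, and pandas needs pyarrow or fastparquet installed for the Parquet part.

import sqlite3
import pandas as pd

alto_info = pd.read_parquet("alto_info_df.parquet")       # the DataFrame
con = sqlite3.connect("alto_info_df.parquet.sqlite3")     # the SQLite database
conversion_warnings = pd.read_csv("alto_info_df.parquet.warnings.csv",
                                  header=None)            # the warnings CSV (no header row)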
@@ -141,10 +144,19 @@ def process(alto_files: List[str], output_file: str, output_csv: str, output_xls
         for x in walk(m):
             alto_files_real.append(x)

+    # Prepare output files
+    with contextlib.suppress(FileNotFoundError):
+        os.remove(output_file)
+    output_file_sqlite3 = output_file + ".sqlite3"
+    with contextlib.suppress(FileNotFoundError):
+        os.remove(output_file_sqlite3)
+    logger.info('Writing SQLite DB to {}'.format(output_file_sqlite3))
+    con = sqlite3.connect(output_file_sqlite3)
+
     # Process ALTO files
     with open(output_file + '.warnings.csv', 'w') as csvfile:
         csvwriter = csv.writer(csvfile)
-        alto_info = []

         logger.info('Processing ALTO files')
         for alto_file in tqdm(alto_files_real, leave=False):
             try:
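contextlib.suppress(FileNotFoundError) around os.remove() is the idiomatic way to clear a possibly missing leftover from an earlier run. On Python 3.8+ the same cleanup can also be written with pathlib; a sketch, with the file name again assuming the default --output:

from pathlib import Path

# Equivalent stale-output cleanup (Python 3.8+)
Path("alto_info_df.parquet.sqlite3").unlink(missing_ok=True)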
@@ -160,7 +172,9 @@ def process(alto_files: List[str], output_file: str, output_csv: str, output_xls
                 d['alto_file'] = alto_file
                 d['alto_xmlns'] = ET.QName(alto).namespace

-                alto_info.append(d)
+                # Save
+                insert_into_db(con, "alto_info", d)
+                con.commit()

                 if caught_warnings:
                     # PyCharm thinks caught_warnings is not Iterable:
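Because con.commit() runs after every insert, each row is durable as soon as its ALTO file has been processed, so a long run can be checked from a second session. A minimal sketch, assuming the default output name:

import sqlite3

con = sqlite3.connect("alto_info_df.parquet.sqlite3")
# Rows committed so far == ALTO files processed so far
print(con.execute("SELECT COUNT(*) FROM alto_info").fetchone()[0])

Batching commits would be faster, but the per-file commit keeps the intermediate database consistent if the run is interrupted.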
@@ -171,25 +185,10 @@ def process(alto_files: List[str], output_file: str, output_csv: str, output_xls
                 logger.error('Exception in {}: {}'.format(alto_file, e))
                 import traceback; traceback.print_exc()

-    # Convert the alto_info List[Dict] to a pandas DataFrame
-    columns = []
-    for m in alto_info:
-        for c in m.keys():
-            if c not in columns:
-                columns.append(c)
-    data = [[m.get(c) for c in columns] for m in alto_info]
-    index = [m['alto_file'] for m in alto_info]  # TODO use ppn + page?
-    alto_info_df = pd.DataFrame(data=data, index=index, columns=columns)
-
-    # Pickle the DataFrame
+    # Convert the alto_info SQL to a pandas DataFrame
+    alto_info_df = pd.read_sql_query("SELECT * FROM alto_info", con, index_col="alto_file")
+
     logger.info('Writing DataFrame to {}'.format(output_file))
-    alto_info_df.to_pickle(output_file)
-
-    if output_csv:
-        logger.info('Writing CSV to {}'.format(output_csv))
-        alto_info_df.to_csv(output_csv)
-    if output_xlsx:
-        logger.info('Writing Excel .xlsx to {}'.format(output_xlsx))
-        alto_info_df.to_excel(output_xlsx)

 def main():
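Since the docstring now lists the SQLite database as an output in its own right, ad-hoc queries can also be run against it without loading the whole DataFrame. A hypothetical example, using the alto_xmlns column set above and the default output name:

import sqlite3
import pandas as pd

con = sqlite3.connect("alto_info_df.parquet.sqlite3")
# Distribution of ALTO namespace versions over the processed files
versions = pd.read_sql_query(
    "SELECT alto_xmlns, COUNT(*) AS n_files FROM alto_info "
    "GROUP BY alto_xmlns ORDER BY n_files DESC",
    con)
print(versions)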

@@ -332,7 +332,7 @@ def dicts_to_df(data_list: List[Dict], *, index_column) -> pd.DataFrame:

     def valid_column_key(k):
-        if re.match("^[a-zA-Z0-9 _-]+$", k):
+        if re.match("^[a-zA-Z0-9 _@/:\[\]-]+$", k):
             return True
         else:
             return False
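The widened character class now also accepts @, /, : and square brackets in column keys. A quick check with hypothetical key names; the pattern is written as a raw string here to avoid the invalid-escape warning newer Python versions emit for \[ in plain string literals:

import re

def valid_column_key(k):
    return bool(re.match(r"^[a-zA-Z0-9 _@/:\[\]-]+$", k))

# Hypothetical key names, purely to exercise the pattern:
assert valid_column_key("Layout_Page_@WIDTH")
assert valid_column_key("namespaces[alto]")
assert valid_column_key("xlink:href")
assert not valid_column_key("bad.key")  # '.' is still rejected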
