mirror of
https://github.com/qurator-spk/modstool.git
synced 2025-06-08 19:29:57 +02:00
🚧 Write out page_info
This commit is contained in:
parent
a1390699d4
commit
b385f27391
2 changed files with 66 additions and 48 deletions
|
@ -2,6 +2,7 @@ from itertools import groupby
|
||||||
import re
|
import re
|
||||||
import warnings
|
import warnings
|
||||||
from typing import List, Sequence, MutableMapping, Dict
|
from typing import List, Sequence, MutableMapping, Dict
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
@ -328,3 +329,45 @@ def dicts_to_df(data_list: List[Dict], *, index_column) -> pd.DataFrame:
|
||||||
|
|
||||||
df = pd.DataFrame(data=data, index=index, columns=columns)
|
df = pd.DataFrame(data=data, index=index, columns=columns)
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
def valid_column_key(k):
    """Return True if *k* is safe to use as an SQLite column name.

    Only ASCII letters, digits, spaces, underscores and hyphens are
    allowed, so the (quoted) name can be interpolated into SQL without
    risking injection.
    """
    # re.fullmatch instead of re.match("^...$", ...): "$" also matches just
    # before a trailing newline, which would let e.g. 'abc\n' slip through.
    return re.fullmatch(r"[a-zA-Z0-9 _-]+", k) is not None
|
||||||
|
|
||||||
|
# Module-level registry of the columns created so far, keyed by table name
# (table name -> list of column names). Shared mutable state read and
# updated by insert_into_db to decide between CREATE TABLE and ALTER TABLE.
current_columns = defaultdict(list)
|
||||||
|
|
||||||
|
def insert_into_db(con, table, d: Dict):
    """Insert the values from the dict into the table, creating columns if necessary.

    Column names are validated with valid_column_key() before being
    interpolated into CREATE TABLE / ALTER TABLE statements; the values
    themselves are always bound as qmark-style parameters. Every value is
    stored as its str() representation. The columns known per table are
    tracked in the module-level ``current_columns`` mapping.

    Raises ValueError if a key of *d* is not a valid column name.
    """

    def _require_valid(k):
        # Raise instead of assert: asserts are stripped under "python -O",
        # which would silently disable the injection guard.
        if not valid_column_key(k):
            raise ValueError(f'"{k}" is not a valid column name')

    # Create table if necessary
    if not current_columns[table]:
        for k in d.keys():
            _require_valid(k)
            current_columns[table].append(k)
        # Build the column list outside the f-string: nested same-type quotes
        # inside an f-string are only legal from Python 3.12 (PEP 701).
        column_sql = ",".join('"{}"'.format(c) for c in current_columns[table])
        con.execute(f"CREATE TABLE {table} ({column_sql})")

    # Add columns if necessary
    for k in d.keys():
        if k not in current_columns[table]:
            _require_valid(k)
            current_columns[table].append(k)
            con.execute(f'ALTER TABLE {table} ADD COLUMN "{k}"')

    # Insert
    # Unfortunately, Python3's sqlite3 does not like named placeholders with
    # spaces, so we have to use qmark style here.
    columns = list(d.keys())
    column_list = ",".join('"{}"'.format(c) for c in columns)
    placeholders = ",".join("?" for _ in columns)
    con.execute(
        f"INSERT INTO {table} ( {column_list} ) VALUES ( {placeholders} )",
        [str(d[c]) for c in columns],
    )
||||||
|
|
||||||
|
def insert_into_db_multiple(con, table, ld: List[Dict]):
    """Insert every dict in *ld* into *table*, one row per dict.

    Thin convenience wrapper that delegates each record to insert_into_db,
    which takes care of creating the table and any missing columns.
    """
    for record in ld:
        insert_into_db(con, table, record)
|
||||||
|
|
|
@ -10,13 +10,14 @@ from lxml import etree as ET
|
||||||
from itertools import groupby
|
from itertools import groupby
|
||||||
from operator import attrgetter
|
from operator import attrgetter
|
||||||
from typing import Dict, List
|
from typing import Dict, List
|
||||||
|
from collections import defaultdict
|
||||||
from collections.abc import MutableMapping, Sequence
|
from collections.abc import MutableMapping, Sequence
|
||||||
|
|
||||||
import click
|
import click
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from .lib import sorted_groupby, TagGroup, ns, flatten, dicts_to_df
|
from .lib import sorted_groupby, TagGroup, ns, flatten, dicts_to_df, insert_into_db, insert_into_db_multiple
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -396,45 +397,16 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
|
||||||
else:
|
else:
|
||||||
mets_files_real.append(m)
|
mets_files_real.append(m)
|
||||||
|
|
||||||
current_columns = []
|
|
||||||
|
|
||||||
def valid_column_key(k):
|
|
||||||
if re.match("^[a-zA-Z0-9 _-]+$", k):
|
|
||||||
return True
|
|
||||||
else:
|
|
||||||
return False
|
|
||||||
|
|
||||||
def insert_into_db(con, d: Dict):
|
|
||||||
# Create table if necessary
|
|
||||||
if not current_columns:
|
|
||||||
for k in d.keys():
|
|
||||||
assert valid_column_key(k), f"\"{k}\" is not a valid column name"
|
|
||||||
current_columns.append(k)
|
|
||||||
con.execute(f"CREATE TABLE mods_info({",".join(f"\"{c}\"" for c in current_columns)})")
|
|
||||||
|
|
||||||
# Add columns if necessary
|
|
||||||
for k in d.keys():
|
|
||||||
if not k in current_columns:
|
|
||||||
assert valid_column_key(k), f"\"{k}\" is not a valid column name"
|
|
||||||
current_columns.append(k)
|
|
||||||
con.execute(f"ALTER TABLE mods_info ADD COLUMN \"{k}\"")
|
|
||||||
|
|
||||||
# Insert
|
|
||||||
# Unfortunately, Python3's sqlite3 does not like named placeholders with spaces, so we
|
|
||||||
# have to use qmark style here.
|
|
||||||
columns = d.keys()
|
|
||||||
con.execute(
|
|
||||||
"INSERT INTO mods_info"
|
|
||||||
f"( {",".join(f"\"{c}\"" for c in columns)} )"
|
|
||||||
"VALUES"
|
|
||||||
f"( {",".join("?" for c in columns)} )",
|
|
||||||
[str(d[c]) for c in columns]
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# Process METS files
|
# Process METS files
|
||||||
output_file_sqlite3 = output_file + ".sqlite3"
|
output_file_sqlite3 = output_file + ".sqlite3"
|
||||||
con = sqlite3.connect(output_file_sqlite3)
|
con = sqlite3.connect(output_file_sqlite3)
|
||||||
|
|
||||||
|
if output_page_info:
|
||||||
|
output_page_info_sqlite3 = output_page_info + ".sqlite3"
|
||||||
|
con_page_info = sqlite3.connect(output_page_info_sqlite3)
|
||||||
|
|
||||||
with open(output_file + '.warnings.csv', 'w') as csvfile:
|
with open(output_file + '.warnings.csv', 'w') as csvfile:
|
||||||
csvwriter = csv.writer(csvfile)
|
csvwriter = csv.writer(csvfile)
|
||||||
mods_info = []
|
mods_info = []
|
||||||
|
@ -463,11 +435,11 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
|
||||||
if output_page_info:
|
if output_page_info:
|
||||||
page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True)
|
page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True)
|
||||||
|
|
||||||
insert_into_db(con, d)
|
insert_into_db(con, "mods_info", d)
|
||||||
con.commit()
|
con.commit()
|
||||||
#TODO
|
if output_page_info:
|
||||||
#if output_page_info:
|
insert_into_db_multiple(con_page_info, "page_info", page_info_doc)
|
||||||
# page_info.extend(page_info_doc)
|
con_page_info.commit()
|
||||||
|
|
||||||
if caught_warnings:
|
if caught_warnings:
|
||||||
# PyCharm thinks caught_warnings is not Iterable:
|
# PyCharm thinks caught_warnings is not Iterable:
|
||||||
|
@ -478,18 +450,21 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
|
||||||
logger.exception('Exception in {}'.format(mets_file))
|
logger.exception('Exception in {}'.format(mets_file))
|
||||||
|
|
||||||
# Convert the mods_info List[Dict] to a pandas DataFrame
|
# Convert the mods_info List[Dict] to a pandas DataFrame
|
||||||
mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier")
|
# TODO
|
||||||
|
# mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier")
|
||||||
|
|
||||||
# Save the DataFrame
|
# Save the DataFrame
|
||||||
logger.info('Writing DataFrame to {}'.format(output_file))
|
# TODO
|
||||||
mods_info_df.to_parquet(output_file)
|
#logger.info('Writing DataFrame to {}'.format(output_file))
|
||||||
|
#mods_info_df.to_parquet(output_file)
|
||||||
|
|
||||||
# Convert page_info
|
# Convert page_info
|
||||||
if output_page_info:
|
# TODO
|
||||||
page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID"))
|
# if output_page_info:
|
||||||
# Save the DataFrame
|
# page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID"))
|
||||||
logger.info('Writing DataFrame to {}'.format(output_page_info))
|
# # Save the DataFrame
|
||||||
page_info_df.to_parquet(output_page_info)
|
# logger.info('Writing DataFrame to {}'.format(output_page_info))
|
||||||
|
# page_info_df.to_parquet(output_page_info)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue