mirror of
https://github.com/qurator-spk/modstool.git
synced 2025-06-08 03:10:15 +02:00
🐛 Write mods_info Parquet file again
This commit is contained in:
parent
abb20b8ba9
commit
11a04916f3
1 changed file with 12 additions and 9 deletions
|
@@ -399,19 +399,24 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
|
|||
mets_files_real.append(m)
|
||||
|
||||
|
||||
|
||||
# Process METS files
|
||||
# Prepare output files
|
||||
with contextlib.suppress(FileNotFoundError):
|
||||
os.remove(output_file)
|
||||
output_file_sqlite3 = output_file + ".sqlite3"
|
||||
with contextlib.suppress(FileNotFoundError):
|
||||
os.remove(output_file_sqlite3)
|
||||
|
||||
logger.info('Writing SQLite DB to {}'.format(output_file_sqlite3))
|
||||
con = sqlite3.connect(output_file_sqlite3)
|
||||
|
||||
if output_page_info:
|
||||
output_page_info_sqlite3 = output_page_info + ".sqlite3"
|
||||
logger.info('Writing SQLite DB to {}'.format(output_page_info_sqlite3))
|
||||
with contextlib.suppress(FileNotFoundError):
|
||||
os.remove(output_page_info_sqlite3)
|
||||
con_page_info = sqlite3.connect(output_page_info_sqlite3)
|
||||
|
||||
# Process METS files
|
||||
with open(output_file + '.warnings.csv', 'w') as csvfile:
|
||||
csvwriter = csv.writer(csvfile)
|
||||
mods_info = []
|
||||
|
@@ -454,14 +459,12 @@ def process(mets_files: List[str], output_file: str, output_page_info: str):
|
|||
except Exception as e:
|
||||
logger.exception('Exception in {}'.format(mets_file))
|
||||
|
||||
# Convert the mods_info List[Dict] to a pandas DataFrame
|
||||
# TODO
|
||||
# mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier")
|
||||
# Convert the mods_info SQL to a pandas DataFrame
|
||||
mods_info_df = pd.read_sql_query("SELECT * FROM mods_info", con, index_col="recordInfo_recordIdentifier")
|
||||
|
||||
# Save the DataFrame
|
||||
# TODO
|
||||
#logger.info('Writing DataFrame to {}'.format(output_file))
|
||||
#mods_info_df.to_parquet(output_file)
|
||||
logger.info('Writing DataFrame to {}'.format(output_file))
|
||||
mods_info_df.to_parquet(output_file)
|
||||
|
||||
# Convert page_info
|
||||
# TODO
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue