mirror of
https://github.com/qurator-spk/modstool.git
synced 2025-06-09 11:49:55 +02:00
🚧 Write a Parquet file
This commit is contained in:
parent
03d86ce68a
commit
dd4febf24d
3 changed files with 12 additions and 11 deletions
|
@ -28,8 +28,8 @@ alto4pandas /path/to/a/directory/full/of/alto_files
|
||||||
## Example
|
## Example
|
||||||
In this example we convert the MODS metadata contained in the METS files in
|
In this example we convert the MODS metadata contained in the METS files in
|
||||||
`/srv/data/digisam_mets-sample-300` to a pandas DataFrame under
|
`/srv/data/digisam_mets-sample-300` to a pandas DataFrame under
|
||||||
`mods_info_df.pkl`. This file can then be read by your data scientist using
|
`mods_info_df.parquet`. This file can then be read by your data scientist using
|
||||||
`pd.read_pickle()`.
|
`pd.read_parquet()`.
|
||||||
|
|
||||||
~~~
|
~~~
|
||||||
% mods4pandas /srv/data/digisam_mets-sample-300
|
% mods4pandas /srv/data/digisam_mets-sample-300
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
click
|
click
|
||||||
pandas ~= 1.1.5
|
pandas
|
||||||
numpy < 2
|
numpy
|
||||||
tqdm
|
tqdm
|
||||||
lxml
|
lxml
|
||||||
openpyxl
|
openpyxl
|
||||||
|
pyarrow
|
||||||
|
|
|
@ -369,8 +369,8 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
|
||||||
|
|
||||||
@click.command()
|
@click.command()
|
||||||
@click.argument('mets_files', type=click.Path(exists=True), required=True, nargs=-1)
|
@click.argument('mets_files', type=click.Path(exists=True), required=True, nargs=-1)
|
||||||
@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output pickle file',
|
@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output file',
|
||||||
default='mods_info_df.pkl', show_default=True)
|
default='mods_info_df.parquet', show_default=True)
|
||||||
@click.option('--output-csv', type=click.Path(), help='Output CSV file')
|
@click.option('--output-csv', type=click.Path(), help='Output CSV file')
|
||||||
@click.option('--output-xlsx', type=click.Path(), help='Output Excel .xlsx file')
|
@click.option('--output-xlsx', type=click.Path(), help='Output Excel .xlsx file')
|
||||||
def process(mets_files: List[str], output_file: str, output_csv: str, output_xlsx: str):
|
def process(mets_files: List[str], output_file: str, output_csv: str, output_xlsx: str):
|
||||||
|
@ -436,9 +436,9 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls
|
||||||
# Convert the mods_info List[Dict] to a pandas DataFrame
|
# Convert the mods_info List[Dict] to a pandas DataFrame
|
||||||
mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier")
|
mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier")
|
||||||
|
|
||||||
# Pickle the DataFrame
|
# Save the DataFrame
|
||||||
logger.info('Writing DataFrame to {}'.format(output_file))
|
logger.info('Writing DataFrame to {}'.format(output_file))
|
||||||
mods_info_df.to_pickle(output_file)
|
mods_info_df.to_parquet(output_file)
|
||||||
if output_csv:
|
if output_csv:
|
||||||
logger.info('Writing CSV to {}'.format(output_csv))
|
logger.info('Writing CSV to {}'.format(output_csv))
|
||||||
mods_info_df.to_csv(output_csv)
|
mods_info_df.to_csv(output_csv)
|
||||||
|
@ -449,9 +449,9 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls
|
||||||
# Convert page_info
|
# Convert page_info
|
||||||
# XXX hardcoded filenames + other formats
|
# XXX hardcoded filenames + other formats
|
||||||
page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID"))
|
page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID"))
|
||||||
# Pickle the DataFrame
|
# Save the DataFrame
|
||||||
logger.info('Writing DataFrame to {}'.format("page_info_df.pkl"))
|
logger.info('Writing DataFrame to {}'.format("page_info_df.parquet"))
|
||||||
page_info_df.to_pickle("page_info_df.pkl")
|
page_info_df.to_parquet("page_info_df.parquet")
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue