mirror of
				https://github.com/qurator-spk/modstool.git
				synced 2025-11-04 03:14:14 +01:00 
			
		
		
		
	Remove direct CSV/Excel support
Remove direct CSV/Excel support from the CLI; README now has instructions to convert. Closes gh-40.
This commit is contained in:
		
							parent
							
								
									a1f333f4a4
								
							
						
					
					
						commit
						7122f0265f
					
				
					 3 changed files with 19 additions and 13 deletions
				
			
		
							
								
								
									
										12
									
								
								README.md
									
										
									
									
									
								
							
							
						
						
									
										12
									
								
								README.md
									
										
									
									
									
								
							| 
						 | 
					@ -28,6 +28,18 @@ mods4pandas /path/to/a/directory/containing/mets_files
 | 
				
			||||||
alto4pandas /path/to/a/directory/full/of/alto_files
 | 
					alto4pandas /path/to/a/directory/full/of/alto_files
 | 
				
			||||||
~~~
 | 
					~~~
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					### Conversion to other formats
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					CSV:
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
 | 
					python -c 'import pandas as pd; pd.read_parquet("mods_info_df.parquet").to_csv("mods_info_df.csv")'
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
 | 
					Excel (requires `XlsxWriter`):
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
 | 
					python -c 'import pandas as pd; pd.read_parquet("mods_info_df.parquet").to_excel("mods_info_df.xlsx"
 | 
				
			||||||
 | 
					, engine="xlsxwriter")'
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
## Example
 | 
					## Example
 | 
				
			||||||
In this example we convert the MODS metadata contained in the METS files in
 | 
					In this example we convert the MODS metadata contained in the METS files in
 | 
				
			||||||
`/srv/data/digisam_mets-sample-300` to a pandas DataFrame under
 | 
					`/srv/data/digisam_mets-sample-300` to a pandas DataFrame under
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -3,5 +3,5 @@ pandas
 | 
				
			||||||
numpy
 | 
					numpy
 | 
				
			||||||
tqdm
 | 
					tqdm
 | 
				
			||||||
lxml
 | 
					lxml
 | 
				
			||||||
openpyxl
 | 
					 | 
				
			||||||
pyarrow
 | 
					pyarrow
 | 
				
			||||||
 | 
					XlsxWriter
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -369,19 +369,19 @@ def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@click.command()
 | 
					@click.command()
 | 
				
			||||||
@click.argument('mets_files', type=click.Path(exists=True), required=True, nargs=-1)
 | 
					@click.argument('mets_files', type=click.Path(exists=True), required=True, nargs=-1)
 | 
				
			||||||
@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output file',
 | 
					@click.option('--output', '-o', 'output_file', type=click.Path(), help='Output Parquet file',
 | 
				
			||||||
              default='mods_info_df.parquet', show_default=True)
 | 
					              default='mods_info_df.parquet', show_default=True)
 | 
				
			||||||
@click.option('--output-csv', type=click.Path(), help='Output CSV file')
 | 
					@click.option('--output-page-info', type=click.Path(), help='Output page info Parquet file')
 | 
				
			||||||
@click.option('--output-xlsx', type=click.Path(), help='Output Excel .xlsx file')
 | 
					def process(mets_files: List[str], output_file: str, output_page_info: str):
 | 
				
			||||||
@click.option('--output-page-info', type=click.Path(), help='Save page info')
 | 
					 | 
				
			||||||
def process(mets_files: List[str], output_file: str, output_csv: str, output_xlsx: str, output_page_info: str):
 | 
					 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    A tool to convert the MODS metadata in INPUT to a pandas DataFrame.
 | 
					    A tool to convert the MODS metadata in INPUT to a pandas DataFrame.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    INPUT is assumed to be a METS document with MODS metadata. INPUT may optionally be a directory. The tool then reads
 | 
					    INPUT is assumed to be a METS document with MODS metadata. INPUT may optionally be a directory. The tool then reads
 | 
				
			||||||
    all files in the directory.
 | 
					    all files in the directory.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    mods4pandas writes two output files: A pickled pandas DataFrame and a CSV file with all conversion warnings.
 | 
					    mods4pandas writes two output files: A pandas DataFrame (as Parquet) and a CSV file with all conversion warnings.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Per-page information (e.g. structure information) can be output to a separate Parquet file.
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Extend file list if directories are given
 | 
					    # Extend file list if directories are given
 | 
				
			||||||
| 
						 | 
					@ -442,12 +442,6 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls
 | 
				
			||||||
    # Save the DataFrame
 | 
					    # Save the DataFrame
 | 
				
			||||||
    logger.info('Writing DataFrame to {}'.format(output_file))
 | 
					    logger.info('Writing DataFrame to {}'.format(output_file))
 | 
				
			||||||
    mods_info_df.to_parquet(output_file)
 | 
					    mods_info_df.to_parquet(output_file)
 | 
				
			||||||
    if output_csv:
 | 
					 | 
				
			||||||
        logger.info('Writing CSV to {}'.format(output_csv))
 | 
					 | 
				
			||||||
        mods_info_df.to_csv(output_csv)
 | 
					 | 
				
			||||||
    if output_xlsx:
 | 
					 | 
				
			||||||
        logger.info('Writing Excel .xlsx to {}'.format(output_xlsx))
 | 
					 | 
				
			||||||
        mods_info_df.to_excel(output_xlsx)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Convert page_info
 | 
					    # Convert page_info
 | 
				
			||||||
    if output_page_info:
 | 
					    if output_page_info:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue