mirror of
				https://github.com/qurator-spk/modstool.git
				synced 2025-11-04 03:14:14 +01:00 
			
		
		
		
	🧹 Extract a function to convert list[dict] to a DataFrame
This commit is contained in:
		
							parent
							
								
									5c2dfa8505
								
							
						
					
					
						commit
						968572168e
					
				
					 2 changed files with 27 additions and 10 deletions
				
			
		| 
						 | 
					@ -1,8 +1,9 @@
 | 
				
			||||||
from itertools import groupby
 | 
					from itertools import groupby
 | 
				
			||||||
import re
 | 
					import re
 | 
				
			||||||
import warnings
 | 
					import warnings
 | 
				
			||||||
from typing import List, Sequence, MutableMapping
 | 
					from typing import List, Sequence, MutableMapping, Dict
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import pandas as pd
 | 
				
			||||||
import numpy as np
 | 
					import numpy as np
 | 
				
			||||||
from lxml import etree as ET
 | 
					from lxml import etree as ET
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -298,3 +299,26 @@ def flatten(d: MutableMapping, parent='', separator='_'):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    return dict(items)
 | 
					    return dict(items)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def dicts_to_df(data_list: List[Dict], *, index_column: str) -> pd.DataFrame:
					    """
					    Convert the given list of dicts to a Pandas DataFrame.

					    The keys of the dicts make the columns, ordered by first appearance
					    while walking data_list front to back. Each dict makes one row; where
					    a dict lacks a key, dict.get() supplies None for that cell.

					    :param data_list: the dicts to convert, one per row
					    :param index_column: key whose value in each dict is used as that row's
					        index label (the key is still kept as a regular column)
					    :return: DataFrame with one row per dict in data_list
					    :raises KeyError: if any dict in data_list has no index_column key
					    """

					    # Build columns from keys. dict.fromkeys() drops duplicates while
					    # keeping first-seen order, so no list has to be scanned per key.
					    columns = list(dict.fromkeys(c for m in data_list for c in m))

					    # Build data table
					    data = [[m.get(c) for c in columns] for m in data_list]

					    # Build index
					    index = [m[index_column] for m in data_list]

					    return pd.DataFrame(data=data, index=index, columns=columns)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -14,7 +14,7 @@ import click
 | 
				
			||||||
import pandas as pd
 | 
					import pandas as pd
 | 
				
			||||||
from tqdm import tqdm
 | 
					from tqdm import tqdm
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .lib import sorted_groupby, TagGroup, ns, flatten
 | 
					from .lib import sorted_groupby, TagGroup, ns, flatten, dicts_to_df
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -404,14 +404,7 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls
 | 
				
			||||||
                #import traceback; traceback.print_exc()
 | 
					                #import traceback; traceback.print_exc()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Convert the mods_info List[Dict] to a pandas DataFrame
 | 
					    # Convert the mods_info List[Dict] to a pandas DataFrame
 | 
				
			||||||
    columns = []
 | 
					    mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier")
 | 
				
			||||||
    for m in mods_info:
 | 
					 | 
				
			||||||
        for c in m.keys():
 | 
					 | 
				
			||||||
            if c not in columns:
 | 
					 | 
				
			||||||
                columns.append(c)
 | 
					 | 
				
			||||||
    data = [[m.get(c) for c in columns] for m in mods_info]
 | 
					 | 
				
			||||||
    index = [m['recordInfo_recordIdentifier'] for m in mods_info]  # PPN
 | 
					 | 
				
			||||||
    mods_info_df = pd.DataFrame(data=data, index=index, columns=columns)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Pickle the DataFrame
 | 
					    # Pickle the DataFrame
 | 
				
			||||||
    logger.info('Writing DataFrame to {}'.format(output_file))
 | 
					    logger.info('Writing DataFrame to {}'.format(output_file))
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue