1
0
Fork 0
mirror of https://github.com/qurator-spk/modstool.git synced 2025-06-07 19:05:06 +02:00

🧹 Extract a function to convert list[dict] to a DataFrame

This commit is contained in:
Mike Gerber 2023-11-23 15:00:06 +01:00
parent 5c2dfa8505
commit 968572168e
2 changed files with 27 additions and 10 deletions

View file

@ -1,8 +1,9 @@
from itertools import groupby
import re
import warnings
from typing import List, Sequence, MutableMapping
from typing import List, Sequence, MutableMapping, Dict
import pandas as pd
import numpy as np
from lxml import etree as ET
@ -298,3 +299,26 @@ def flatten(d: MutableMapping, parent='', separator='_'):
return dict(items)
def dicts_to_df(data_list: List[Dict], *, index_column: str) -> pd.DataFrame:
    """
    Convert the given list of dicts to a Pandas DataFrame.

    The keys of the dicts make the columns. Columns appear in the order in
    which their keys are first seen while walking ``data_list`` front to
    back. A dict that lacks a given key contributes ``None`` for that column.

    Args:
        data_list: The dicts to convert, one per resulting row.
        index_column: Key whose value in each dict becomes that row's index
            label. The key also remains a regular column.

    Returns:
        A DataFrame with one row per dict in ``data_list``.

    Raises:
        KeyError: If any dict in ``data_list`` has no ``index_column`` key.
    """

    # Build columns from keys. A dict keeps insertion order, so this yields
    # the unique keys in first-seen order without rescanning a list for every
    # key of every dict.
    columns = list(dict.fromkeys(key for row in data_list for key in row))

    # Build data table; .get() fills keys missing from a row with None
    data = [[row.get(col) for col in columns] for row in data_list]

    # Build index; deliberately a hard lookup so a row without the index key
    # fails loudly instead of getting a None label
    index = [row[index_column] for row in data_list]

    return pd.DataFrame(data=data, index=index, columns=columns)

View file

@ -14,7 +14,7 @@ import click
import pandas as pd
from tqdm import tqdm
from .lib import sorted_groupby, TagGroup, ns, flatten
from .lib import sorted_groupby, TagGroup, ns, flatten, dicts_to_df
@ -404,14 +404,7 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls
#import traceback; traceback.print_exc()
# Convert the mods_info List[Dict] to a pandas DataFrame
columns = []
for m in mods_info:
for c in m.keys():
if c not in columns:
columns.append(c)
data = [[m.get(c) for c in columns] for m in mods_info]
index = [m['recordInfo_recordIdentifier'] for m in mods_info] # PPN
mods_info_df = pd.DataFrame(data=data, index=index, columns=columns)
mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier")
# Pickle the DataFrame
logger.info('Writing DataFrame to {}'.format(output_file))