1
0
Fork 0
mirror of https://github.com/qurator-spk/modstool.git synced 2025-06-07 19:05:06 +02:00

🧹 Extract a function to convert list[dict] to a DataFrame

This commit is contained in:
Mike Gerber 2023-11-23 15:00:06 +01:00
parent 5c2dfa8505
commit 968572168e
2 changed files with 27 additions and 10 deletions

View file

@ -1,8 +1,9 @@
from itertools import groupby from itertools import groupby
import re import re
import warnings import warnings
from typing import List, Sequence, MutableMapping from typing import List, Sequence, MutableMapping, Dict
import pandas as pd
import numpy as np import numpy as np
from lxml import etree as ET from lxml import etree as ET
@ -298,3 +299,26 @@ def flatten(d: MutableMapping, parent='', separator='_'):
return dict(items) return dict(items)
def dicts_to_df(data_list: List[Dict], *, index_column: str) -> pd.DataFrame:
    """
    Convert the given list of dicts to a Pandas DataFrame.

    The keys of the dicts make the columns, ordered by first appearance
    across data_list. Each dict becomes one row; a key that is missing from
    a dict yields None in that cell. The index column is also kept as a
    regular column, because it is one of the dict keys.

    :param data_list: The dicts to convert, one per row.
    :param index_column: Key whose value in each dict becomes the index label
        of the corresponding row.
    :return: A DataFrame with one row per dict in data_list.
    :raises KeyError: If any dict in data_list lacks index_column.
    """
    # Build columns from keys. A dict preserves insertion order and drops
    # repeated keys, which gives the first-seen column order without a linear
    # "c not in columns" scan of a list for every key of every row.
    columns = list({c: None for m in data_list for c in m})

    # Build data table
    data = [[m.get(c) for c in columns] for m in data_list]

    # Build index; deliberately a hard lookup so a record without the index
    # key fails loudly instead of producing a None index label.
    index = [m[index_column] for m in data_list]

    return pd.DataFrame(data=data, index=index, columns=columns)

View file

@ -14,7 +14,7 @@ import click
import pandas as pd import pandas as pd
from tqdm import tqdm from tqdm import tqdm
from .lib import sorted_groupby, TagGroup, ns, flatten from .lib import sorted_groupby, TagGroup, ns, flatten, dicts_to_df
@ -404,14 +404,7 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls
#import traceback; traceback.print_exc() #import traceback; traceback.print_exc()
# Convert the mods_info List[Dict] to a pandas DataFrame # Convert the mods_info List[Dict] to a pandas DataFrame
columns = [] mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier")
for m in mods_info:
for c in m.keys():
if c not in columns:
columns.append(c)
data = [[m.get(c) for c in columns] for m in mods_info]
index = [m['recordInfo_recordIdentifier'] for m in mods_info] # PPN
mods_info_df = pd.DataFrame(data=data, index=index, columns=columns)
# Pickle the DataFrame # Pickle the DataFrame
logger.info('Writing DataFrame to {}'.format(output_file)) logger.info('Writing DataFrame to {}'.format(output_file))