🧹 Extract a function to convert list[dict] to a DataFrame

feat/page_info
Mike Gerber 7 months ago
parent 5c2dfa8505
commit 968572168e

@ -1,8 +1,9 @@
from itertools import groupby
import re
import warnings
from typing import List, Sequence, MutableMapping
from typing import List, Sequence, MutableMapping, Dict
import pandas as pd
import numpy as np
from lxml import etree as ET
@ -298,3 +299,26 @@ def flatten(d: MutableMapping, parent='', separator='_'):
return dict(items)
def dicts_to_df(data_list: List[Dict], *, index_column: str) -> pd.DataFrame:
    """
    Convert the given list of dicts to a Pandas DataFrame.

    The keys of the dicts make the columns, in order of first appearance
    across ``data_list``. Each dict becomes one row; for a key the dict does
    not have, ``None`` is passed to pandas as the cell value.

    The row index is taken from each dict's ``index_column`` entry. That entry
    is not removed, so it also remains available as a regular column.

    Raises ``KeyError`` if any dict in ``data_list`` lacks ``index_column``.
    """
    # Build columns from keys. A dict is used as an insertion-ordered set so
    # that de-duplication is a hash lookup per key instead of a scan over the
    # list of columns collected so far.
    columns = list(dict.fromkeys(key for row in data_list for key in row))

    # Build data table
    data = [[row.get(column) for column in columns] for row in data_list]

    # Build index
    index = [row[index_column] for row in data_list]

    return pd.DataFrame(data=data, index=index, columns=columns)

@ -14,7 +14,7 @@ import click
import pandas as pd
from tqdm import tqdm
from .lib import sorted_groupby, TagGroup, ns, flatten
from .lib import sorted_groupby, TagGroup, ns, flatten, dicts_to_df
@ -404,14 +404,7 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls
#import traceback; traceback.print_exc()
# Convert the mods_info List[Dict] to a pandas DataFrame
columns = []
for m in mods_info:
for c in m.keys():
if c not in columns:
columns.append(c)
data = [[m.get(c) for c in columns] for m in mods_info]
index = [m['recordInfo_recordIdentifier'] for m in mods_info] # PPN
mods_info_df = pd.DataFrame(data=data, index=index, columns=columns)
mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier")
# Pickle the DataFrame
logger.info('Writing DataFrame to {}'.format(output_file))

Loading…
Cancel
Save