🧹 Extract a function to convert list[dict] to a DataFrame

master
Mike Gerber 12 months ago
parent 5c2dfa8505
commit 968572168e

@ -1,8 +1,9 @@
from itertools import groupby from itertools import groupby
import re import re
import warnings import warnings
from typing import List, Sequence, MutableMapping from typing import List, Sequence, MutableMapping, Dict
import pandas as pd
import numpy as np import numpy as np
from lxml import etree as ET from lxml import etree as ET
@ -298,3 +299,26 @@ def flatten(d: MutableMapping, parent='', separator='_'):
return dict(items) return dict(items)
def dicts_to_df(data_list: List[Dict], *, index_column: str) -> pd.DataFrame:
    """
    Convert the given list of dicts to a Pandas DataFrame.

    The keys of the dicts make the columns, ordered by first appearance
    across ``data_list``. A dict that lacks a given key contributes ``None``
    for that cell. The row index is built from each dict's value for
    ``index_column``; that key also remains a regular column.

    Raises:
        KeyError: if any dict in ``data_list`` has no ``index_column`` key.
    """
    # Build columns from keys. A dict keeps insertion order and deduplicates
    # in O(1) per key, avoiding a linear "c not in columns" scan of a list.
    columns = list(dict.fromkeys(c for m in data_list for c in m))

    # Build data table: one row per dict, None where a key is absent
    data = [[m.get(c) for c in columns] for m in data_list]

    # Build index (strict lookup: every dict must carry index_column)
    index = [m[index_column] for m in data_list]

    return pd.DataFrame(data=data, index=index, columns=columns)

@ -14,7 +14,7 @@ import click
import pandas as pd import pandas as pd
from tqdm import tqdm from tqdm import tqdm
from .lib import sorted_groupby, TagGroup, ns, flatten from .lib import sorted_groupby, TagGroup, ns, flatten, dicts_to_df
@ -404,14 +404,7 @@ def process(mets_files: List[str], output_file: str, output_csv: str, output_xls
#import traceback; traceback.print_exc() #import traceback; traceback.print_exc()
# Convert the mods_info List[Dict] to a pandas DataFrame # Convert the mods_info List[Dict] to a pandas DataFrame
columns = [] mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier")
for m in mods_info:
for c in m.keys():
if c not in columns:
columns.append(c)
data = [[m.get(c) for c in columns] for m in mods_info]
index = [m['recordInfo_recordIdentifier'] for m in mods_info] # PPN
mods_info_df = pd.DataFrame(data=data, index=index, columns=columns)
# Pickle the DataFrame # Pickle the DataFrame
logger.info('Writing DataFrame to {}'.format(output_file)) logger.info('Writing DataFrame to {}'.format(output_file))

Loading…
Cancel
Save