from importlib.resources import open_text
import pandas as pd
from sklearn.model_selection import train_test_split
def get_data(
    module,
    fname="train.csv",
    mapper=None,
    groupby=True,
    dummy=False,
    split=True,
    val_size=0.2,
    test_size=0.0,
    random_state=42,
):
    """Load a CSV file bundled inside a package and optionally split it.

    Parameters
    ----------
    module : Module
        The package (module) that contains the data file, e.g. "train.csv".
        For example, the object obtained via
        `from CrabNet.data.materials_data import elasticity`.
    fname : str, optional
        Filename of the text file to open, by default "train.csv"
    mapper : dict, optional
        Column renamer for the pandas DataFrame (i.e. used as in
        `df.rename(columns=mapper)`). When `groupby` is True it is forwarded
        to `groupby_formula`; otherwise it is applied directly.
        By default None (no renaming).
    groupby : bool, optional
        Whether to pass the data through `groupby_formula` (called with
        `how="max"`) to filter identical compositions, by default True
    dummy : bool, optional
        Whether to pare down the data to at most the first 100 rows,
        by default False
    split : bool, optional
        Whether to split the data into train, val, and (optionally) test
        sets, by default True
    val_size : float, optional
        Validation dataset fraction (relative to the full dataset),
        by default 0.2
    test_size : float, optional
        Test dataset fraction (relative to the full dataset), by default 0.0
    random_state : int, optional
        Seed to use for the train/val/test split, by default 42

    Returns
    -------
    DataFrame
        If split == False, the full DataFrame is returned directly.
    DataFrame, DataFrame
        If test_size == 0 and split == True, the training and validation
        DataFrames are returned.
    DataFrame, DataFrame, DataFrame
        If test_size > 0 and split == True, the training, validation, and
        test DataFrames are returned.
    """
    # Context manager guarantees the resource handle is closed after parsing.
    with open_text(module, fname) as csv_file:
        df = pd.read_csv(csv_file)

    if groupby:
        # groupby_formula is defined elsewhere and receives the mapper itself.
        df = groupby_formula(df, how="max", mapper=mapper)
    elif mapper is not None:
        # Without grouping the mapper would otherwise be silently ignored.
        df = df.rename(columns=mapper)

    if dummy:
        # head(100) already copes with frames shorter than 100 rows.
        df = df.head(100)

    if not split:
        return df

    has_test = test_size > 0
    remainder = df
    if has_test:
        remainder, test_df = train_test_split(
            df, test_size=test_size, random_state=random_state
        )

    # val_size is expressed relative to the full dataset, so rescale it to
    # the fraction of what is left after the test rows were carved out.
    train_df, val_df = train_test_split(
        remainder,
        test_size=val_size / (1 - test_size),
        random_state=random_state,
    )

    if has_test:
        return train_df, val_df, test_df
    return train_df, val_df