Source code for crabnet.utils.modelselectionhelper

import os

from time import time

from crabnet.utils.utils import get_cbfv

from crabnet.utils.estimatorselectionhelper import EstimatorSelectionHelper


# %%
[docs]def modelselectionhelper( models, params, elem_props, mat_props_dir, mat_props, metrics_dir, fig_dir, scoring=None, n_jobs=1, cv=3, refit="neg_MAE", verbose=False, random_seed=42, ): if scoring is None: scoring = {"neg_MAE": "neg_mean_absolute_error"} for ep in elem_props: print("\n++++++++++++++++++++++++++++++++++++++++++++++++") print(f"currently using element property: {ep}") print("++++++++++++++++++++++++++++++++++++++++++++++++") ti_ep = time() for mp in mat_props: print(f"fitting {mp} with {ep} using {cv}-fold CV") ti_mp = time() trainpath = os.path.join(mat_props_dir, mp, "train.csv") valpath = os.path.join(mat_props_dir, mp, "val.csv") if not os.path.exists(trainpath) or not os.path.exists(valpath): trainpath = os.path.join(mat_props_dir, mp, "train0.csv") valpath = os.path.join(mat_props_dir, mp, "val0.csv") X, y, form, skipped = get_cbfv(trainpath, elem_prop=ep) X_val, y_val, form_val, skipped_val = get_cbfv(valpath, elem_prop=ep) # Sample the dataset for faster gridsearch n_samples = 2000 if X.shape[0] > n_samples: print( f"Sampling training data to {n_samples} samples " f"to speed up initial gridsearch" ) X = X.sample(n=n_samples) y = y.loc[X.index] form = form.loc[X.index] helper1 = EstimatorSelectionHelper(models, params) helper1.fit( X, y, scoring=scoring, n_jobs=n_jobs, cv=cv, refit=refit, verbose=verbose, random_seed=random_seed, ) output = helper1.score_summary(ep, mp, fig_dir, sort_by="mean_test_r2") score_summary, best_models = output print("\n************************************************") print(f"finished {mp} with {ep}") print("saving score summary and best models files") print("************************************************") outpath_all = os.path.join(metrics_dir, f"{ep}_{mp}.csv") score_summary.to_csv(outpath_all, index=False) outpath_best = os.path.join(metrics_dir, f"best_{ep}_{mp}.csv") best_models.to_csv(outpath_best, index=False) dt_mp = time() - ti_mp print(f"time elapsed for {mp} with {ep}: {dt_mp:0.4f} s") dt_ep = time() - ti_ep print( f"time elapsed for all material properties " f"using {ep}: {dt_ep:0.4f} s" ) return helper1