Source code for crabnet.utils.estimatorselectionhelper

import warnings

import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV

plt.rcParams.update({"font.size": 16})


# %%
class EstimatorSelectionHelper:
    def __init__(self, models, params, random_seed=42):
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError(f"Missing estimator parameters: {missing_params}")
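    # A minimal construction sketch (illustrative, not from the original
    # source; RandomForestRegressor and the grid below are assumptions):
    # every key in `models` must also appear in `params`, or __init__
    # raises the ValueError above.
    #
    #     models = {"RandomForest": RandomForestRegressor()}
    #     params = {"RandomForest": {"max_depth": [None, 10], "n_estimators": [100]}}
    #     helper = EstimatorSelectionHelper(models, params)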
    def fit(
        self, X, y, cv=3, n_jobs=1, verbose=1, scoring=None, refit=False, random_seed=42
    ):
        if n_jobs == 1:
            warnings.warn(
                "n_jobs is currently set at 1, "
                "consider passing n_jobs=-1 or n_jobs=n_cores, "
                "where n_cores is the number of CPU cores you have",
                RuntimeWarning,
            )
        for key in self.keys:
            print(f"\nRunning GridSearchCV for {key}.")
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(
                model,
                params,
                cv=cv,
                n_jobs=n_jobs,
                verbose=verbose,
                scoring=scoring,
                refit=refit,
                return_train_score=True,
            )
            gs.fit(X, y)
            self.grid_searches[key] = gs
        self.scoring = scoring
        # map scorer values back to their short keys,
        # e.g. "neg_mean_absolute_error" -> "neg_MAE"; guard against the
        # default scoring=None, which is not iterable
        if scoring is not None:
            self.scoring_val2key = {scoring[k]: k for k in scoring}
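    # A fit sketch (illustrative assumption): `scoring` is expected to be a
    # dict mapping short names to scikit-learn scorer strings, since the
    # short keys ("neg_MAE", "r2") determine which cv_results_ columns
    # plot_gridsearch and score_summary look up.
    #
    #     scoring = {"neg_MAE": "neg_mean_absolute_error", "r2": "r2"}
    #     helper.fit(X, y, cv=3, n_jobs=-1, scoring=scoring)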
    def plot_gridsearch(self, model_name, elem_prop, mat_prop, fig_dir, gs):
        test_metric = "mean_test_neg_MAE"
        # Get CV parameters and results
        dims = pd.DataFrame(gs.cv_results_["params"])
        # for RandomForest: convert max_depth = None to string 'None'
        if "max_depth" in dims.columns:
            dims.loc[dims["max_depth"].isnull(), "max_depth"] = "None"
        dims["score"] = gs.cv_results_[test_metric]
        # pivot the two hyperparameter columns into a 2D grid of scores
        # (keyword arguments required by pandas >= 2.0)
        col_names = dims.columns.tolist()
        dims = dims.pivot(index=col_names[0], columns=col_names[1], values=col_names[2])
        dims_flipped = dims.iloc[::-1]
        # Plot and save gridsearch heatmap
        plt.figure(figsize=(6, 6))
        sns.heatmap(dims_flipped)
        plt.title(
            f"{test_metric}\n" f"{model_name} on {elem_prop} with {mat_prop}",
            fontsize=18,
        )
        plt.savefig(
            fig_dir + f"{model_name}_{elem_prop}_{mat_prop}.png",
            dpi=300,
            bbox_inches="tight",
        )
        plt.close("all")
        print(f"saved figure {model_name}_{elem_prop}_{mat_prop}.png")
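    # Note: the pivot above assumes a grid over exactly two hyperparameters,
    # e.g. (illustrative) {"max_depth": [None, 5, 10], "n_estimators": [50, 100]},
    # which yields a max_depth x n_estimators heatmap of the mean test neg_MAE.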
    def score_summary(self, ep, mp, fig_dir, sort_by="mean_test_r2"):
        print("***************** gridsearch done *****************")
        scoring_keys = self.scoring.keys()
        col_prefixes = ["mean_", "std_"]
        col_midfixes = ["train_", "test_"]
        col_suffixes = scoring_keys
        columns1 = ["estimator"]
        columns2 = [
            p + m + s for s in col_suffixes for m in col_midfixes for p in col_prefixes
        ]
        all_columns = (
            columns1 + columns2 + ["params"] + ["mean_fit_time", "mean_score_time"]
        )
        df = pd.DataFrame(columns=all_columns)
        df_best_models = pd.DataFrame(columns=all_columns)
        # Plot gridsearch results for each model/param/elem_prop/mat_prop
        # combination, and save dataframe of best results
        for m in self.grid_searches:
            self.plot_gridsearch(m, ep, mp, fig_dir, self.grid_searches[m])
            df_model = pd.DataFrame(columns=all_columns)
            print(f"Parsing results for {m}")
            res = self.grid_searches[m].cv_results_
            params = res["params"]
            df_model["estimator"] = [m] * len(params)
            for col in df_model.columns:
                if col != "estimator":
                    df_model[col] = res[col]
            best_run_idx = df_model["mean_test_neg_MAE"].idxmax()
            best_run = df_model.loc[[best_run_idx], :]
            df_best_models = pd.concat([df_best_models, best_run], axis=0)
            df = pd.concat([df, df_model], axis=0)
        df["elem_prop"] = ep
        df["mat_prop"] = mp
        df_best_models["elem_prop"] = ep
        df_best_models["mat_prop"] = mp
        return df, df_best_models
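

# %%
# End-to-end usage sketch (an illustrative assumption, not part of the
# original module): a single scikit-learn regressor searched over a small
# two-hyperparameter grid on synthetic data. "mat2vec" and "bulk_modulus"
# are placeholder elem_prop / mat_prop labels.
if __name__ == "__main__":
    import os

    from sklearn.datasets import make_regression
    from sklearn.ensemble import RandomForestRegressor

    X, y = make_regression(n_samples=200, n_features=10, random_state=42)

    models = {"RandomForest": RandomForestRegressor(random_state=42)}
    params = {"RandomForest": {"max_depth": [None, 5, 10], "n_estimators": [50, 100]}}
    scoring = {"neg_MAE": "neg_mean_absolute_error", "r2": "r2"}

    fig_dir = "figures/"  # plot_gridsearch concatenates filenames onto this path
    os.makedirs(fig_dir, exist_ok=True)

    helper = EstimatorSelectionHelper(models, params)
    helper.fit(X, y, cv=3, n_jobs=-1, scoring=scoring)
    df, df_best_models = helper.score_summary("mat2vec", "bulk_modulus", fig_dir)
    print(df_best_models)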