Source code for crabnet.utils.figures

import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.cm as cm

from matplotlib.ticker import AutoMinorLocator
from matplotlib.colors import Normalize

from .composition import _element_composition
from scipy import stats

plt.rcParams.update({"font.size": 14})


# %%
[docs]def act_pred(
    y_act,
    y_pred,
    name="example",
    x_hist=True,
    y_hist=True,
    reg_line=True,
    save_dir=None,
):

    mec = "#2F4F4F"
    mfc = "#C0C0C0"

    plt.figure(1, figsize=(4, 4))
    left, width = 0.1, 0.65

    bottom, height = 0.1, 0.65
    bottom_h = left_h = left + width
    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.15]
    rect_histy = [left_h, bottom, 0.15, height]

    ax2 = plt.axes(rect_scatter)
    ax2.tick_params(direction="in", length=7, top=True, right=True)

    # add minor ticks
    minor_locator_x = AutoMinorLocator(2)
    minor_locator_y = AutoMinorLocator(2)
    ax2.get_xaxis().set_minor_locator(minor_locator_x)
    ax2.get_yaxis().set_minor_locator(minor_locator_y)
    plt.tick_params(which="minor", direction="in", length=4, right=True, top=True)

    # feel free to change the colors here.
    ax2.plot(
        y_act, y_pred, "o", mfc=mfc, alpha=0.5, label=None, mec=mec, mew=1.2, ms=5.2
    )

    ax2.plot(
        [-(10**9), 10**9], [-(10**9), 10**9], "k--", alpha=0.8, label="ideal"
    )

    ax2.set_ylabel("Predicted value (Units)")
    ax2.set_xlabel("Actual value (Units)")
    x_range = max(y_act) - min(y_act)
    ax2.set_xlim(max(y_act) - x_range * 1.05, min(y_act) + x_range * 1.05)

    ax2.set_ylim(max(y_act) - x_range * 1.05, min(y_act) + x_range * 1.05)

    ax1 = plt.axes(rect_histx)
    ax1_n, ax1_bins, ax1_patches = ax1.hist(
        y_act, bins=31, density=True, color=mfc, edgecolor=mec, alpha=0
    )
    ax1.set_xticks([])
    ax1.set_yticks([])
    ax1.set_xlim(ax2.get_xlim())
    ax1.axis("off")

    if x_hist:
        [p.set_alpha(1.0) for p in ax1_patches]

    ax3 = plt.axes(rect_histy)
    ax3_n, ax3_bins, ax3_patches = ax3.hist(
        y_pred,
        bins=31,
        density=True,
        color=mfc,
        edgecolor=mec,
        orientation="horizontal",
        alpha=0,
    )
    ax3.set_xticks([])
    ax3.set_yticks([])
    ax3.set_ylim(ax2.get_ylim())
    ax3.axis("off")

    if y_hist:
        [p.set_alpha(1.0) for p in ax3_patches]

    if reg_line:
        polyfit = np.polyfit(y_act, y_pred, deg=1)
        reg_ys = np.poly1d(polyfit)(np.unique(y_act))
        ax2.plot(np.unique(y_act), reg_ys, alpha=0.8, label="linear fit")

    ax2.legend(loc=2, framealpha=0.35, handlelength=1.5)
    plt.draw()

    if save_dir is not None:
        fig_name = f"{save_dir}/{name}_act_pred.png"
        os.makedirs(save_dir, exist_ok=True)
        plt.savefig(fig_name, bbox_inches="tight", dpi=300)
    plt.draw()
    plt.pause(0.001)
    plt.close()


[docs]def residual(y_act, y_pred, name="example", save_dir=None):

    mec = "#2F4F4F"
    mfc = "#C0C0C0"

    y_act = np.array(y_act)
    y_pred = np.array(y_pred)

    xmin = np.min([y_act]) * 0.9
    xmax = np.max([y_act]) / 0.9

    y_err = y_pred - y_act
    ymin = np.min([y_err]) * 0.9
    ymax = np.max([y_err]) / 0.9

    fig, ax = plt.subplots(figsize=(4, 4))

    ax.plot(y_act, y_err, "o", mec=mec, mfc=mfc, alpha=0.5, label=None, mew=1.2, ms=5.2)
    ax.plot([xmin, xmax], [0, 0], "k--", alpha=0.8, label="ideal")

    ax.set_ylabel("Residual error (Units)")
    ax.set_xlabel("Actual value (Units)")
    ax.legend(loc="lower right")

    minor_locator_x = AutoMinorLocator(2)
    minor_locator_y = AutoMinorLocator(2)
    ax.get_xaxis().set_minor_locator(minor_locator_x)
    ax.get_yaxis().set_minor_locator(minor_locator_y)

    ax.tick_params(right=True, top=True, direction="in", length=7)
    ax.tick_params(which="minor", right=True, top=True, direction="in", length=4)

    ax.set_xlim(xmin, xmax)
    ax.set_ylim(ymin, ymax)

    if save_dir is not None:
        fig_name = f"{save_dir}/{name}_residual.png"
        os.makedirs(save_dir, exist_ok=True)
        plt.savefig(fig_name, bbox_inches="tight", dpi=300)
    plt.draw()
    plt.pause(0.001)
    plt.close()


[docs]def residual_hist(y_act, y_pred, name="example", save_dir=None):

    mec = "#2F4F4F"
    mfc = "#C0C0C0"

    fig, ax = plt.subplots(figsize=(4, 4))
    y_err = y_pred - y_act
    kde_act = stats.gaussian_kde(y_err)
    x_range = np.linspace(min(y_err), max(y_err), 1000)

    ax.hist(y_err, color=mfc, bins=35, alpha=1, edgecolor=mec, density=True)
    ax.plot(x_range, kde_act(x_range), "-", lw=1.2, color="k", label="kde")

    ax.set_xlabel("Residual error (Units)")
    plt.legend(loc=2, framealpha=0.35, handlelength=1.5)

    ax.tick_params(direction="in", length=7, top=True, right=True)

    minor_locator_x = AutoMinorLocator(2)
    minor_locator_y = AutoMinorLocator(2)
    ax.get_xaxis().set_minor_locator(minor_locator_x)
    ax.get_yaxis().set_minor_locator(minor_locator_y)
    plt.tick_params(which="minor", direction="in", length=4, right=True, top=True)

    if save_dir is not None:
        fig_name = f"{save_dir}/{name}_residual_hist.png"
        os.makedirs(save_dir, exist_ok=True)
        plt.savefig(fig_name, bbox_inches="tight", dpi=300)

    plt.draw()
    plt.pause(0.001)
    plt.close()


[docs]def loss_curve(x_data, train_err, val_err, name="example", save_dir=None):

    mec1 = "#2F4F4F"
    mfc1 = "#C0C0C0"
    mec2 = "maroon"
    mfc2 = "pink"

    fig, ax = plt.subplots(figsize=(4, 4))

    ax.plot(
        x_data,
        train_err,
        "-",
        color=mec1,
        marker="o",
        mec=mec1,
        mfc=mfc1,
        ms=4,
        alpha=0.5,
        label="train",
    )
    ax.plot(
        x_data,
        val_err,
        "--",
        color=mec2,
        marker="s",
        mec=mec2,
        mfc=mfc2,
        ms=4,
        alpha=0.5,
        label="validation",
    )

    max_val_err = max(val_err)
    ax.axhline(max_val_err, color="b", linestyle="--", alpha=0.3)

    ax.set_xlabel("Number of training epochs")
    ax.set_ylabel("Loss (Units)")
    ax.set_ylim(0, 2 * np.mean(val_err))

    ax.legend(loc=1, framealpha=0.35, handlelength=1.5)

    minor_locator_x = AutoMinorLocator(2)
    minor_locator_y = AutoMinorLocator(2)
    ax.get_xaxis().set_minor_locator(minor_locator_x)
    ax.get_yaxis().set_minor_locator(minor_locator_y)

    ax.tick_params(right=True, top=True, direction="in", length=7)
    ax.tick_params(which="minor", right=True, top=True, direction="in", length=4)

    if save_dir is not None:
        fig_name = f"{save_dir}/{name}_loss_curve.png"
        os.makedirs(save_dir, exist_ok=True)
        plt.savefig(fig_name, bbox_inches="tight", dpi=300)
    plt.draw()
    plt.pause(0.001)
    plt.close()


[docs]def element_prevalence(
    formulae, name="example", save_dir=None, log_scale=False, ptable_fig=True
):
    ptable = pd.read_csv("ML_figures/element_properties/ptable.csv")
    ptable.index = ptable["symbol"].values
    elem_tracker = ptable["count"]
    n_row = ptable["row"].max()
    n_column = ptable["column"].max()

    for formula in formulae:
        formula_dict = _element_composition(formula)
        elem_count = pd.Series(formula_dict, name="count")
        elem_tracker = elem_tracker.add(elem_count, fill_value=0)

    if ptable_fig:
        fig, ax = plt.subplots(figsize=(n_column, n_row))
        rows = ptable["row"]
        columns = ptable["column"]
        symbols = ptable["symbol"]
        rw = 0.9  # rectangle width (rw)
        rh = rw  # rectangle height (rh)
        for row, column, symbol in zip(rows, columns, symbols):
            row = ptable["row"].max() - row
            cmap = cm.YlGn
            count_min = elem_tracker.min()
            count_max = elem_tracker.max()
            norm = Normalize(vmin=count_min, vmax=count_max)
            count = elem_tracker[symbol]
            if log_scale:
                norm = Normalize(vmin=np.log(1), vmax=np.log(count_max))
                if count != 0:
                    count = np.log(count)
            color = cmap(norm(count))
            if count == 0:
                color = "silver"
            if row < 3:
                row += 0.5
            rect = patches.Rectangle(
                (column, row),
                rw,
                rh,
                linewidth=1.5,
                edgecolor="gray",
                facecolor=color,
                alpha=1,
            )

            plt.text(
                column + rw / 2,
                row + rw / 2,
                symbol,
                horizontalalignment="center",
                verticalalignment="center",
                fontsize=20,
                fontweight="semibold",
                color="k",
            )

            ax.add_patch(rect)

        granularity = 20
        for i in range(granularity):
            value = int(round((i) * count_max / (granularity - 1)))
            if log_scale:
                if value != 0:
                    value = np.log(value)
            color = cmap(norm(value))
            if value == 0:
                color = "silver"
            length = 9
            x_offset = 3.5
            y_offset = 7.8
            x_loc = i / (granularity) * length + x_offset
            width = length / granularity
            height = 0.35
            rect = patches.Rectangle(
                (x_loc, y_offset),
                width,
                height,
                linewidth=1.5,
                edgecolor="gray",
                facecolor=color,
                alpha=1,
            )

            if i in [0, 4, 9, 14, 19]:
                text = f"{value:0.0f}"
                if log_scale:
                    text = f"{np.exp(value):0.1e}".replace("+", "")
                plt.text(
                    x_loc + width / 2,
                    y_offset - 0.4,
                    text,
                    horizontalalignment="center",
                    verticalalignment="center",
                    fontweight="semibold",
                    fontsize=20,
                    color="k",
                )

            ax.add_patch(rect)

        plt.text(
            x_offset + length / 2,
            y_offset + 0.7,
            "log(Element Count)" if log_scale else "Element Count",
            horizontalalignment="center",
            verticalalignment="center",
            fontweight="semibold",
            fontsize=20,
            color="k",
        )

        ax.set_ylim(-0.15, n_row + 0.1)
        ax.set_xlim(0.85, n_column + 1.1)

        # fig.patch.set_visible(False)
        ax.axis("off")

        if save_dir is not None:
            fig_name = f"{save_dir}/{name}_ptable.png"
            os.makedirs(save_dir, exist_ok=True)
            plt.savefig(fig_name, bbox_inches="tight", dpi=300)
        plt.draw()
        plt.pause(0.001)
        plt.close()

    if not ptable_fig:
        fig, ax = plt.subplots(figsize=(15, 6))
        non_zero = elem_tracker[elem_tracker != 0].sort_values(ascending=False)
        if log_scale:
            non_zero = np.log(non_zero)
        non_zero.plot.bar(width=0.7, edgecolor="k")

        minor_locator_y = AutoMinorLocator(2)
        ax.get_yaxis().set_minor_locator(minor_locator_y)

        ax.set_ylabel("Element Count")
        if log_scale:
            ax.set_ylabel("log(Element Count)")

        ax.tick_params(right=True, top=True, direction="in", length=7)
        ax.tick_params(which="minor", right=True, top=True, direction="in", length=4)

        if save_dir is not None:
            fig_name = f"{save_dir}/{name}_elem_hist.png"
            os.makedirs(save_dir, exist_ok=True)
            plt.savefig(fig_name, bbox_inches="tight", dpi=300)
        plt.draw()
        plt.pause(0.001)
        plt.close()


# %%
if __name__ == "__main__":
    # read in example act vs. pred data
    df_act_pred = pd.read_csv("example_data/act_pred.csv")
    y_act, y_pred = df_act_pred.iloc[:, 1], df_act_pred.iloc[:, 2]

    act_pred(y_act, y_pred, reg_line=True, save_dir="example_figures")

    act_pred(
        y_act,
        y_pred,
        name="example_no_hist",
        x_hist=False,
        y_hist=False,
        reg_line=True,
        save_dir="example_figures",
    )

    residual(y_act, y_pred, save_dir="example_figures")

    residual_hist(y_act, y_pred, save_dir="example_figures")

    # read in loss curve data
    df_lc = pd.read_csv("example_data/training_progress.csv")
    epoch = df_lc["epoch"]
    train_err, val_err = df_lc["mae_train"], df_lc["mae_val"]

    loss_curve(epoch, train_err, val_err, save_dir="example_figures")

    # element prevalence
    formula = df_act_pred.iloc[:, 0]

    element_prevalence(formula, save_dir="example_figures", log_scale=False)
    element_prevalence(
        formula, save_dir="example_figures", name="example_log", log_scale=True
    )

    plt.rcParams.update({"font.size": 12})
    element_prevalence(
        formula, save_dir="example_figures", ptable_fig=False, log_scale=False
    )
    element_prevalence(
        formula,
        save_dir="example_figures",
        name="example_log",
        ptable_fig=False,
        log_scale=True,
    )