Source code for tablemage._src.ml.predict.regression.mlreg_report

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from typing import Literal
import warnings
from .base import BaseR
from ....data.datahandler import DataHandler
from ....metrics.visualization import plot_obs_vs_pred
from ....display.print_utils import (
    print_wrapped,
    color_text,
    bold_text,
    list_to_string,
    fill_ignore_format,
    quote_and_color,
    format_two_column,
)
from ....display.print_options import print_options
from ....feature_selection import BaseFSR, VotingSelectionReport

warnings.simplefilter("ignore", category=UserWarning)


class SingleModelSingleDatasetMLRegReport:
    """
    Class for generating regression-relevant plots and
    tables for a single machine learning model on a single dataset.
    """

    def __init__(self, model: BaseR, dataset: Literal["train", "test"]):
        """
        Initializes a SingleModelSingleDatasetMLReport object.

        Parameters
        ----------
        model : BaseRegression
            The data for the model must already be
            specified. The model should already be trained on the specified data.

        dataset : Literal['train', 'test']
        """
        self._model = model
        if dataset not in ["train", "test"]:
            raise ValueError('dataset must be either "train" or "test".')
        self._dataset = dataset

    def metrics(self) -> pd.DataFrame:
        """Returns a DataFrame containing the goodness-of-fit statistics
        for the model on the specified data.

        Returns
        ----------
        pd.DataFrame
        """
        if self._dataset == "train":
            return (
                self._model._train_scorer.stats_df()
                .astype(float)
                .round(print_options._n_decimals)
            )
        else:
            return (
                self._model._test_scorer.stats_df()
                .astype(float)
                .round(print_options._n_decimals)
            )

    def cv_metrics(self, average_across_folds: bool = True) -> pd.DataFrame | None:
        """Returns a DataFrame containing the cross-validated goodness-of-fit
        statistics for the model on the specified data.

        Parameters
        ----------
        average_across_folds : bool
            Default: True. If True, returns a DataFrame
            containing goodness-of-fit statistics averaged across all folds.
            Otherwise, returns a DataFrame containing goodness-of-fit
            statistics for each fold.

        Returns
        ----------
        pd.DataFrame | None
            None is returned if cross validation fit statistics are not available.
        """
        if not self._model.is_cross_validated():
            print_wrapped(
                "Cross validation statistics are not available "
                + "for models that are not cross-validated.",
                type="WARNING",
            )
            return None
        if self._dataset == "train":
            if average_across_folds:
                return (
                    self._model._cv_scorer.stats_df()
                    .astype(float)
                    .round(print_options._n_decimals)
                )
            else:
                return (
                    self._model._cv_scorer.cv_stats_df()
                    .astype(float)
                    .round(print_options._n_decimals)
                )
        else:
            print_wrapped(
                "Cross validation statistics are not available for test data.",
                type="WARNING",
            )
            return None

    def plot_obs_vs_pred(
        self, figsize: tuple[float, float] = (5, 5), ax: plt.Axes | None = None
    ) -> plt.Figure:
        """Returns a figure that is a scatter plot of the observed (y-axis) and
        predicted (x-axis) values.

        Parameters
        ----------
        figsize : tuple[float, float]
            Default: (5, 5). The size of the figure.

        ax : plt.Axes | None
            Default: None. The axes on which to plot the figure. If None,
            a new figure is created.

        Returns
        -------
        plt.Figure
        """
        if self._dataset == "train":
            y_pred = self._model._train_scorer._y_pred
            y_true = self._model._train_scorer._y_true
        else:
            y_pred = self._model._test_scorer._y_pred
            y_true = self._model._test_scorer._y_true
        return plot_obs_vs_pred(y_pred, y_true, self._model._name, figsize, ax)


class SingleModelMLRegReport:
    """SingleModelMLRegReport: generates regression-relevant plots and
    tables for a single machine learning model.
    """

    def __init__(self, model: BaseR):
        """
        Initializes a SingleModelMLRegReport object.

        Parameters
        ----------
        model : BaseR
            The data for the model must already be specified.
            The model should already be trained on the specified data.
        """
        self._model = model

    def train_report(self) -> SingleModelSingleDatasetMLRegReport:
        """Returns a SingleModelSingleDatasetMLReport object for the training data.

        Returns
        -------
        SingleModelSingleDatasetMLReport
        """
        return SingleModelSingleDatasetMLRegReport(self._model, "train")

    def test_report(self) -> SingleModelSingleDatasetMLRegReport:
        """Returns a SingleModelSingleDatasetMLReport object for the test data.

        Returns
        -------
        SingleModelSingleDatasetMLReport
        """
        return SingleModelSingleDatasetMLRegReport(self._model, "test")

    def model(self) -> BaseR:
        """Returns the model.

        Returns
        -------
        BaseR
        """
        return self._model

    def plot_obs_vs_pred(
        self,
        dataset: Literal["train", "test"],
        figsize: tuple[float, float] = (5, 5),
        ax: plt.Axes | None = None,
    ) -> plt.Figure:
        """Returns a figure that is a scatter plot of the observed (y-axis) and
        predicted (x-axis) values for the specified dataset.

        Parameters
        ----------
        dataset : Literal['train', 'test']
            The dataset for which to plot the observed vs predicted values.

        figsize : tuple[float, float]
            Default: (5, 5). The size of the figure.

        ax : plt.Axes | None
            Default: None. The axes on which to plot the figure. If None,
            a new figure is created.

        Returns
        -------
        plt.Figure
        """
        if dataset == "train":
            return self.train_report().plot_obs_vs_pred(figsize, ax)
        elif dataset == "test":
            return self.test_report().plot_obs_vs_pred(figsize, ax)
        else:
            raise ValueError('dataset must be either "train" or "test".')

    def fs_report(self) -> VotingSelectionReport | None:
        """Returns the feature selection report. If feature selectors were
        specified at the model level or not at all, then this method will return None.

        Returns
        -------
        VotingSelectionReport | None
            None is returned if no feature selectors were specified.
        """
        return self._model.fs_report()

    def feature_importance(self) -> pd.DataFrame | None:
        """Returns the feature importances for the model. If the model does not
        have feature importances, the coefficients are returned instead.
        If the model does not have feature importances or coefficients,
        None is returned.

        Returns
        -------
        pd.DataFrame | None
            None is returned if the model does not have feature importances.
        """
        return (
            self._model.feature_importance()
            .astype(float)
            .round(print_options._n_decimals)
        )



[docs]
class MLRegressionReport:
    """Class for reporting model goodness of fit.
    Fits the model based on provided DataHandler.
    """

    def __init__(
        self,
        models: list[BaseR],
        datahandler: DataHandler,
        target: str,
        predictors: list[str],
        feature_selectors: list[BaseFSR] | None = None,
        max_n_features: int | None = None,
        outer_cv: int | None = None,
        outer_cv_seed: int = 42,
        verbose: bool = True,
    ):
        """MLRegressionReport.
        Fits the model based on provided DataHandler.

        Parameters
        ----------
        models : list[BaseR]
            The models will be trained by the MLRegressionReport object.

        datahandler : DataHandler
            The DataHandler object that contains the data.

        target : str
            The name of the target variable.

        predictors : list[str]
            The names of the predictor variables.

        feature_selectors : list[BaseFSR] | None
            Default: None.
            The feature selectors for voting selection. Feature selectors
            can be used to select the most important predictors.

        max_n_features : int | None
            Default: None.
            Maximum number of predictors to utilize. Ignored if feature_selectors
            is None.

        outer_cv : int | None
            Default: None.
            If not None, reports training scores via nested k-fold CV.

        outer_cv_seed : int
            Default: 42. The random seed for the outer cross validation loop.

        verbose : bool
            Default: True. If True, prints progress.
        """
        self._models: list[BaseR] = models

        for model in self._models:
            if not isinstance(model, BaseR):
                raise ValueError(
                    f"Model {quote_and_color(str(model))} is not an instance "
                    "of BaseR. All models must be instances of BaseR."
                )

        self._id_to_model = {}
        for model in models:
            if model._name in self._id_to_model:
                raise ValueError(
                    f"Duplicate model name: {quote_and_color(model._name)}."
                )
            self._id_to_model[model._name] = model

        self._feature_selection_report = None
        self._feature_selectors = feature_selectors

        self._y_var = target
        self._predictors = predictors
        self._X_vars = predictors

        self._emitter = datahandler.train_test_emitter(y_var=target, X_vars=predictors)
        if feature_selectors is not None:
            for feature_selector in feature_selectors:
                if not isinstance(feature_selector, BaseFSR):
                    raise ValueError(
                        f"Feature selector {quote_and_color(model._name)} "
                        "is not an instance of BaseFSR. "
                        "All feature selectors must be instances of BaseFSR."
                    )

            self._feature_selection_report = VotingSelectionReport(
                selectors=feature_selectors,
                dataemitter=self._emitter,
                max_n_features=max_n_features,
                verbose=verbose,
            )
            self._X_vars = self._feature_selection_report.top_features()
            self._emitter.select_predictors(self._X_vars)

        self._emitters = None

        if outer_cv is not None:
            self._emitters = datahandler.kfold_emitters(
                y_var=target,
                X_vars=predictors,
                n_folds=outer_cv,
                shuffle=True,
                random_state=outer_cv_seed,
            )
            if feature_selectors is not None:
                for emitter in self._emitters:
                    fold_selection_report = VotingSelectionReport(
                        selectors=feature_selectors,
                        dataemitter=emitter,
                        max_n_features=max_n_features,
                        verbose=verbose,
                    )
                    emitter.select_predictors(fold_selection_report.top_features())

        self._verbose = verbose

        for model in self._models:
            if self._verbose:
                print_wrapped(
                    f"Fitting model {quote_and_color(model._name)}.",
                    type="UPDATE",
                )
            model.specify_data(
                dataemitter=self._emitter,
                dataemitters=self._emitters,
            )

            model.fit(verbose=self._verbose)

            if (
                model._feature_selection_report is not None
                and self._feature_selection_report is not None
            ):
                if self._verbose:
                    print_wrapped(
                        "Feature selectors were specified for all models as well as "
                        f"for the model {quote_and_color(model._name)}. "
                        f"The feature selection report attributed "
                        f"to {quote_and_color(model._name)} "
                        "will be for the model-specific feature selectors. "
                        "Note that the feature selectors for all models "
                        "were used to select a subset of the predictors first. "
                        "Then, the model-specific feature selectors were used to "
                        "select a subset of the predictors from the subset selected "
                        "by the feature selectors for all models.",
                        type="WARNING",
                        level="INFO",
                    )

            if model._feature_selection_report is None:
                model._set_voting_selection_report(
                    voting_selection_report=self._feature_selection_report
                )

            if self._verbose:
                print_wrapped(
                    f"Successfully evaluated model {quote_and_color(model._name)}.",
                    type="UPDATE",
                )

        self._id_to_report = {
            model._name: SingleModelMLRegReport(model) for model in models
        }

    def _model_report(self, model_id: str) -> SingleModelMLRegReport:
        """Returns the SingleModelMLRegReport object for the specified model.

        Parameters
        ----------
        model_id : str
            The id of the model.

        Returns
        -------
        SingleModelMLRegReport
        """
        if model_id not in self._id_to_report:
            raise ValueError(f"Model {model_id} not found.")
        return self._id_to_report[model_id]


[docs]
    def model(self, model_id: str) -> BaseR:
        """Returns the model with the specified id.

        Parameters
        ----------
        model_id : str
            The id of the model.

        Returns
        -------
        BaseR
        """
        if model_id not in self._id_to_model:
            raise ValueError(f"Model {model_id} not found.")
        return self._id_to_model[model_id]



[docs]
    def metrics(self, dataset: Literal["train", "test", "both"]) -> pd.DataFrame:
        """Returns a DataFrame containing the metrics for
        all models on the specified data.

        Parameters
        ----------
        dataset : Literal['train', 'test', 'both']
            The dataset for which to return the metrics.

        Returns
        -------
        pd.DataFrame
        """
        if dataset == "train":
            return pd.concat(
                [
                    report.train_report().metrics()
                    for report in self._id_to_report.values()
                ],
                axis=1,
            )
        elif dataset == "test":
            return pd.concat(
                [
                    report.test_report().metrics()
                    for report in self._id_to_report.values()
                ],
                axis=1,
            )
        elif dataset == "both":
            test_metrics = pd.concat(
                [
                    report.test_report().metrics()
                    for report in self._id_to_report.values()
                ],
                axis=1,
            )
            train_metrics = pd.concat(
                [
                    report.train_report().metrics()
                    for report in self._id_to_report.values()
                ],
                axis=1,
            )
            return pd.concat(
                [train_metrics, test_metrics], keys=["train", "test"], names=["Dataset"]
            )
        else:
            raise ValueError('dataset must be either "train", "test", or "both".')



[docs]
    def cv_metrics(self, average_across_folds: bool = True) -> pd.DataFrame | None:
        """Returns a DataFrame containing the cross-validated goodness-of-fit
        statistics for all models on the training data. Cross validation must
        have been conducted, otherwise None is returned.

        Parameters
        ----------
        average_across_folds : bool
            Default: True.
            If True, returns a DataFrame containing goodness-of-fit
            statistics averaged across all folds.
            Otherwise, returns a DataFrame containing goodness-of-fit
            statistics for each fold.

        Returns
        -------
        pd.DataFrame | None
            None if cross validation was not conducted.
        """
        if not self._models[0].is_cross_validated():
            print_wrapped(
                "Cross validation statistics are not available "
                + "for models that are not cross-validated.",
                type="WARNING",
            )
            return None
        return pd.concat(
            [
                report.train_report().cv_metrics(average_across_folds)
                for report in self._id_to_report.values()
            ],
            axis=1,
        )



[docs]
    def fs_report(self) -> VotingSelectionReport | None:
        """Returns the feature selection report. If feature selectors were
        specified at the model level or not at all, then this method will return None.

        To access the feature selection report for a specific model, use
        model_report(<model_id>).feature_selection_report().

        Returns
        -------
        VotingSelectionReport | None
            None if feature selectors were not specified.
        """
        if self._feature_selection_report is None:
            print_wrapped(
                "No feature selection report available.",
                type="WARNING",
            )
        return self._feature_selection_report



[docs]
    def plot_obs_vs_pred(
        self,
        model_id: str,
        dataset: Literal["train", "test"],
        figsize: tuple[float, float] = (5, 5),
        ax: plt.Axes | None = None,
    ) -> plt.Figure:
        """Returns a figure that is a scatter plot of the observed (y-axis) and
        predicted (x-axis) values for the specified model and dataset.

        Parameters
        ----------
        model_id : str
            The id of the model.

        dataset : Literal['train', 'test']
            The dataset for which to plot the observed vs predicted values.

        figsize : tuple[float, float]
            Default: (5, 5). The size of the figure.

        ax : plt.Axes | None
            Default: None. The axes on which to plot the figure. If None,
            a new figure is created.

        Returns
        -------
        plt.Figure
        """
        return self._id_to_report[model_id].plot_obs_vs_pred(dataset, figsize, ax)



[docs]
    def feature_importance(self, model_id: str) -> pd.DataFrame | None:
        """Returns the feature importances of the model with the specified id.
        If the model does not have feature importances, the coefficients are returned
        instead. Otherwise, None is returned.

        Parameters
        ----------
        model_id : str
            The id of the model.

        Returns
        -------
        pd.DataFrame | None
            None is returned if the model does not have feature importances
            or coefficients.
        """
        return self._id_to_report[model_id].feature_importance()


    def __getitem__(self, model_id: str) -> SingleModelMLRegReport:
        return self._id_to_report[model_id]

    def __str__(self) -> str:
        n_dec = print_options._n_decimals
        max_width = print_options._max_line_width

        top_divider = color_text("=" * max_width, "none") + "\n"
        bottom_divider = "\n" + color_text("=" * max_width, "none")
        divider = "\n" + color_text("-" * max_width, "none") + "\n"
        divider_invisible = "\n" + " " * max_width + "\n"

        title_message = bold_text("ML Regression Report")

        target_var = "'" + self._y_var + "'"
        target_message = f"{bold_text('Target variable:')}\n"
        target_message += fill_ignore_format(
            color_text(target_var, "purple"),
            width=max_width,
            initial_indent=2,
            subsequent_indent=2,
        )

        predictors_message = f"{bold_text('Predictor variables:')}\n"
        predictors_message += fill_ignore_format(
            list_to_string(self._predictors),
            width=max_width,
            initial_indent=2,
            subsequent_indent=2,
        )

        models_str = list_to_string(
            [model._name for model in self._models],
            color="blue",
        )
        models_message = f"{bold_text('Models evaluated:')}\n"
        models_message += fill_ignore_format(
            models_str,
            width=max_width,
            initial_indent=2,
            subsequent_indent=2,
        )

        if self._feature_selectors is not None:
            fs_str = list_to_string(
                [fs._name for fs in self._feature_selectors], color="blue"
            )
        else:
            fs_str = color_text("None", "yellow")
        feature_selectors_message = f"{bold_text('Feature selectors:')}\n"
        feature_selectors_message += fill_ignore_format(
            fs_str,
            width=max_width,
            initial_indent=2,
            subsequent_indent=2,
        )

        top_models_message = f"{bold_text('Best models:')}\n"
        top_models_df = (
            self.metrics("test").T.sort_values("rmse", ascending=True).head(3)
        )
        for i, model in enumerate(top_models_df.index):
            top_models_message += fill_ignore_format(
                format_two_column(
                    f"{i+1}. " + quote_and_color(str(model)),
                    "Test RMSE: "
                    + color_text(
                        str(np.round(top_models_df.loc[model, "rmse"], n_dec)), "yellow"
                    ),
                    total_len=max_width - 2,
                ),
                initial_indent=2,
            )
            if i < len(top_models_df) - 1:
                top_models_message += "\n"

        final_message = (
            top_divider
            + title_message
            + divider
            + target_message
            + divider_invisible
            + predictors_message
            + divider_invisible
            + models_message
            + divider_invisible
            + feature_selectors_message
            + divider
            + top_models_message
            + bottom_divider
        )

        return final_message

    def _repr_pretty_(self, p, cycle):
        p.text(str(self))

    def _to_dict(self) -> dict:
        return {
            "train_metrics": self.metrics("train").to_dict("index"),
            "test_metrics": self.metrics("test").to_dict("index"),
            "model_info": [model._to_dict() for model in self._models],
        }