Source code for tablemage._src.linear.reports.ols_report

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from typing import Literal
import warnings
from adjustText import adjust_text
from statsmodels.regression.linear_model import OLSResults, RegressionResultsWrapper
from ...data import DataHandler, DataEmitter
from ...metrics.visualization import plot_obs_vs_pred, decrease_font_sizes_axs
from ..ols import OLSLinearModel
from ...display.print_utils import print_wrapped, suppress_print_output
from .linearreport_utils import MAX_N_OUTLIERS_TEXT, TRAIN_ONLY_MESSAGE
from ..lmutils.plot import (
    plot_residuals_vs_var,
    plot_residuals_vs_fitted,
    plot_residuals_hist,
    plot_scale_location,
    plot_residuals_vs_leverage,
    plot_qq,
)
from ...display.print_options import print_options
from ...display.plot_options import plot_options
from ...display.print_utils import (
    print_wrapped,
    color_text,
    bold_text,
    list_to_string,
    fill_ignore_format,
    format_two_column,
)
from ...stattests import StatisticalTestReport


class _SingleDatasetOLSReport:
    """Diagnostic plots and tables for one fitted linear model on one dataset.

    An instance is bound to either the train or the test split of the model's
    data. Residuals, standardized residuals and residual outliers are computed
    once at construction and recomputed by ``set_outlier_threshold``.
    """

    def __init__(self, model: OLSLinearModel, dataset: Literal["train", "test"]):
        """
        Initializes a _SingleDatasetOLSReport object.

        Parameters
        ----------
        model : OLSLinearModel.
            The model must already be trained.

        dataset : Literal['train', 'test']
            The dataset to generate the report for.

        Raises
        ------
        ValueError
            If ``dataset`` is neither "train" nor "test".
        """
        self.model = model

        # The emitter calls may print; silence them while the data is fetched.
        with suppress_print_output():
            if dataset == "test":
                self.scorer = model.test_scorer
                self._X_eval_df = self.model._dataemitter.emit_test_Xy()[0]
                self._is_train = False
            elif dataset == "train":
                self.scorer = model.train_scorer
                self._X_eval_df = self.model._dataemitter.emit_train_Xy()[0]
                self._is_train = True
            else:
                raise ValueError('specification must be either "train" or "test".')

        self._y_pred = self.scorer._y_pred
        self._y_true = self.scorer._y_true

        self._residuals = self._y_true - self._y_pred
        self._stdresiduals = self._residuals / np.std(self._residuals)
        self._outlier_threshold = 2
        # Sets the outlier mask/indices/count and the _include_text flag.
        self._compute_outliers()

    def plot_obs_vs_pred(
        self,
        show_outliers: bool = True,
        figsize: tuple[float, float] = (5.0, 5.0),
        ax: plt.Axes | None = None,
    ) -> plt.Figure | None:
        """Returns a figure that is a scatter plot of the true and predicted y
        values.

        Parameters
        ----------
        show_outliers : bool
            Default: True.
            If True, then the residual outliers will be shown in red.

        figsize : tuple[float, float]
            Default: (5.0,5.0). Sets the size of the resulting graph.

        ax : plt.Axes
            Default: None. If given, the plot is drawn on this axes and no
            new figure is created.

        Returns
        -------
        plt.Figure | None
            The newly created figure, or None if ``ax`` was provided.
        """
        fig = None
        if ax is None:
            fig, ax = plt.subplots(1, 1, figsize=figsize)

        plot_obs_vs_pred(self._y_pred, self._y_true, self.model._name, figsize, ax)

        if show_outliers and self._n_outliers > 0:
            # Select the outlier points once instead of on every loop pass.
            pred_outliers = self._y_pred[self._outliers_residual_mask]
            true_outliers = self._y_true[self._outliers_residual_mask]
            ax.scatter(
                pred_outliers,
                true_outliers,
                s=plot_options._dot_size,
                color="red",
            )
            # _include_text is True only when the outlier count does not
            # exceed MAX_N_OUTLIERS_TEXT (see _compute_outliers).
            if self._include_text:
                annotations = [
                    ax.annotate(
                        label,
                        (pred_outliers[i], true_outliers[i]),
                        color="red",
                        fontsize=plot_options._axis_minor_ticklabel_font_size,
                    )
                    for i, label in enumerate(self._outliers_df_idx)
                ]
                adjust_text(annotations, ax=ax)

        if fig is not None:
            fig.tight_layout()
            # Close the figure created here, not whichever one is current.
            plt.close(fig)
        return fig

    def plot_residuals_vs_fitted(
        self,
        standardized: bool = False,
        show_outliers: bool = True,
        figsize: tuple[float, float] = (5.0, 5.0),
        ax: plt.Axes | None = None,
    ) -> plt.Figure:
        """Returns a figure that is a residuals vs fitted (y_pred) plot.

        Parameters
        ----------
        standardized : bool
            Default: False. If True, plots the standardized residuals as
            opposed to the raw residuals.

        show_outliers : bool
            Default: True. If True, colors the outliers determined by the
            standardized residuals in red.

        figsize : tuple[float, float]
            Default: (5.0, 5.0). Determines the size of the returned figure.

        ax : plt.Axes
            Default: None.

        Returns
        -------
        plt.Figure
        """
        return plot_residuals_vs_fitted(
            y_pred=self._y_pred,
            residuals=self._residuals,
            outliers_idx=self._outliers_df_idx,
            outliers_mask=self._outliers_residual_mask,
            show_outliers=show_outliers,
            standardized=standardized,
            include_text=self._include_text,
            figsize=figsize,
            ax=ax,
        )

    def plot_residuals_vs_var(
        self,
        predictor: str,
        standardized: bool = False,
        show_outliers: bool = False,
        figsize: tuple[float, float] = (5.0, 5.0),
        ax: plt.Axes | None = None,
    ) -> plt.Figure:
        """Returns a figure that is a plot of the residuals versus the values
        of a single predictor variable.

        Parameters
        ----------
        predictor : str
            The predictor variable whose values should be plotted on the x-axis.

        standardized : bool
            Default: False. If True, standardizes the residuals.

        show_outliers : bool
            Default: False. If True, plots the outliers in red.

        figsize : tuple[float, float]
            Default: (5.0, 5.0). Determines the size of the returned figure.

        ax : plt.Axes
            Default: None.

        Returns
        -------
        plt.Figure
        """
        return plot_residuals_vs_var(
            predictor=predictor,
            X_eval_df=self._X_eval_df,
            residuals=self._residuals,
            outliers_idx=self._outliers_df_idx,
            outliers_mask=self._outliers_residual_mask,
            show_outliers=show_outliers,
            standardized=standardized,
            include_text=self._include_text,
            figsize=figsize,
            ax=ax,
        )

    def plot_residuals_hist(
        self,
        standardized: bool = False,
        density: bool = False,
        figsize: tuple[float, float] = (5.0, 5.0),
        ax: plt.Axes | None = None,
    ) -> plt.Figure:
        """Returns a figure that is a histogram of the residuals.

        Parameters
        ----------
        standardized : bool
            Default: False. If True, standardizes the residuals.

        density : bool
            Default: False. If True, plots density rather than frequency.

        figsize : tuple[float, float]
            Default: (5.0, 5.0). Determines the size of the returned figure.

        ax : plt.Axes
            Default: None.

        Returns
        -------
        plt.Figure
        """
        return plot_residuals_hist(
            residuals=self._residuals,
            standardized=standardized,
            density=density,
            figsize=figsize,
            ax=ax,
        )

    def plot_scale_location(
        self,
        show_outliers: bool = True,
        figsize: tuple[float, float] = (5.0, 5.0),
        ax: plt.Axes | None = None,
    ) -> plt.Figure:
        """Returns a figure that is a scale-location plot, built from the
        standardized residuals and the fitted values.

        Parameters
        ----------
        show_outliers : bool
            Default: True. If True, plots the outliers in red.

        figsize : tuple[float, float]
            Default: (5.0, 5.0).

        ax : plt.Axes
            Default: None.

        Returns
        -------
        plt.Figure
        """
        return plot_scale_location(
            y_pred=self._y_pred,
            # Reuse the standardized residuals cached in __init__.
            std_residuals=self._stdresiduals,
            show_outliers=show_outliers,
            outliers_idx=self._outliers_df_idx,
            outliers_mask=self._outliers_residual_mask,
            include_text=self._include_text,
            figsize=figsize,
            ax=ax,
        )

    def plot_residuals_vs_leverage(
        self,
        standardized: bool = True,
        show_outliers: bool = True,
        figsize: tuple[float, float] = (5.0, 5.0),
        ax: plt.Axes | None = None,
    ) -> plt.Figure | None:
        """Returns a figure that is a plot of the residuals versus leverage.

        Only available for the train dataset: for a test report a warning is
        printed and None is returned.

        Parameters
        ----------
        standardized : bool
            Default: True. If True, standardizes the residuals.

        show_outliers : bool
            Default: True. If True, plots the outliers in red.

        figsize : tuple[float, float]
            Default: (5.0, 5.0).

        ax : plt.Axes
            Default: None.

        Returns
        -------
        plt.Figure | None

        Raises
        ------
        ValueError
            If the model's estimator is not a statsmodels
            RegressionResultsWrapper (e.g., a regularized fit).
        """
        if not self._is_train:
            print_wrapped(TRAIN_ONLY_MESSAGE, type="WARNING")
            return None

        if not isinstance(self.model.estimator, RegressionResultsWrapper):
            raise ValueError(
                "Leverage/influence statistics are not available for regularized models. "
                f"The statsmodels output type is {type(self.model.estimator)}."
            )
        leverage = self.model.estimator._results.get_influence().hat_matrix_diag

        return plot_residuals_vs_leverage(
            leverage=leverage,
            residuals=self._residuals,
            standardized=standardized,
            show_outliers=show_outliers,
            outliers_idx=self._outliers_df_idx,
            outliers_mask=self._outliers_residual_mask,
            include_text=self._include_text,
            figsize=figsize,
            ax=ax,
        )

    def plot_qq(
        self,
        standardized: bool = True,
        show_outliers: bool = False,
        figsize: tuple[float, float] = (5.0, 5.0),
        ax: plt.Axes | None = None,
    ) -> plt.Figure:
        """Returns a quantile-quantile plot.

        Parameters
        ----------
        standardized : bool
            Default: True. If True, standardizes the residuals.

        show_outliers : bool
            Default: False. If True, plots the outliers in red.

        figsize : tuple[float, float]
            Default: (5.0, 5.0).

        ax : plt.Axes
            Default: None.

        Returns
        -------
        plt.Figure
        """
        return plot_qq(
            df_idx=self._X_eval_df.index,
            residuals=self._residuals,
            standardized=standardized,
            outliers_idx=self._outliers_df_idx,
            outliers_mask=self._outliers_residual_mask,
            show_outliers=show_outliers,
            include_text=self._include_text,
            figsize=figsize,
            ax=ax,
        )

    def plot_diagnostics(
        self, show_outliers: bool = False, figsize: tuple[float, float] = (7.0, 7.0)
    ) -> plt.Figure:
        """Plots several useful linear regression diagnostic plots.

        The 2x2 grid contains observed-vs-predicted, residuals-vs-fitted, a
        Q-Q plot, and either residuals-vs-leverage (train dataset with an
        unregularized statsmodels fit) or scale-location (otherwise).

        Parameters
        ----------
        show_outliers : bool
            Default: False. If True, plots the residual outliers in red.

        figsize : tuple[float, float]
            Default: (7.0, 7.0).

        Returns
        -------
        plt.Figure
        """
        fig, axs = plt.subplots(2, 2, figsize=figsize)
        self.plot_obs_vs_pred(show_outliers=show_outliers, ax=axs[0][0])
        self.plot_residuals_vs_fitted(show_outliers=show_outliers, ax=axs[0][1])
        # Use the same type check as plot_residuals_vs_leverage: statsmodels
        # fit() returns a RegressionResultsWrapper, which is not an instance
        # of OLSResults, so testing for OLSResults never selected this panel.
        if self._is_train and isinstance(
            self.model.estimator, RegressionResultsWrapper
        ):
            self.plot_residuals_vs_leverage(show_outliers=show_outliers, ax=axs[1][0])
        else:
            self.plot_scale_location(show_outliers=show_outliers, ax=axs[1][0])
        self.plot_qq(show_outliers=show_outliers, ax=axs[1][1])
        fig.subplots_adjust(hspace=0.3, wspace=0.3)
        decrease_font_sizes_axs(axs, 2, 2, 0)
        plt.close(fig)
        return fig

    def set_outlier_threshold(self, threshold: float) -> "_SingleDatasetOLSReport":
        """Standardized residuals threshold for outlier identification.
        Recomputes the outliers.

        Parameters
        ----------
        threshold : float
            Must be a nonnegative value. The threshold is 2 until changed.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``threshold`` is negative.
        """
        if threshold < 0:
            raise ValueError(
                f"Input threshold must be nonnegative. Received {threshold}."
            )
        self._outlier_threshold = threshold
        self._compute_outliers()
        return self

    def get_outlier_indices(self) -> list:
        """Returns the indices corresponding to DataFrame examples associated
        with standardized residual outliers.

        Returns
        -------
        outliers_df_idx : list ~ (n_outliers)
        """
        return self._outliers_df_idx.tolist()

    def metrics(self) -> pd.DataFrame:
        """Returns a DataFrame containing the goodness-of-fit statistics
        for the model, rounded to the configured number of decimals.

        Returns
        -------
        pd.DataFrame
        """
        return self.scorer.stats_df().astype(float).round(print_options._n_decimals)

    def _compute_outliers(self):
        """Computes the outliers.

        An observation is an outlier when its standardized residual is at
        least ``_outlier_threshold`` in absolute value. Updates the outlier
        mask, the DataFrame index labels of the outliers, their count, and
        whether there are few enough of them to label on plots.
        """
        self._outliers_residual_mask = (
            self._stdresiduals >= self._outlier_threshold
        ) | (self._stdresiduals <= -self._outlier_threshold)
        self._outliers_df_idx = self._X_eval_df.iloc[
            self._outliers_residual_mask
        ].index.to_numpy()
        self._n_outliers = len(self._outliers_df_idx)
        # Text labels are only drawn when they would not clutter the plot.
        self._include_text = self._n_outliers <= MAX_N_OUTLIERS_TEXT


[docs] class OLSReport: """OLSReport. Fits the model based on provided DataHandler. Contains methods for generating regression-relevant diagnostic plots and tables for a single linear regression model. """ def __init__( self, model: OLSLinearModel, datahandler: DataHandler, target: str, predictors: list[str], dataemitter: DataEmitter | None = None, ): """OLSReport. Fits the model based on provided DataHandler. Contains methods for generating regression-relevant diagnostic plots and tables for a single linear regression model. Parameters ---------- model : OLSModel datahandler : DataHandler The DataHandler object that contains the data. target : str The name of the target variable. predictors : list[str] The names of the predictor variables. dataemitter : DataEmitter Default: None. The DataEmitter object that emits the data. Optionally you can initialize the report with a DataEmitter object instead of a DataHandler object. If not None, will ignore the values of target and predictors. """ self._model = model self._datahandler = datahandler if dataemitter is not None: self._dataemitter = dataemitter else: self._dataemitter = self._datahandler.train_test_emitter(target, predictors) self._model.specify_data(self._dataemitter) self._model.fit() self._target = target self._predictors = predictors self._train_report = _SingleDatasetOLSReport(model, "train") self._test_report = _SingleDatasetOLSReport(model, "test") def train_report(self) -> _SingleDatasetOLSReport: """Returns an SingleDatasetLinRegReport object for the train dataset Returns ------- SingleDatasetLinRegReport """ return self._train_report def test_report(self) -> _SingleDatasetOLSReport: """Returns an SingleDatasetLinRegReport object for the test dataset Returns ------- SingleDatasetLinRegReport """ return self._test_report
[docs] def model(self) -> OLSLinearModel: """Returns the fitted OLSLinearModel object. Returns ------- OLSLinearModel """ return self._model
[docs] def metrics(self, dataset: Literal["train", "test", "both"]) -> pd.DataFrame: """Returns a DataFrame containing the goodness-of-fit statistics for the model. Parameters ---------- dataset : Literal['train', 'test', 'both'] The dataset to compute the metrics for. Returns ------- pd.DataFrame """ if dataset == "train": return self._train_report.metrics() elif dataset == "test": return self._test_report.metrics() elif dataset == "both": test_metrics = self._test_report.metrics() # one column w/ model name train_metrics = self._train_report.metrics() # one column w/ model name # stack the two DataFrames on top of each other # add an outermost index level to differentiate between the two datasets return pd.concat( [train_metrics, test_metrics], keys=["train", "test"], names=["Dataset"] ) else: raise ValueError('dataset must be either "train", "test", or "both".')
[docs] def step( self, direction: Literal["both", "backward", "forward"] = "backward", criteria: Literal["aic", "bic"] = "aic", kept_vars: list[str] | None = None, all_vars: list[str] | None = None, start_vars: list[str] | None = None, max_steps: int = 100, ) -> "OLSReport": """Performs stepwise selection. Returns a new OLSReport object with the reduced model. Parameters ---------- direction : Literal["both", "backward", "forward"] Default: 'backward'. The direction of the stepwise selection. criteria : Literal["aic", "bic"] Default: 'aic'. The criteria to use for selecting the best model. kept_vars : list[str] Default: None. The variables that should be kept in the model. If None, defaults to an empty list. all_vars : list[str] Default: None. The variables that are candidates for inclusion in the model. If None, defaults to all variables in the training data. start_vars : list[str] Default: None. The variables to start the bidirectional stepwise selection with. Ignored if direction is not 'both'. If direction is 'both' and start_vars is None, then the starting variables are the kept_vars. max_steps : int Default: 100. The maximum number of steps to take. 
Returns ------- OLSReport """ if direction == "backward": method_name = "Backward selection" elif direction == "both": method_name = "Alternating selection" elif direction == "forward": method_name = "Forward selection" else: raise ValueError(f"Invalid argument: {direction}.") selected_vars = self._model.step( direction=direction, criteria=criteria, kept_vars=kept_vars, all_vars=all_vars, start_vars=start_vars, max_steps=max_steps, ) if all_vars is None: all_vars = self._model._dataemitter.X_vars() vars_removed = list(set(all_vars) - set(selected_vars)) if len(vars_removed) == 0: print_wrapped( f"{method_name} removed 0 predictors.", level="INFO", type="NOTE" ) return self elif len(vars_removed) == 1: print_wrapped( text=f"{method_name} removed {len(vars_removed)} predictor: " + list_to_string(vars_removed) + ".", level="INFO", type="UPDATE", ) else: print_wrapped( text=f"{method_name} removed {len(vars_removed)} predictors: " + list_to_string(vars_removed) + ".", level="INFO", type="UPDATE", ) new_emitter = self._dataemitter.copy() new_emitter.select_predictors_pre_onehot(selected_vars) return OLSReport( OLSLinearModel( alpha=self._model.alpha, l1_weight=self._model.l1_weight, name=self._model._name + f" (reduced, direction={direction})", ), self._datahandler, # only used for y var scaler self._target, # ignored selected_vars, # ignored new_emitter, )
[docs] def test_lr(self, alternative_report: "OLSReport") -> StatisticalTestReport: """Performs a likelihood ratio test to compare an alternative OLSLinearModel. Returns an object of class StatisticalTestReport describing the results. Parameters ---------- alternative_report : OLSReport The report of an alternative OLSLinearModel. The alternative model must be a nested version of the current model or vice-versa. Returns ------- StatisticalTestReport """ if not isinstance(self._model.estimator, RegressionResultsWrapper): raise ValueError( "Partial F-tests are not available for regularized models. " f"The model type is {type(self._model.estimator)}." ) # Determine which report is the reduced model # Get the models from each report original_model = self._train_report.model.estimator alternative_model = alternative_report.train_report().model.estimator # Get the number of predictors for each model num_predictors_orig = len(self._train_report._X_eval_df.columns) num_predictors_alternative = len( alternative_report.train_report()._X_eval_df.columns ) if num_predictors_orig > num_predictors_alternative: full_model = original_model reduced_model = alternative_model elif num_predictors_orig < num_predictors_alternative: full_model = alternative_model reduced_model = original_model else: # Raise an error if the number of predictors are the same raise ValueError("One model must be a reduced version of the other") # Raise ValueError if one set of predictors is not a subset of the other orig_var_set = set(self._train_report._X_eval_df.columns) alt_var_set = set(alternative_report.train_report()._X_eval_df.columns) if not (orig_var_set < alt_var_set or orig_var_set > alt_var_set): raise ValueError("One model must be a reduced version of the other") # Extract the results of the test and temporarily suppress warnings with warnings.catch_warnings(): warnings.simplefilter("ignore") lr_stat, p_value, dr_diff = full_model.compare_lr_test(reduced_model) # Initialize and return an 
object of class StatisticalTestReport lr_result = StatisticalTestReport( description="Likelihood Ratio Test", statistic=lr_stat, pval=p_value, degfree=dr_diff, statistic_description="Chi-square", null_hypothesis_description="The full model does not fit the " "data significantly better than the reduced model", alternative_hypothesis_description="The full model fits the " "data signficantly better than the reduced model", assumptions_description="The data must be homoscedastic and " "uncorrelated", ) return lr_result
[docs] def test_partialf(self, alternative_report: "OLSReport") -> StatisticalTestReport: """Performs a partial F-test to compare an alternative OLSLinearModel. Returns an object of class StatisticalTestReport describing the results. Parameters ---------- alternative_report : OLSReport The report of an alternative OLSLinearModel. The alternative model must be a nested version of the current model or vice-versa. Returns ------- StatisticalTestReport """ if not isinstance(self._model.estimator, RegressionResultsWrapper): raise ValueError( "Partial F-tests are not available for regularized models." ) # Determine which report is the reduced model # Get the models from each report original_model = self._train_report.model.estimator alternative_model = alternative_report.train_report().model.estimator # Get the number of predictors for each model num_predictors_orig = len(self._train_report._X_eval_df.columns) num_predictors_alternative = len( alternative_report.train_report()._X_eval_df.columns ) if num_predictors_orig > num_predictors_alternative: full_model = original_model reduced_model = alternative_model elif num_predictors_orig < num_predictors_alternative: full_model = alternative_model reduced_model = original_model else: # Raise an error if the number of predictors are the same raise ValueError("One model must be a reduced version of the other") # Raise ValueError if one set of predictors is not a subset of the other orig_var_set = set(self._train_report._X_eval_df.columns) alt_var_set = set(alternative_report.train_report()._X_eval_df.columns) if not (orig_var_set < alt_var_set or orig_var_set > alt_var_set): raise ValueError("One model must be a reduced version of the other") # Extract the results of the test and suppress warnings temporarily with warnings.catch_warnings(): warnings.simplefilter("ignore") f_value, p_value, dr_diff = full_model.compare_f_test(reduced_model) # Initialize and return an object of class StatisticalTestReport partial_f_result = 
StatisticalTestReport( description="Partial F-Test", statistic=f_value, pval=p_value, degfree=dr_diff, statistic_description="F-statistic", null_hypothesis_description="The coefficients of the additional " "predictors are all zero", alternative_hypothesis_description="At least one of the " "coefficients of the additional predictors is not zero", assumptions_description="The data must be homoscedastic and " "have no autocorrelation", ) return partial_f_result
[docs] def statsmodels_summary(self): """Returns the summary of the statsmodels RegressionResultsWrapper for OLS. """ try: return self._model.estimator.summary() except Exception as e: raise RuntimeError( "Error occured in statsmodels_summary call. " f"Error: {e}" )
[docs] def plot_obs_vs_pred( self, dataset: Literal["train", "test"], show_outliers: bool = True, figsize: tuple[float, float] = (5.0, 5.0), ax: plt.Axes | None = None, ) -> plt.Figure: """Plots a scatter plot of the true and predicted y values. Parameters ---------- dataset : Literal['train', 'test'] The dataset to generate the plot for. show_outliers : bool Default: True. If True, then the outliers calculated using standard errors will be shown in red. figsize : tuple[float, float] Default: (5.0,5.0). Sets the size of the resulting graph. ax : plt.Axes Default: None. Returns ------- - Figure """ if dataset == "train": return self._train_report.plot_obs_vs_pred( show_outliers=show_outliers, figsize=figsize, ax=ax ) else: return self._test_report.plot_obs_vs_pred( show_outliers=show_outliers, figsize=figsize, ax=ax )
[docs] def plot_residuals_vs_fitted( self, dataset: Literal["train", "test"], standardized: bool = False, show_outliers: bool = True, figsize: tuple[float, float] = (5.0, 5.0), ax: plt.Axes | None = None, ) -> plt.Figure: """Plots the residuals versus the fitted values. Parameters ---------- dataset : Literal['train', 'test'] The dataset to generate the plot for. standardized : bool Default: False. If True, plots the standardized residuals as opposed to the raw residuals. show_outliers : bool Default: True. If True, colors the outliers determined by the standardized residuals in red. figsize : tuple[float, float] Default: (5.0, 5.0). Determines the size of the returned figure. ax : plt.Axes Default: None. Returns ------- plt.Figure """ if dataset == "train": return self._train_report.plot_residuals_vs_fitted( standardized=standardized, show_outliers=show_outliers, figsize=figsize, ax=ax, ) elif dataset == "test": return self._test_report.plot_residuals_vs_fitted( standardized=standardized, show_outliers=show_outliers, figsize=figsize, ax=ax, ) else: raise ValueError('The dataset must be either "train" or "test".')
[docs] def plot_residuals_vs_var( self, predictor: str, dataset: Literal["train", "test"], standardized: bool = False, show_outliers: bool = False, figsize: tuple[float, float] = (5.0, 5.0), ax: plt.Axes | None = None, ) -> plt.Figure: """Returns a figure that is a residuals vs fitted (y_pred) plot. Parameters ---------- predictor : str The predictor variable whose values should be plotted on the x-axis. dataset : Literal['train', 'test'] The dataset to generate the plot for. standardized : bool Default: False. If True, standardizes the residuals. show_outliers : bool Default: False. If True, plots the outliers in red. figsize : tuple[float, float] Default: (5.0, 5.0). Determines the size of the returned figure. ax : plt.Axes Default: None. Returns ------- plt.Figure """ if dataset == "train": return self._train_report.plot_residuals_vs_var( predictor=predictor, standardized=standardized, show_outliers=show_outliers, figsize=figsize, ax=ax, ) elif dataset == "test": return self._test_report.plot_residuals_vs_var( predictor=predictor, standardized=standardized, show_outliers=show_outliers, figsize=figsize, ax=ax, ) else: raise ValueError('The dataset must be either "train" or "test".')
[docs] def plot_residuals_hist( self, dataset: Literal["train", "test"], standardized: bool = False, density: bool = False, figsize: tuple[float, float] = (5.0, 5.0), ax: plt.Axes | None = None, ) -> plt.Figure: """Returns a figure that is a histogram of the residuals. Parameters ---------- dataset : Literal['train', 'test'] The dataset to generate the plot for. standardized : bool Default: False. If True, standardizes the residuals. density : bool Default: False. If True, plots density rather than frequency. figsize : tuple[float, float] Default: (5.0, 5.0). Determines the size of the returned figure. ax : plt.Axes Default: None. Returns ------- plt.Figure """ if dataset == "train": return self._train_report.plot_residuals_hist( standardized=standardized, density=density, figsize=figsize, ax=ax ) elif dataset == "test": return self._test_report.plot_residuals_hist( standardized=standardized, density=density, figsize=figsize, ax=ax ) else: raise ValueError('The dataset must be either "train" or "test".')
[docs] def plot_scale_location( self, dataset: Literal["train", "test"], show_outliers: bool = True, figsize: tuple[float, float] = (5.0, 5.0), ax: plt.Axes | None = None, ) -> plt.Figure: """Returns a figure that is a plot of the sqrt of the residuals versus the fitted. Parameters ---------- dataset : Literal['train', 'test'] The dataset to generate the plot for. show_outliers : bool Default: True. If True, plots the outliers in red. figsize : tuple[float, float] Default: (5.0, 5.0). ax : plt.Axes Default: None. Returns ------- plt.Figure """ if dataset == "train": return self._train_report.plot_scale_location( show_outliers=show_outliers, figsize=figsize, ax=ax ) elif dataset == "test": return self._test_report.plot_scale_location( show_outliers=show_outliers, figsize=figsize, ax=ax ) else: raise ValueError('The dataset must be either "train" or "test".')
[docs] def plot_residuals_vs_leverage( self, dataset: Literal["train", "test"], standardized: bool = True, show_outliers: bool = True, figsize: tuple[float, float] = (5.0, 5.0), ax: plt.Axes | None = None, ) -> plt.Figure: """Plots the residuals versus leverage. Parameters ---------- dataset : Literal['train', 'test'] Default: 'test'. standardized : bool Default: True. If True, standardizes the residuals. show_outliers : bool Default: True. If True, plots the outliers in red. figsize : tuple[float, float] Default: (5.0, 5.0). ax : plt.Axes Default: None. Returns ------- plt.Figure """ if dataset == "train": return self._train_report.plot_residuals_vs_leverage( standardized=standardized, show_outliers=show_outliers, figsize=figsize, ax=ax, ) elif dataset == "test": return self._test_report.plot_residuals_vs_leverage( standardized=standardized, show_outliers=show_outliers, figsize=figsize, ax=ax, ) else: raise ValueError('The dataset must be either "train" or "test".')
[docs] def plot_qq( self, dataset: Literal["train", "test"], standardized: bool = True, show_outliers: bool = False, figsize: tuple[float, float] = (5.0, 5.0), ax: plt.Axes | None = None, ) -> plt.Figure: """Plots a quantile-quantile plot of the residuals. Parameters ---------- dataset : Literal['train', 'test'] The dataset to generate the plot for. standardized : bool Default: True. If True, standardizes the residuals. show_outliers : bool Default: False. If True, plots the outliers in red. figsize : tuple[float, float] Default: (5.0, 5.0). ax : plt.Axes Default: None. Returns ------- plt.Figure """ if dataset == "train": return self._train_report.plot_qq( standardized=standardized, show_outliers=show_outliers, figsize=figsize, ax=ax, ) elif dataset == "test": return self._test_report.plot_qq( standardized=standardized, show_outliers=show_outliers, figsize=figsize, ax=ax, ) else: raise ValueError('The dataset must be either "train" or "test".')
[docs] def plot_diagnostics( self, dataset: Literal["train", "test"], show_outliers: bool = False, figsize: tuple[float, float] = (7.0, 7.0), ) -> plt.Figure: """Plots several useful linear regression diagnostic plots. Parameters ---------- dataset : Literal['train', 'test'] The dataset to generate the plot for. show_outliers : bool Default: False. If True, plots the residual outliers in red. figsize : tuple[float, float] Default: (7.0, 7.0). Returns ------- plt.Figure """ if dataset == "train": return self._train_report.plot_diagnostics( show_outliers=show_outliers, figsize=figsize ) elif dataset == "test": return self._test_report.plot_diagnostics( show_outliers=show_outliers, figsize=figsize ) else: raise ValueError('The dataset must be either "train" or "test".')
[docs] def set_outlier_threshold(self, threshold: float) -> "OLSReport": """Standardized residuals threshold for outlier identification. Recomputes the outliers. Parameters ---------- threshold : float Default: 2. Must be a nonnegative value. Returns ------- OLSReport Returns self for method chaining. """ self._train_report.set_outlier_threshold(threshold=threshold) self._test_report.set_outlier_threshold(threshold=threshold) return self
[docs] def get_outlier_indices(self, dataset: Literal["train", "test"] = "test") -> list: """Returns the indices corresponding to DataFrame examples associated with standardized residual outliers. Parameters ---------- dataset : Literal['train', 'test'] Default: 'test'. Returns ------- outliers_df_idx : list ~ (n_outliers) """ if dataset == "train": return self._train_report.get_outlier_indices() else: return self._test_report.get_outlier_indices()
[docs] def coefs( self, format: Literal[ "coef(se)|pval", "coef|se|pval", "coef(ci)|pval", "coef|ci_low|ci_high|pval" ] = "coef(se)|pval", ) -> pd.DataFrame: """Returns the coefficients of the model. Parameters ---------- format : Literal["coef(se)|pval", "coef|se|pval", "coef(ci)|pval", "coef|ci_low|ci_high|pval"] Default: 'coef(se)|pval'. Returns ------- pd.DataFrame """ return self._model.coefs(format)
def _compute_outliers(self, dataset: Literal["train", "test"] = "test"):
    """Computes the outliers for one dataset.

    The computation is delegated to the single-dataset report matching
    ``dataset`` and its return value is passed back unchanged.

    Parameters
    ----------
    dataset : Literal['train', 'test']
        Default: 'test'.

    Raises
    ------
    ValueError
        If ``dataset`` is neither "train" nor "test". Unknown values used
        to fall through silently to the test report.
    """
    if dataset == "train":
        return self._train_report._compute_outliers()
    if dataset == "test":
        return self._test_report._compute_outliers()
    # Same message as plot_diagnostics so invalid input fails uniformly.
    raise ValueError('The dataset must be either "train" or "test".')

def _to_dict(self) -> dict:
    """Returns the JSON serializable data stored in the report as a
    dictionary.

    Returns
    -------
    dict
        Keys "coefficients", "train_metrics" and "test_metrics", each
        holding the corresponding DataFrame converted with
        ``to_dict("index")``.
    """
    return {
        "coefficients": self.coefs("coef(se)|pval").to_dict("index"),
        "train_metrics": self.metrics("train").to_dict("index"),
        "test_metrics": self.metrics("test").to_dict("index"),
    }

def __str__(self) -> str:
    """Builds the text summary of the report.

    The sections, in order, are: a title chosen from the model's ``alpha``
    and ``l1_weight``, the target variable, the predictor variables, the
    train/test R2, adjusted R2 and RMSE side by side, and the coefficient
    table.

    Returns
    -------
    str
    """
    max_width = print_options._max_line_width
    n_dec = print_options._n_decimals

    top_divider = color_text("=" * max_width, "none") + "\n"
    bottom_divider = "\n" + color_text("=" * max_width, "none")
    divider = "\n" + color_text("-" * max_width, "none") + "\n"
    divider_invisible = "\n" + " " * max_width + "\n"

    # Title: alpha == 0 means no penalty; otherwise l1_weight picks the name.
    if self._model.alpha == 0:
        title_message = bold_text("Ordinary Least Squares Regression Report")
    elif self._model.l1_weight == 0:
        title_message = bold_text(
            f"Ridge Regression Report (alpha={self._model.alpha})"
        )
    elif self._model.l1_weight == 1:
        title_message = bold_text(
            f"Lasso Regression Report (alpha={self._model.alpha})"
        )
    else:
        title_message = bold_text(
            f"Elastic Net Regression Report (alpha={self._model.alpha}, "
            f"l1_ratio={self._model.l1_weight})"
        )

    target_var = "'" + self._target + "'"
    target_message = f"{bold_text('Target variable:')}\n"
    target_message += fill_ignore_format(
        color_text(target_var, "purple"),
        width=max_width,
        initial_indent=2,
        subsequent_indent=2,
    )

    predictors_message = (
        f"{bold_text(f'Predictor variables ({len(self._predictors)}):')}\n"
    )
    predictors_message += fill_ignore_format(
        list_to_string(self._predictors),
        width=max_width,
        initial_indent=2,
        subsequent_indent=2,
    )

    metrics_header = f"{bold_text('Metrics:')}\n"
    metrics_header += fill_ignore_format(
        format_two_column(
            bold_text(f"Train ({self._model._n_train})"),
            bold_text(f"Test ({self._model._n_test})"),
            total_len=max_width - 2,
        ),
        initial_indent=2,
    )

    # Fetch each metrics table once instead of once per displayed value.
    mstr = str(self._model)
    train_metrics = self.metrics("train")
    test_metrics = self.metrics("test")

    def _metric_cell(label: str, table, key: str) -> str:
        # One "<label><rounded value>" cell with the value colored yellow.
        return label + color_text(
            str(np.round(table.at[key, mstr], n_dec)), "yellow"
        )

    metric_rows = [
        fill_ignore_format(
            format_two_column(
                _metric_cell(label, train_metrics, key),
                _metric_cell(label, test_metrics, key),
                total_len=max_width - 2,
            ),
            initial_indent=4,
        )
        for label, key in (
            ("R2: ", "r2"),
            ("Adj. R2: ", "adjr2"),
            ("RMSE: ", "rmse"),
        )
    ]
    metrics_message = "\n".join([metrics_header, *metric_rows])

    # Column widths are proportional to col_ratios; only the last three
    # widths are handed to to_string.
    col_ratios = [3, 4, 3, 3]
    width_unit = max_width // sum(col_ratios)
    col_space = [width_unit * ratio for ratio in col_ratios]

    # rename_axis returns a new frame, so the object handed back by
    # coefs() is not modified in place.
    coefs_df = self.coefs("coef|se|pval").rename_axis("Predictor")
    coefs_table = fill_ignore_format(
        coefs_df.to_string(col_space=col_space[1:]),
        width=max_width,
        initial_indent=2,
        subsequent_indent=2,
    )
    # bold the first two lines of the coefficient table
    coefs_table = "\n".join(
        bold_text(line) if i < 2 else line
        for i, line in enumerate(coefs_table.split("\n"))
    )
    coefs_message = bold_text("Coefficients:") + "\n" + coefs_table

    return "".join(
        [
            top_divider,
            title_message,
            divider,
            target_message,
            divider_invisible,
            predictors_message,
            divider,
            metrics_message,
            divider,
            coefs_message,
            bottom_divider,
        ]
    )

def _repr_pretty_(self, p, cycle):
    """IPython pretty-printer hook: writes ``str(self)`` to the printer.

    Parameters
    ----------
    p
        Printer object; only its ``text`` method is used.

    cycle
        Unused.
    """
    p.text(str(self))