import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from typing import Literal
import warnings
from adjustText import adjust_text
from statsmodels.regression.linear_model import OLSResults, RegressionResultsWrapper
from ...data import DataHandler, DataEmitter
from ...metrics.visualization import plot_obs_vs_pred, decrease_font_sizes_axs
from ..ols import OLSLinearModel
from ...display.print_utils import print_wrapped, suppress_print_output
from .linearreport_utils import MAX_N_OUTLIERS_TEXT, TRAIN_ONLY_MESSAGE
from ..lmutils.plot import (
plot_residuals_vs_var,
plot_residuals_vs_fitted,
plot_residuals_hist,
plot_scale_location,
plot_residuals_vs_leverage,
plot_qq,
)
from ...display.print_options import print_options
from ...display.plot_options import plot_options
from ...display.print_utils import (
print_wrapped,
color_text,
bold_text,
list_to_string,
fill_ignore_format,
format_two_column,
)
from ...stattests import StatisticalTestReport
class _SingleDatasetOLSReport:
    """Class for generating regression-relevant diagnostic
    plots and tables for a single linear regression model.

    One instance covers exactly one dataset (train or test) of an already
    fitted OLSLinearModel. Outliers are the observations whose standardized
    residual magnitude is at least the outlier threshold (2 by default).
    """

    def __init__(self, model: OLSLinearModel, dataset: Literal["train", "test"]):
        """
        Initializes a _SingleDatasetOLSReport object.

        Parameters
        ----------
        model : OLSLinearModel.
            The model must already be trained.
        dataset : Literal['train', 'test']
            The dataset to generate the report for.

        Raises
        ------
        ValueError
            If ``dataset`` is neither "train" nor "test".
        """
        self.model = model

        with suppress_print_output():
            if dataset == "test":
                self.scorer = model.test_scorer
                self._X_eval_df = self.model._dataemitter.emit_test_Xy()[0]
                self._is_train = False
            elif dataset == "train":
                self.scorer = model.train_scorer
                self._X_eval_df = self.model._dataemitter.emit_train_Xy()[0]
                self._is_train = True
            else:
                raise ValueError('specification must be either "train" or "test".')

        self._y_pred = self.scorer._y_pred
        self._y_true = self.scorer._y_true
        self._residuals = self._y_true - self._y_pred

        # A perfect fit has zero residual spread; dividing by it would fill
        # the standardized residuals with NaN and emit a runtime warning.
        # In that case no observation is an outlier, so use zeros instead.
        spread = np.std(self._residuals)
        if spread > 0:
            self._stdresiduals = self._residuals / spread
        else:
            self._stdresiduals = np.zeros_like(self._residuals, dtype=float)

        self._outlier_threshold = 2
        # Sets the outlier mask, indices, count and the _include_text flag.
        self._compute_outliers()

    def plot_obs_vs_pred(
        self,
        show_outliers: bool = True,
        figsize: tuple[float, float] = (5.0, 5.0),
        ax: plt.Axes | None = None,
    ) -> plt.Figure:
        """Returns a figure that is a scatter plot of the true and predicted y
        values.

        Parameters
        ----------
        show_outliers : bool
            Default: True.
            If True, then the outliers calculated using standard errors will be
            shown in red.
        figsize : tuple[float, float]
            Default: (5.0,5.0). Sets the size of the resulting graph.
        ax : plt.Axes
            Default: None. If given, the plot is drawn on this axes and no
            new figure is created.

        Returns
        -------
        plt.Figure
            The newly created figure, or None when ``ax`` was supplied.
        """
        fig = None
        if ax is None:
            fig, ax = plt.subplots(1, 1, figsize=figsize)

        plot_obs_vs_pred(self._y_pred, self._y_true, self.model._name, figsize, ax)

        if show_outliers and self._n_outliers > 0:
            # Apply the mask once instead of on every annotation.
            mask = self._outliers_residual_mask
            pred_outliers = self._y_pred[mask]
            true_outliers = self._y_true[mask]
            ax.scatter(
                pred_outliers,
                true_outliers,
                s=plot_options._dot_size,
                color="red",
            )
            # _include_text already encodes n_outliers <= MAX_N_OUTLIERS_TEXT.
            if self._include_text:
                annotations = [
                    ax.annotate(
                        label,
                        (x_val, y_val),
                        color="red",
                        fontsize=plot_options._axis_minor_ticklabel_font_size,
                    )
                    for label, x_val, y_val in zip(
                        self._outliers_df_idx, pred_outliers, true_outliers
                    )
                ]
                adjust_text(annotations, ax=ax)

        if fig is not None:
            fig.tight_layout()
            # Close only the figure created here; a bare plt.close() would
            # close whichever figure happens to be current.
            plt.close(fig)
        return fig

    def plot_residuals_vs_fitted(
        self,
        standardized: bool = False,
        show_outliers: bool = True,
        figsize: tuple[float, float] = (5.0, 5.0),
        ax: plt.Axes | None = None,
    ) -> plt.Figure:
        """Returns a figure that is a residuals vs fitted (y_pred) plot.

        Parameters
        ----------
        standardized : bool
            Default: False. If True, plots the standardized residuals as
            opposed to the raw residuals.
        show_outliers : bool
            Default: True. If True, colors the outliers determined by the
            standardized residuals in red.
        figsize : tuple[float, float]
            Default: (5.0, 5.0). Determines the size of the returned figure.
        ax : plt.Axes
            Default: None.

        Returns
        -------
        plt.Figure
        """
        return plot_residuals_vs_fitted(
            y_pred=self._y_pred,
            residuals=self._residuals,
            outliers_idx=self._outliers_df_idx,
            outliers_mask=self._outliers_residual_mask,
            show_outliers=show_outliers,
            standardized=standardized,
            include_text=self._include_text,
            figsize=figsize,
            ax=ax,
        )

    def plot_residuals_vs_var(
        self,
        predictor: str,
        standardized: bool = False,
        show_outliers: bool = False,
        figsize: tuple[float, float] = (5.0, 5.0),
        ax: plt.Axes | None = None,
    ) -> plt.Figure:
        """Returns a figure that is a plot of the residuals versus the values
        of a single predictor variable.

        Parameters
        ----------
        predictor : str
            The predictor variable whose values should be plotted on the x-axis.
        standardized : bool
            Default: False. If True, standardizes the residuals.
        show_outliers : bool
            Default: False. If True, plots the outliers in red.
        figsize : tuple[float, float]
            Default: (5.0, 5.0). Determines the size of the returned figure.
        ax : plt.Axes
            Default: None.

        Returns
        -------
        plt.Figure
        """
        return plot_residuals_vs_var(
            predictor=predictor,
            X_eval_df=self._X_eval_df,
            residuals=self._residuals,
            outliers_idx=self._outliers_df_idx,
            outliers_mask=self._outliers_residual_mask,
            show_outliers=show_outliers,
            standardized=standardized,
            include_text=self._include_text,
            figsize=figsize,
            ax=ax,
        )

    def plot_residuals_hist(
        self,
        standardized: bool = False,
        density: bool = False,
        figsize: tuple[float, float] = (5.0, 5.0),
        ax: plt.Axes | None = None,
    ) -> plt.Figure:
        """Returns a figure that is a histogram of the residuals.

        Parameters
        ----------
        standardized : bool
            Default: False. If True, standardizes the residuals.
        density : bool
            Default: False. If True, plots density rather than frequency.
        figsize : tuple[float, float]
            Default: (5.0, 5.0). Determines the size of the returned figure.
        ax : plt.Axes
            Default: None.

        Returns
        -------
        plt.Figure
        """
        return plot_residuals_hist(
            residuals=self._residuals,
            standardized=standardized,
            density=density,
            figsize=figsize,
            ax=ax,
        )

    def plot_scale_location(
        self,
        show_outliers: bool = True,
        figsize: tuple[float, float] = (5.0, 5.0),
        ax: plt.Axes | None = None,
    ) -> plt.Figure:
        """Returns a figure that is a plot of the
        sqrt of the residuals versus the fitted.

        Parameters
        ----------
        show_outliers : bool
            Default: True. If True, plots the outliers in red.
        figsize : tuple[float, float]
            Default: (5.0, 5.0).
        ax : plt.Axes
            Default: None.

        Returns
        -------
        plt.Figure
        """
        return plot_scale_location(
            y_pred=self._y_pred,
            # Reuse the standardized residuals computed in __init__.
            std_residuals=self._stdresiduals,
            show_outliers=show_outliers,
            outliers_idx=self._outliers_df_idx,
            outliers_mask=self._outliers_residual_mask,
            include_text=self._include_text,
            figsize=figsize,
            ax=ax,
        )

    def plot_residuals_vs_leverage(
        self,
        standardized: bool = True,
        show_outliers: bool = True,
        figsize: tuple[float, float] = (5.0, 5.0),
        ax: plt.Axes | None = None,
    ) -> plt.Figure:
        """Returns a figure that is a plot of the residuals versus leverage.

        Parameters
        ----------
        standardized : bool
            Default: True. If True, standardizes the residuals.
        show_outliers : bool
            Default: True. If True, plots the outliers in red.
        figsize : tuple[float, float]
            Default: (5.0, 5.0).
        ax : plt.Axes
            Default: None.

        Returns
        -------
        plt.Figure
            None (after printing a warning) when this report is not for the
            train dataset.

        Raises
        ------
        ValueError
            If the fitted estimator is not a RegressionResultsWrapper.
        """
        if not self._is_train:
            print_wrapped(TRAIN_ONLY_MESSAGE, type="WARNING")
            return None

        estimator = self.model.estimator
        if not isinstance(estimator, RegressionResultsWrapper):
            raise ValueError(
                "Leverage/influence statistics are not available for regularized models. "
                f"The statsmodels output type is {type(self.model.estimator)}."
            )
        leverage = estimator._results.get_influence().hat_matrix_diag

        return plot_residuals_vs_leverage(
            leverage=leverage,
            residuals=self._residuals,
            standardized=standardized,
            show_outliers=show_outliers,
            outliers_idx=self._outliers_df_idx,
            outliers_mask=self._outliers_residual_mask,
            include_text=self._include_text,
            figsize=figsize,
            ax=ax,
        )

    def plot_qq(
        self,
        standardized: bool = True,
        show_outliers: bool = False,
        figsize: tuple[float, float] = (5.0, 5.0),
        ax: plt.Axes | None = None,
    ) -> plt.Figure:
        """Returns a quantile-quantile plot.

        Parameters
        ----------
        standardized : bool
            Default: True. If True, standardizes the residuals.
        show_outliers : bool
            Default: False. If True, plots the outliers in red.
        figsize : tuple[float, float]
            Default: (5.0, 5.0).
        ax : plt.Axes
            Default: None.

        Returns
        -------
        plt.Figure
        """
        return plot_qq(
            df_idx=self._X_eval_df.index,
            residuals=self._residuals,
            standardized=standardized,
            outliers_idx=self._outliers_df_idx,
            outliers_mask=self._outliers_residual_mask,
            show_outliers=show_outliers,
            include_text=self._include_text,
            figsize=figsize,
            ax=ax,
        )

    def plot_diagnostics(
        self, show_outliers: bool = False, figsize: tuple[float, float] = (7.0, 7.0)
    ) -> plt.Figure:
        """Plots several useful linear regression diagnostic plots.

        The 2x2 grid holds: observed vs predicted, residuals vs fitted,
        residuals vs leverage (scale-location when leverage is unavailable),
        and a quantile-quantile plot.

        Parameters
        ----------
        show_outliers : bool
            Default: False. If True, plots the residual outliers in red.
        figsize : tuple[float, float]
            Default: (7.0, 7.0).

        Returns
        -------
        plt.Figure
        """
        fig, axs = plt.subplots(2, 2, figsize=figsize)

        self.plot_obs_vs_pred(show_outliers=show_outliers, ax=axs[0][0])
        self.plot_residuals_vs_fitted(show_outliers=show_outliers, ax=axs[0][1])

        # Use the same availability test as plot_residuals_vs_leverage. A
        # fitted statsmodels OLS is a RegressionResultsWrapper, which is not
        # an OLSResults instance, so testing against OLSResults would make
        # the leverage panel unreachable.
        leverage_available = self._is_train and isinstance(
            self.model.estimator, RegressionResultsWrapper
        )
        if leverage_available:
            self.plot_residuals_vs_leverage(show_outliers=show_outliers, ax=axs[1][0])
        else:
            self.plot_scale_location(show_outliers=show_outliers, ax=axs[1][0])

        self.plot_qq(show_outliers=show_outliers, ax=axs[1][1])

        fig.subplots_adjust(hspace=0.3, wspace=0.3)
        decrease_font_sizes_axs(axs, 2, 2, 0)
        plt.close(fig)
        return fig

    def set_outlier_threshold(self, threshold: float) -> "_SingleDatasetOLSReport":
        """Standardized residuals threshold for outlier identification.
        Recomputes the outliers.

        Parameters
        ----------
        threshold : float
            Default: 2. Must be a nonnegative value.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``threshold`` is negative.
        """
        if threshold < 0:
            raise ValueError(
                f"Input threshold must be nonnegative. Received {threshold}."
            )
        self._outlier_threshold = threshold
        self._compute_outliers()
        return self

    def get_outlier_indices(self) -> list:
        """Returns the indices corresponding to DataFrame examples associated
        with standardized residual outliers.

        Returns
        -------
        outliers_df_idx : list ~ (n_outliers)
        """
        return self._outliers_df_idx.tolist()

    def metrics(self) -> pd.DataFrame:
        """Returns a DataFrame containing the goodness-of-fit statistics
        for the model.

        Returns
        -------
        pd.DataFrame
            The scorer's statistics cast to float and rounded to the
            configured number of decimals.
        """
        return self.scorer.stats_df().astype(float).round(print_options._n_decimals)

    def _compute_outliers(self):
        """Computes the outliers.

        Refreshes the outlier mask, the DataFrame index labels of the
        outliers, the outlier count, and whether plots should label outliers
        with text (only when there are at most MAX_N_OUTLIERS_TEXT of them).
        """
        # |z| >= t is the same test as (z >= t) | (z <= -t).
        self._outliers_residual_mask = (
            np.abs(self._stdresiduals) >= self._outlier_threshold
        )
        self._outliers_df_idx = self._X_eval_df.iloc[
            self._outliers_residual_mask
        ].index.to_numpy()
        self._n_outliers = len(self._outliers_df_idx)
        self._include_text = self._n_outliers <= MAX_N_OUTLIERS_TEXT
class OLSReport:
    """OLSReport.
    Fits the model based on provided DataHandler.
    Contains methods for generating regression-relevant diagnostic
    plots and tables for a single linear regression model.
    """

    def __init__(
        self,
        model: OLSLinearModel,
        datahandler: DataHandler,
        target: str,
        predictors: list[str],
        dataemitter: DataEmitter | None = None,
    ):
        """OLSReport.
        Fits the model based on provided DataHandler.
        Contains methods for generating regression-relevant diagnostic
        plots and tables for a single linear regression model.

        Parameters
        ----------
        model : OLSModel
        datahandler : DataHandler
            The DataHandler object that contains the data.
        target : str
            The name of the target variable.
        predictors : list[str]
            The names of the predictor variables.
        dataemitter : DataEmitter
            Default: None. The DataEmitter object that emits the data.
            Optionally you can initialize the report with a DataEmitter object
            instead of a DataHandler object. If not None, target and
            predictors are not used to build the data; they are still stored
            and shown in the printed report.
        """
        self._model = model
        self._datahandler = datahandler
        if dataemitter is not None:
            self._dataemitter = dataemitter
        else:
            self._dataemitter = self._datahandler.train_test_emitter(target, predictors)
        self._model.specify_data(self._dataemitter)
        self._model.fit()
        self._target = target
        self._predictors = predictors
        self._train_report = _SingleDatasetOLSReport(model, "train")
        self._test_report = _SingleDatasetOLSReport(model, "test")

    def _report_for(self, dataset: str) -> _SingleDatasetOLSReport:
        """Returns the single-dataset report matching ``dataset``.

        Raises
        ------
        ValueError
            If ``dataset`` is neither "train" nor "test".
        """
        if dataset == "train":
            return self._train_report
        if dataset == "test":
            return self._test_report
        raise ValueError('The dataset must be either "train" or "test".')

    def _full_and_reduced_models(self, alternative_report: "OLSReport") -> tuple:
        """Returns (full_model, reduced_model) for this report and
        ``alternative_report``, based on their train design matrices.

        Raises
        ------
        ValueError
            If neither model's predictor set is a strict subset of the other's.
        """
        own_report = self._train_report
        alt_report = alternative_report.train_report()
        own_columns = own_report._X_eval_df.columns
        alt_columns = alt_report._X_eval_df.columns

        # Same number of predictors: neither can be a reduction of the other.
        if len(own_columns) == len(alt_columns):
            raise ValueError("One model must be a reduced version of the other")

        own_vars = set(own_columns)
        alt_vars = set(alt_columns)
        if not (own_vars < alt_vars or own_vars > alt_vars):
            raise ValueError("One model must be a reduced version of the other")

        if len(own_columns) > len(alt_columns):
            return own_report.model.estimator, alt_report.model.estimator
        return alt_report.model.estimator, own_report.model.estimator

    def train_report(self) -> _SingleDatasetOLSReport:
        """Returns the _SingleDatasetOLSReport object for the train dataset.

        Returns
        -------
        _SingleDatasetOLSReport
        """
        return self._train_report

    def test_report(self) -> _SingleDatasetOLSReport:
        """Returns the _SingleDatasetOLSReport object for the test dataset.

        Returns
        -------
        _SingleDatasetOLSReport
        """
        return self._test_report

    def model(self) -> OLSLinearModel:
        """Returns the fitted OLSLinearModel object.

        Returns
        -------
        OLSLinearModel
        """
        return self._model

    def metrics(self, dataset: Literal["train", "test", "both"]) -> pd.DataFrame:
        """Returns a DataFrame containing the goodness-of-fit statistics
        for the model.

        Parameters
        ----------
        dataset : Literal['train', 'test', 'both']
            The dataset to compute the metrics for.

        Returns
        -------
        pd.DataFrame
            For 'both', the train and test tables are stacked under an outer
            index level named "Dataset".

        Raises
        ------
        ValueError
            If ``dataset`` is not one of "train", "test" or "both".
        """
        if dataset == "train":
            return self._train_report.metrics()
        if dataset == "test":
            return self._test_report.metrics()
        if dataset == "both":
            # Each table has one column, named after the model.
            return pd.concat(
                [self._train_report.metrics(), self._test_report.metrics()],
                keys=["train", "test"],
                names=["Dataset"],
            )
        raise ValueError('dataset must be either "train", "test", or "both".')

    def step(
        self,
        direction: Literal["both", "backward", "forward"] = "backward",
        criteria: Literal["aic", "bic"] = "aic",
        kept_vars: list[str] | None = None,
        all_vars: list[str] | None = None,
        start_vars: list[str] | None = None,
        max_steps: int = 100,
    ) -> "OLSReport":
        """Performs stepwise selection. Returns a new
        OLSReport object with the reduced model.

        Parameters
        ----------
        direction : Literal["both", "backward", "forward"]
            Default: 'backward'. The direction of the stepwise selection.
        criteria : Literal["aic", "bic"]
            Default: 'aic'. The criteria to use for selecting the best model.
        kept_vars : list[str]
            Default: None. The variables that should be kept in the model.
            If None, defaults to an empty list.
        all_vars : list[str]
            Default: None. The variables that are candidates for inclusion in the model.
            If None, defaults to all variables in the training data.
        start_vars : list[str]
            Default: None.
            The variables to start the bidirectional stepwise selection with.
            Ignored if direction is not 'both'. If direction is 'both' and
            start_vars is None, then the starting variables are the kept_vars.
        max_steps : int
            Default: 100. The maximum number of steps to take.

        Returns
        -------
        OLSReport
            A new report for the reduced model, or self when no predictor
            was removed.

        Raises
        ------
        ValueError
            If ``direction`` is not a recognized value.
        """
        method_names = {
            "backward": "Backward selection",
            "both": "Alternating selection",
            "forward": "Forward selection",
        }
        if direction not in method_names:
            raise ValueError(f"Invalid argument: {direction}.")
        method_name = method_names[direction]

        selected_vars = self._model.step(
            direction=direction,
            criteria=criteria,
            kept_vars=kept_vars,
            all_vars=all_vars,
            start_vars=start_vars,
            max_steps=max_steps,
        )

        if all_vars is None:
            all_vars = self._model._dataemitter.X_vars()

        # Keep the candidates' original order so the printed list is
        # deterministic; a raw set difference has arbitrary ordering.
        selected_set = set(selected_vars)
        vars_removed = [
            var for var in dict.fromkeys(all_vars) if var not in selected_set
        ]

        if len(vars_removed) == 0:
            print_wrapped(
                f"{method_name} removed 0 predictors.", level="INFO", type="NOTE"
            )
            return self

        noun = "predictor" if len(vars_removed) == 1 else "predictors"
        print_wrapped(
            text=f"{method_name} removed {len(vars_removed)} {noun}: "
            + list_to_string(vars_removed)
            + ".",
            level="INFO",
            type="UPDATE",
        )

        new_emitter = self._dataemitter.copy()
        new_emitter.select_predictors_pre_onehot(selected_vars)

        return OLSReport(
            OLSLinearModel(
                alpha=self._model.alpha,
                l1_weight=self._model.l1_weight,
                name=self._model._name + f" (reduced, direction={direction})",
            ),
            self._datahandler,  # only used for y var scaler
            self._target,  # ignored
            selected_vars,  # ignored
            new_emitter,
        )

    def test_lr(self, alternative_report: "OLSReport") -> StatisticalTestReport:
        """Performs a likelihood ratio test to compare an alternative
        OLSLinearModel. Returns an object of class StatisticalTestReport
        describing the results.

        Parameters
        ----------
        alternative_report : OLSReport
            The report of an alternative OLSLinearModel. The alternative
            model must be a nested version of the current model or vice-versa.

        Returns
        -------
        StatisticalTestReport

        Raises
        ------
        ValueError
            If this report's model is regularized, or if neither model is a
            reduced version of the other.
        """
        if not isinstance(self._model.estimator, RegressionResultsWrapper):
            raise ValueError(
                "Likelihood ratio tests are not available for regularized models. "
                f"The model type is {type(self._model.estimator)}."
            )

        full_model, reduced_model = self._full_and_reduced_models(alternative_report)

        # Extract the results of the test and temporarily suppress warnings
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            lr_stat, p_value, dr_diff = full_model.compare_lr_test(reduced_model)

        return StatisticalTestReport(
            description="Likelihood Ratio Test",
            statistic=lr_stat,
            pval=p_value,
            degfree=dr_diff,
            statistic_description="Chi-square",
            null_hypothesis_description="The full model does not fit the "
            "data significantly better than the reduced model",
            alternative_hypothesis_description="The full model fits the "
            "data significantly better than the reduced model",
            assumptions_description="The data must be homoscedastic and "
            "uncorrelated",
        )

    def test_partialf(self, alternative_report: "OLSReport") -> StatisticalTestReport:
        """Performs a partial F-test to compare an alternative OLSLinearModel.
        Returns an object of class StatisticalTestReport describing the results.

        Parameters
        ----------
        alternative_report : OLSReport
            The report of an alternative OLSLinearModel. The alternative
            model must be a nested version of the current model or vice-versa.

        Returns
        -------
        StatisticalTestReport

        Raises
        ------
        ValueError
            If this report's model is regularized, or if neither model is a
            reduced version of the other.
        """
        if not isinstance(self._model.estimator, RegressionResultsWrapper):
            raise ValueError(
                "Partial F-tests are not available for regularized models."
            )

        full_model, reduced_model = self._full_and_reduced_models(alternative_report)

        # Extract the results of the test and suppress warnings temporarily
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            f_value, p_value, dr_diff = full_model.compare_f_test(reduced_model)

        return StatisticalTestReport(
            description="Partial F-Test",
            statistic=f_value,
            pval=p_value,
            degfree=dr_diff,
            statistic_description="F-statistic",
            null_hypothesis_description="The coefficients of the additional "
            "predictors are all zero",
            alternative_hypothesis_description="At least one of the "
            "coefficients of the additional predictors is not zero",
            assumptions_description="The data must be homoscedastic and "
            "have no autocorrelation",
        )

    def statsmodels_summary(self):
        """Returns the summary of the statsmodels RegressionResultsWrapper for
        OLS.

        Raises
        ------
        RuntimeError
            If the underlying ``summary()`` call fails; the original
            exception is chained as the cause.
        """
        try:
            return self._model.estimator.summary()
        except Exception as e:
            raise RuntimeError(
                "Error occured in statsmodels_summary call. " f"Error: {e}"
            ) from e

    def plot_obs_vs_pred(
        self,
        dataset: Literal["train", "test"],
        show_outliers: bool = True,
        figsize: tuple[float, float] = (5.0, 5.0),
        ax: plt.Axes | None = None,
    ) -> plt.Figure:
        """Plots a scatter plot of the true and predicted y values.

        Parameters
        ----------
        dataset : Literal['train', 'test']
            The dataset to generate the plot for.
        show_outliers : bool
            Default: True.
            If True, then the outliers calculated using standard errors will be
            shown in red.
        figsize : tuple[float, float]
            Default: (5.0,5.0). Sets the size of the resulting graph.
        ax : plt.Axes
            Default: None.

        Returns
        -------
        plt.Figure

        Raises
        ------
        ValueError
            If ``dataset`` is neither "train" nor "test".
        """
        return self._report_for(dataset).plot_obs_vs_pred(
            show_outliers=show_outliers, figsize=figsize, ax=ax
        )

    def plot_residuals_vs_fitted(
        self,
        dataset: Literal["train", "test"],
        standardized: bool = False,
        show_outliers: bool = True,
        figsize: tuple[float, float] = (5.0, 5.0),
        ax: plt.Axes | None = None,
    ) -> plt.Figure:
        """Plots the residuals versus the fitted values.

        Parameters
        ----------
        dataset : Literal['train', 'test']
            The dataset to generate the plot for.
        standardized : bool
            Default: False. If True, plots the standardized residuals as
            opposed to the raw residuals.
        show_outliers : bool
            Default: True. If True, colors the outliers determined by the
            standardized residuals in red.
        figsize : tuple[float, float]
            Default: (5.0, 5.0). Determines the size of the returned figure.
        ax : plt.Axes
            Default: None.

        Returns
        -------
        plt.Figure

        Raises
        ------
        ValueError
            If ``dataset`` is neither "train" nor "test".
        """
        return self._report_for(dataset).plot_residuals_vs_fitted(
            standardized=standardized,
            show_outliers=show_outliers,
            figsize=figsize,
            ax=ax,
        )

    def plot_residuals_vs_var(
        self,
        predictor: str,
        dataset: Literal["train", "test"],
        standardized: bool = False,
        show_outliers: bool = False,
        figsize: tuple[float, float] = (5.0, 5.0),
        ax: plt.Axes | None = None,
    ) -> plt.Figure:
        """Returns a figure that is a plot of the residuals versus the values
        of a single predictor variable.

        Parameters
        ----------
        predictor : str
            The predictor variable whose values should be plotted on the x-axis.
        dataset : Literal['train', 'test']
            The dataset to generate the plot for.
        standardized : bool
            Default: False. If True, standardizes the residuals.
        show_outliers : bool
            Default: False. If True, plots the outliers in red.
        figsize : tuple[float, float]
            Default: (5.0, 5.0). Determines the size of the returned figure.
        ax : plt.Axes
            Default: None.

        Returns
        -------
        plt.Figure

        Raises
        ------
        ValueError
            If ``dataset`` is neither "train" nor "test".
        """
        return self._report_for(dataset).plot_residuals_vs_var(
            predictor=predictor,
            standardized=standardized,
            show_outliers=show_outliers,
            figsize=figsize,
            ax=ax,
        )

    def plot_residuals_hist(
        self,
        dataset: Literal["train", "test"],
        standardized: bool = False,
        density: bool = False,
        figsize: tuple[float, float] = (5.0, 5.0),
        ax: plt.Axes | None = None,
    ) -> plt.Figure:
        """Returns a figure that is a histogram of the residuals.

        Parameters
        ----------
        dataset : Literal['train', 'test']
            The dataset to generate the plot for.
        standardized : bool
            Default: False. If True, standardizes the residuals.
        density : bool
            Default: False. If True, plots density rather than frequency.
        figsize : tuple[float, float]
            Default: (5.0, 5.0). Determines the size of the returned figure.
        ax : plt.Axes
            Default: None.

        Returns
        -------
        plt.Figure

        Raises
        ------
        ValueError
            If ``dataset`` is neither "train" nor "test".
        """
        return self._report_for(dataset).plot_residuals_hist(
            standardized=standardized, density=density, figsize=figsize, ax=ax
        )

    def plot_scale_location(
        self,
        dataset: Literal["train", "test"],
        show_outliers: bool = True,
        figsize: tuple[float, float] = (5.0, 5.0),
        ax: plt.Axes | None = None,
    ) -> plt.Figure:
        """Returns a figure that is a plot of the
        sqrt of the residuals versus the fitted.

        Parameters
        ----------
        dataset : Literal['train', 'test']
            The dataset to generate the plot for.
        show_outliers : bool
            Default: True. If True, plots the outliers in red.
        figsize : tuple[float, float]
            Default: (5.0, 5.0).
        ax : plt.Axes
            Default: None.

        Returns
        -------
        plt.Figure

        Raises
        ------
        ValueError
            If ``dataset`` is neither "train" nor "test".
        """
        return self._report_for(dataset).plot_scale_location(
            show_outliers=show_outliers, figsize=figsize, ax=ax
        )

    def plot_residuals_vs_leverage(
        self,
        dataset: Literal["train", "test"],
        standardized: bool = True,
        show_outliers: bool = True,
        figsize: tuple[float, float] = (5.0, 5.0),
        ax: plt.Axes | None = None,
    ) -> plt.Figure:
        """Plots the residuals versus leverage.

        Parameters
        ----------
        dataset : Literal['train', 'test']
            The dataset to generate the plot for. The single-dataset report
            only produces this plot for 'train'; for 'test' it prints a
            warning and returns None.
        standardized : bool
            Default: True. If True, standardizes the residuals.
        show_outliers : bool
            Default: True. If True, plots the outliers in red.
        figsize : tuple[float, float]
            Default: (5.0, 5.0).
        ax : plt.Axes
            Default: None.

        Returns
        -------
        plt.Figure

        Raises
        ------
        ValueError
            If ``dataset`` is neither "train" nor "test".
        """
        return self._report_for(dataset).plot_residuals_vs_leverage(
            standardized=standardized,
            show_outliers=show_outliers,
            figsize=figsize,
            ax=ax,
        )

    def plot_qq(
        self,
        dataset: Literal["train", "test"],
        standardized: bool = True,
        show_outliers: bool = False,
        figsize: tuple[float, float] = (5.0, 5.0),
        ax: plt.Axes | None = None,
    ) -> plt.Figure:
        """Plots a quantile-quantile plot of the residuals.

        Parameters
        ----------
        dataset : Literal['train', 'test']
            The dataset to generate the plot for.
        standardized : bool
            Default: True. If True, standardizes the residuals.
        show_outliers : bool
            Default: False. If True, plots the outliers in red.
        figsize : tuple[float, float]
            Default: (5.0, 5.0).
        ax : plt.Axes
            Default: None.

        Returns
        -------
        plt.Figure

        Raises
        ------
        ValueError
            If ``dataset`` is neither "train" nor "test".
        """
        return self._report_for(dataset).plot_qq(
            standardized=standardized,
            show_outliers=show_outliers,
            figsize=figsize,
            ax=ax,
        )

    def plot_diagnostics(
        self,
        dataset: Literal["train", "test"],
        show_outliers: bool = False,
        figsize: tuple[float, float] = (7.0, 7.0),
    ) -> plt.Figure:
        """Plots several useful linear regression diagnostic plots.

        Parameters
        ----------
        dataset : Literal['train', 'test']
            The dataset to generate the plot for.
        show_outliers : bool
            Default: False. If True, plots the residual outliers in red.
        figsize : tuple[float, float]
            Default: (7.0, 7.0).

        Returns
        -------
        plt.Figure

        Raises
        ------
        ValueError
            If ``dataset`` is neither "train" nor "test".
        """
        return self._report_for(dataset).plot_diagnostics(
            show_outliers=show_outliers, figsize=figsize
        )

    def set_outlier_threshold(self, threshold: float) -> "OLSReport":
        """Standardized residuals threshold for outlier identification.
        Recomputes the outliers.

        Parameters
        ----------
        threshold : float
            Default: 2. Must be a nonnegative value.

        Returns
        -------
        OLSReport
            Returns self for method chaining.
        """
        self._train_report.set_outlier_threshold(threshold=threshold)
        self._test_report.set_outlier_threshold(threshold=threshold)
        return self

    def get_outlier_indices(self, dataset: Literal["train", "test"] = "test") -> list:
        """Returns the indices corresponding to DataFrame examples associated
        with standardized residual outliers.

        Parameters
        ----------
        dataset : Literal['train', 'test']
            Default: 'test'.

        Returns
        -------
        outliers_df_idx : list ~ (n_outliers)

        Raises
        ------
        ValueError
            If ``dataset`` is neither "train" nor "test".
        """
        return self._report_for(dataset).get_outlier_indices()

    def coefs(
        self,
        format: Literal[
            "coef(se)|pval", "coef|se|pval", "coef(ci)|pval", "coef|ci_low|ci_high|pval"
        ] = "coef(se)|pval",
    ) -> pd.DataFrame:
        """Returns the coefficients of the model.

        Parameters
        ----------
        format : Literal["coef(se)|pval", "coef|se|pval", "coef(ci)|pval",
            "coef|ci_low|ci_high|pval"]
            Default: 'coef(se)|pval'.

        Returns
        -------
        pd.DataFrame
        """
        return self._model.coefs(format)

    def _compute_outliers(self, dataset: Literal["train", "test"] = "test"):
        """Computes the outliers.

        Parameters
        ----------
        dataset : Literal['train', 'test']
            Default: 'test'.

        Raises
        ------
        ValueError
            If ``dataset`` is neither "train" nor "test".
        """
        return self._report_for(dataset)._compute_outliers()

    def _to_dict(self) -> dict:
        """Returns the JSON serializable data stored in the report as a dictionary.

        Returns
        -------
        dict
        """
        return {
            "coefficients": self.coefs("coef(se)|pval").to_dict("index"),
            "train_metrics": self.metrics("train").to_dict("index"),
            "test_metrics": self.metrics("test").to_dict("index"),
        }

    def __str__(self) -> str:
        """Returns the formatted text report: title, target, predictors,
        train/test metrics and the coefficient table."""
        max_width = print_options._max_line_width
        n_dec = print_options._n_decimals

        top_divider = color_text("=" * max_width, "none") + "\n"
        bottom_divider = "\n" + color_text("=" * max_width, "none")
        divider = "\n" + color_text("-" * max_width, "none") + "\n"
        divider_invisible = "\n" + " " * max_width + "\n"

        # The title depends on the type of regularization in use.
        if self._model.alpha == 0:
            title_message = bold_text("Ordinary Least Squares Regression Report")
        elif self._model.l1_weight == 0:
            title_message = bold_text(
                f"Ridge Regression Report (alpha={self._model.alpha})"
            )
        elif self._model.l1_weight == 1:
            title_message = bold_text(
                f"Lasso Regression Report (alpha={self._model.alpha})"
            )
        else:
            title_message = bold_text(
                f"Elastic Net Regression Report (alpha={self._model.alpha}, "
                f"l1_ratio={self._model.l1_weight})"
            )

        target_var = "'" + self._target + "'"
        target_message = f"{bold_text('Target variable:')}\n"
        target_message += fill_ignore_format(
            color_text(target_var, "purple"),
            width=max_width,
            initial_indent=2,
            subsequent_indent=2,
        )

        predictors_message = (
            f"{bold_text(f'Predictor variables ({len(self._predictors)}):')}\n"
        )
        predictors_message += fill_ignore_format(
            list_to_string(self._predictors),
            width=max_width,
            initial_indent=2,
            subsequent_indent=2,
        )

        # Build each metrics table once rather than once per displayed value.
        train_metrics = self.metrics("train")
        test_metrics = self.metrics("test")
        mstr = str(self._model)

        def metric_row(label: str, key: str) -> str:
            """Formats one 'train | test' line for the metric ``key``."""

            def cell(metrics_df: pd.DataFrame) -> str:
                value = np.round(metrics_df.at[key, mstr], n_dec)
                return label + color_text(str(value), "yellow")

            return fill_ignore_format(
                format_two_column(
                    cell(train_metrics),
                    cell(test_metrics),
                    total_len=max_width - 2,
                ),
                initial_indent=4,
            )

        metrics_header = fill_ignore_format(
            format_two_column(
                bold_text(f"Train ({self._model._n_train})"),
                bold_text(f"Test ({self._model._n_test})"),
                total_len=max_width - 2,
            ),
            initial_indent=2,
        )
        metrics_message = f"{bold_text('Metrics:')}\n" + "\n".join(
            [
                metrics_header,
                metric_row("R2: ", "r2"),
                metric_row("Adj. R2: ", "adjr2"),
                metric_row("RMSE: ", "rmse"),
            ]
        )

        col_ratios = [3, 4, 3, 3]
        col_space = [max_width // sum(col_ratios) * i for i in col_ratios]
        coefs_df = self.coefs("coef|se|pval")
        coefs_df.index.name = "Predictor"
        coefs_message = bold_text("Coefficients:") + "\n"
        actual_coefs_message = fill_ignore_format(
            coefs_df.to_string(col_space=col_space[1:]),
            width=max_width,
            initial_indent=2,
            subsequent_indent=2,
        )
        # bold the first two lines of the actual_coefs_message
        actual_coefs_message = "\n".join(
            bold_text(line) if i in (0, 1) else line
            for i, line in enumerate(actual_coefs_message.split("\n"))
        )
        coefs_message += actual_coefs_message

        return (
            top_divider
            + title_message
            + divider
            + target_message
            + divider_invisible
            + predictors_message
            + divider
            + metrics_message
            + divider
            + coefs_message
            + bottom_divider
        )

    def _repr_pretty_(self, p, cycle):
        """Pretty-printer hook: writes the same text as ``str(self)`` to ``p``."""
        p.text(str(self))