import pandas as pd
from typing import Literal
from sklearn.model_selection import train_test_split
from .ml.predict import (
BaseR,
MLRegressionReport,
BaseC,
MLClassificationReport,
)
from .ml.cluster import BaseClust, ClusterReport
from .feature_selection import BaseFSR, BaseFSC, VotingSelectionReport
from .linear import (
OLSLinearModel,
OLSReport,
LogitLinearModel,
LogitReport,
MNLogitLinearModel,
MNLogitReport,
)
from .exploratory import (
EDAReport,
)
from .causal import CausalModel
from .display.print_utils import print_wrapped, quote_and_color
from .data.datahandler import DataHandler
from .utils import ensure_arg_list_uniqueness
[docs]
class Analyzer:
    """Analyzer is the high-level interface class of TableMage.

    An Analyzer object can be initialized from a single DataFrame, which is
    then split into train and test DataFrames, or, alternatively, from
    pre-split train and test DataFrames. The object can then be used to
    conduct a variety of analyses, including:

    - exploratory data analysis (the eda() method),
    - causal effect estimation (the causal() method),
    - voting-based feature selection (the select_features() method),
    - regression analysis (the ols() and logit() methods),
    - machine learning modeling (the classify(), regress(), and cluster()
      methods).

    The Analyzer object also handles data preprocessing tasks, such as
    scaling, imputing missing values, dropping rows with missing values,
    one-hot encoding, and selecting variables. These methods return the
    Analyzer itself, so they can be chained together for easy data
    transformation. The Analyzer object remembers how the data was
    transformed, enabling proper fitting and transforming of cross
    validation splits of the train dataset.
    """
[docs]
def __init__(
    self,
    df: pd.DataFrame,
    df_test: pd.DataFrame | None = None,
    test_size: float = 0.0,
    split_seed: int = 42,
    id_column: str | None = None,
    verbose: bool = True,
    name: str = "Unnamed Dataset",
):
    """Initializes an Analyzer object.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to be analyzed. Must be in wide format, i.e. with
        shape (n_units, n_vars). If df_test is provided, then df is treated
        as the train DataFrame. Otherwise, df is split into train and test
        DataFrames according to the test_size parameter.
    df_test : pd.DataFrame | None
        Default: None. If not None, then df is treated as the train
        DataFrame and df_test as the test DataFrame.
    test_size : float
        Default: 0. Proportion of the DataFrame to withhold for testing.
        If test_size is not greater than 0, then the train DataFrame and
        the test DataFrame will both be the same as the input df.
        If df_test is provided, then test_size is ignored.
    split_seed : int
        Default: 42. Used only for the train test split.
        If df_test is provided, then split_seed is ignored.
    id_column : str | None
        Default: None. The name of the column containing unique
        identifiers. If not None, then the column will be set as the index
        of both DataFrames. If None, then the input index is kept.
    verbose : bool
        Default: True. If True, prints helpful update messages for certain
        Analyzer function calls.
    name : str
        Default: 'Unnamed Dataset'. Name of the dataset the Analyzer is
        initialized for.

    Raises
    ------
    ValueError
        If id_column is given but missing from the train or test data, or
        if the resulting train or test DataFrame is empty.
    """
    self._split_seed = split_seed
    self._test_size = test_size
    self._verbose = verbose
    self._name = name

    # Work on copies so the caller's DataFrames are never modified, and
    # force column names to str.
    train_df = df.copy()
    train_df.columns = train_df.columns.astype(str)

    if df_test is not None:
        test_df = df_test.copy()
        test_df.columns = test_df.columns.astype(str)
    elif test_size > 0:
        # train_test_split returns DataFrames for DataFrame input, so no
        # re-wrapping of the two halves is needed.
        train_df, test_df = train_test_split(
            train_df, test_size=test_size, shuffle=True, random_state=split_seed
        )
    else:
        if self._verbose:
            print_wrapped(
                "No test dataset provided. The test dataset "
                + "will be treated as a train dataset copy.",
                type="NOTE",
            )
        test_df = train_df

    # Ensure column names are sorted (reindex also returns new objects, so
    # train and test never alias each other from here on).
    train_df = train_df.reindex(sorted(train_df.columns), axis=1)
    test_df = test_df.reindex(sorted(test_df.columns), axis=1)

    if id_column is not None:
        # Validate both frames (train first) before touching either index.
        for label, frame in (("train", train_df), ("test", test_df)):
            if id_column not in frame.columns:
                raise ValueError(
                    f"ID column {id_column} not found in {label} data."
                )
        train_df = train_df.set_index(id_column)
        test_df = test_df.set_index(id_column)

    self._datahandler = DataHandler(
        df_train=train_df,
        df_test=test_df,
        verbose=self._verbose,
        name=name,
    )

    if len(self._datahandler.df_train()) == 0:
        raise ValueError("Train DataFrame is empty.")
    if len(self._datahandler.df_test()) == 0:
        raise ValueError("Test DataFrame is empty.")

    if self._verbose:
        print_wrapped(
            "Analyzer initialized for dataset "
            f"{quote_and_color(self._name, 'yellow')}.",
            type="UPDATE",
        )
# --------------------------------------------------------------------------
# EDA + FEATURE SELECTION + CAUSAL EFFECT ESTIMATION + REGRESSION ANALYSIS
# --------------------------------------------------------------------------
[docs]
def eda(self, dataset: Literal["train", "test", "all"] = "all") -> EDAReport:
    """Builds an EDAReport for the requested slice of the working data.

    Parameters
    ----------
    dataset : Literal['train', 'test', 'all']
        Default: 'all'. Which working DataFrame to analyze: the train
        data, the test data, or both combined.

    Returns
    -------
    EDAReport
        Report object wrapping the selected DataFrame. It contains a
        variety of exploratory data analysis methods, including summary
        statistics for numeric and categorical variables, t-tests, and
        data visualizations.

    Raises
    ------
    ValueError
        If dataset is not one of 'train', 'test', or 'all'.
    """
    handler = self._datahandler
    if dataset == "all":
        frame = handler.df_all()
    elif dataset == "train":
        frame = handler.df_train()
    elif dataset == "test":
        frame = handler.df_test()
    else:
        raise ValueError(f"Invalid input: dataset = {dataset}.")
    return EDAReport(frame)
[docs]
@ensure_arg_list_uniqueness()
def causal(
    self,
    treatment: str,
    outcome: str,
    confounders: list[str],
    dataset: Literal["train", "test", "all"] = "all",
) -> CausalModel:
    """Creates a CausalModel for estimating causal effects.

    The CausalModel object contains rudimentary methods for causal effect
    estimation (weighted least squares, IPW estimator).

    Parameters
    ----------
    treatment : str
        The treatment variable. Must be binary numeric (0 or 1-valued).
    outcome : str
        The outcome variable.
    confounders : list[str]
        The confounding variables.
    dataset : Literal['train', 'test', 'all']
        Default: 'all'. The dataset to be analyzed.

    Returns
    -------
    CausalModel
        Model object bound to this Analyzer's DataHandler.
    """
    model_kwargs = {
        "datahandler": self._datahandler,
        "treatment": treatment,
        "outcome": outcome,
        "confounders": confounders,
        "dataset": dataset,
    }
    return CausalModel(**model_kwargs)
[docs]
@ensure_arg_list_uniqueness()
def select_features(
    self,
    target: str,
    predictors: list[str] | None = None,
    feature_selectors: list[BaseFSR] | list[BaseFSC] | None = None,
    max_n_features: int | None = None,
) -> VotingSelectionReport:
    """Selects the most important features via voting over several feature
    selection methods, for either regression or classification.

    Parameters
    ----------
    target : str
        The target variable.
    predictors : list[str] | None
        Default: None. The predictors to select from. If None, uses all
        variables except the target. The target is removed from the
        predictors if present. The list passed in is never modified.
    feature_selectors : list[BaseFSR] | list[BaseFSC] | None
        Default: None. The feature selection methods to use. When the
        target is a categorical variable, every selector must be a BaseFSC
        instance. None is forwarded unchanged to VotingSelectionReport.
    max_n_features : int | None
        Default: None. Maximum number of features to select.
        If None, then all features with at least 50% support are selected.

    Returns
    -------
    VotingSelectionReport
        Report object containing the results of the feature selection
        methods.

    Raises
    ------
    ValueError
        If the target is categorical and a selector is not a BaseFSC, or if
        a requested predictor is not a variable of the working data.
    """
    handler = self._datahandler

    # Guard against the default feature_selectors=None, which previously
    # crashed with a TypeError when iterated for a categorical target.
    if feature_selectors is not None and target in handler.categorical_vars():
        if not all(isinstance(fs, BaseFSC) for fs in feature_selectors):
            raise ValueError(
                "Feature selection methods for classification "
                + "should be instances of BaseFSC."
            )

    available = handler.vars()
    if predictors is not None:
        known = set(available)
        for predictor in predictors:
            if predictor not in known:
                raise ValueError(f"Predictor {predictor} not found in data.")
        candidates = predictors
    else:
        candidates = available

    # Build a fresh list without the target, so neither the caller's list
    # nor the list returned by the DataHandler is mutated.
    candidates = [var for var in candidates if var != target]

    return VotingSelectionReport(
        selectors=feature_selectors,
        dataemitter=handler.train_test_emitter(
            y_var=target,
            X_vars=candidates,
        ),
        max_n_features=max_n_features,
    )
[docs]
@ensure_arg_list_uniqueness()
def ols(
    self,
    target: str | None = None,
    predictors: list[str] | None = None,
    alpha: float = 0.0,
    l1_weight: float = 0.0,
) -> OLSReport:
    """Performs OLS regression. Units with missing data will be dropped.

    Parameters
    ----------
    target : str | None
        Default: None. The variable to be predicted. Must be one of the
        numeric variables of the working data.
    predictors : list[str] | None
        Default: None. If None, all variables except target will be used
        as predictors. The target is removed from the predictors if
        present. The list passed in is never modified.
    alpha : float
        Default: 0. Regularization strength. Must be a positive float.
    l1_weight : float
        Default: 0. The weight of the L1 penalty. Must be a float between
        0 and 1.

    Returns
    -------
    OLSReport
        The OLSReport object contains a variety of OLS regression methods,
        including summary statistics, model coefficients, and data
        visualizations.

    Raises
    ------
    ValueError
        If target is not a numeric variable.
    """
    handler = self._datahandler
    if target not in handler.numeric_vars():
        raise ValueError(
            f"Target variable {quote_and_color(target, 'yellow')} "
            + "is not numeric."
        )

    candidates = handler.vars() if predictors is None else predictors
    if target in candidates:
        if self._verbose:
            print_wrapped(
                f"Removing target variable {quote_and_color(target, 'yellow')} "
                + "from predictors.",
                type="WARNING",
            )
        # Filter into a new list instead of list.remove(), which would
        # mutate the caller's list or the list owned by the DataHandler.
        candidates = [var for var in candidates if var != target]
    else:
        candidates = list(candidates)

    return OLSReport(
        OLSLinearModel(alpha=alpha, l1_weight=l1_weight),
        handler,
        target,
        candidates,
    )
[docs]
@ensure_arg_list_uniqueness()
def logit(
    self,
    target: str | None = None,
    predictors: list[str] | None = None,
    alpha: float = 0.0,
    l1_weight: float = 0.0,
    threshold_strategy: Literal["f1", "roc"] | None = None,
) -> LogitReport | MNLogitReport:
    """Performs logistic regression. Units with missing data will be dropped.

    A binary logit model is used when the target has exactly two distinct
    non-missing values across the combined (train and test) data; otherwise
    a multinomial logit model is used.

    Parameters
    ----------
    target : str | None
        Default: None. The variable to be predicted.
    predictors : list[str] | None
        Default: None. If None, all variables except target will be used
        as predictors. The target is removed from the predictors if
        present. The list passed in is never modified.
    alpha : float
        Default: 0. Regularization strength. Must be a positive float.
    l1_weight : float
        Default: 0. The weight of the L1 penalty. Must be a float between
        0 and 1.
    threshold_strategy : Literal['f1', 'roc'] | None
        Default: None. The strategy for determining the threshold for
        binary classification. If None, the threshold is set to 0.5.

    Returns
    -------
    LogitReport | MNLogitReport
        The appropriate regression report object is returned.
    """
    handler = self._datahandler

    candidates = handler.vars() if predictors is None else predictors
    if target in candidates:
        if self._verbose:
            print_wrapped(
                f"Removing target variable "
                f"{quote_and_color(target, 'yellow')} " + "from predictors.",
                type="WARNING",
            )
        # Filter into a new list instead of list.remove(), which would
        # mutate the caller's list or the list owned by the DataHandler.
        candidates = [var for var in candidates if var != target]
    else:
        candidates = list(candidates)

    # Decide between binary and multinomial logit; nunique() ignores
    # missing values by default.
    is_binary = handler.df_all()[target].nunique() == 2
    if is_binary:
        model_cls, report_cls = LogitLinearModel, LogitReport
    else:
        model_cls, report_cls = MNLogitLinearModel, MNLogitReport

    model = model_cls(
        alpha=alpha,
        l1_weight=l1_weight,
        threshold_strategy=threshold_strategy,
    )
    return report_cls(model, handler, target, candidates)
# --------------------------------------------------------------------------
# MACHINE LEARNING
# --------------------------------------------------------------------------
[docs]
@ensure_arg_list_uniqueness()
def regress(
    self,
    models: list[BaseR],
    target: str,
    predictors: list[str] | None = None,
    feature_selectors: list[BaseFSR] | None = None,
    max_n_features: int | None = None,
    outer_cv: int | None = None,
    outer_cv_seed: int = 42,
) -> MLRegressionReport:
    """Conducts a comprehensive regression ML model benchmarking exercise.
    Observations with missing data will be dropped.

    Parameters
    ----------
    models : list[BaseR]
        Models to be evaluated.
    target : str
        The variable to be predicted.
    predictors : list[str] | None
        Default: None. If None, uses all variables except target as
        predictors. The target is removed from the predictors if present.
        The list passed in is never modified.
    feature_selectors : list[BaseFSR] | None
        Default: None. The feature selectors for voting selection. Feature
        selectors can be used to select the most important predictors.
        Feature selectors can also be specified at the model level. If
        specified here, the same feature selectors will be used for all
        models.
    max_n_features : int | None
        Default: None. Maximum number of predictors to utilize.
        Ignored if feature_selectors is None.
        If None, then all features with at least 50% support are selected.
    outer_cv : int | None
        Default: None. If not None, reports training scores via nested
        k-fold CV.
    outer_cv_seed : int
        Default: 42. The random seed for the outer cross validation loop.

    Returns
    -------
    MLRegressionReport
    """
    handler = self._datahandler
    candidates = handler.vars() if predictors is None else predictors
    # Build a new list rather than calling list.remove(), which would
    # mutate the caller's list or the list owned by the DataHandler.
    candidates = [var for var in candidates if var != target]

    return MLRegressionReport(
        models=models,
        datahandler=handler,
        target=target,
        predictors=candidates,
        feature_selectors=feature_selectors,
        max_n_features=max_n_features,
        outer_cv=outer_cv,
        outer_cv_seed=outer_cv_seed,
        verbose=self._verbose,
    )
[docs]
@ensure_arg_list_uniqueness()
def classify(
    self,
    models: list[BaseC],
    target: str,
    predictors: list[str] | None = None,
    feature_selectors: list[BaseFSC] | None = None,
    max_n_features: int | None = None,
    outer_cv: int | None = None,
    outer_cv_seed: int = 42,
) -> MLClassificationReport:
    """Conducts a comprehensive classification ML model benchmarking
    exercise. Observations with missing data will be dropped.

    Parameters
    ----------
    models : list[BaseC]
        Models to be evaluated.
    target : str
        The variable to be predicted.
    predictors : list[str] | None
        Default: None. If None, uses all variables except target as
        predictors. The target is removed from the predictors if present.
        The list passed in is never modified.
    feature_selectors : list[BaseFSC] | None
        Default: None. The feature selectors for voting selection. Feature
        selectors can be used to select the most important predictors.
        Feature selectors can also be specified at the model level. If
        specified here, the same feature selectors will be used for all
        models.
    max_n_features : int | None
        Default: None. Maximum number of predictors to utilize.
        Ignored if feature_selectors is None.
        If None, then all features with at least 50% support are selected.
    outer_cv : int | None
        Default: None. If not None, reports training scores via nested
        k-fold CV.
    outer_cv_seed : int
        Default: 42. The random seed for the outer cross validation loop.

    Returns
    -------
    MLClassificationReport
    """
    handler = self._datahandler
    candidates = handler.vars() if predictors is None else predictors
    # Build a new list rather than calling list.remove(), which would
    # mutate the caller's list or the list owned by the DataHandler.
    candidates = [var for var in candidates if var != target]

    return MLClassificationReport(
        models=models,
        datahandler=handler,
        target=target,
        predictors=candidates,
        feature_selectors=feature_selectors,
        max_n_features=max_n_features,
        outer_cv=outer_cv,
        outer_cv_seed=outer_cv_seed,
        verbose=self._verbose,
    )
[docs]
def cluster(
    self,
    models: list[BaseClust],
    features: list[str] | None = None,
    dataset: Literal["train", "all"] = "all",
) -> ClusterReport:
    """Conducts a clustering exercise.

    Parameters
    ----------
    models : list[BaseClust]
        Models to be evaluated.
    features : list[str] | None
        Default: None. The features to cluster on.
        If None, uses all the variables.
    dataset : Literal['train', 'all']
        Default: 'all'. Dataset to fit models on. If "train", only fits
        models on training data; cluster predictions can then be made on
        test data. If "all", fits models on all data.

    Returns
    -------
    ClusterReport
        Report object built from the models, this Analyzer's DataHandler,
        the chosen features, and the dataset selector.
    """
    handler = self._datahandler
    chosen = handler.vars() if features is None else features
    return ClusterReport(
        models=models,
        datahandler=handler,
        features=chosen,
        dataset=dataset,
    )
# --------------------------------------------------------------------------
# DATAHANDLER METHODS
# --------------------------------------------------------------------------
[docs]
def load_data_checkpoint(self, checkpoint_name: str | None = None) -> "Analyzer":
    """Restores the train and test DataFrames from a checkpoint.

    Parameters
    ----------
    checkpoint_name : str | None
        Default: None. The name of the checkpoint to load.
        If None, loads the original train and test DataFrames.

    Returns
    -------
    Analyzer
        Returns self for method chaining.
    """
    handler = self._datahandler
    handler.load_data_checkpoint(checkpoint_name)
    return self
[docs]
def save_data_checkpoint(self, checkpoint_name: str) -> "Analyzer":
    """Stores the current train and test DataFrames under a checkpoint name.

    Parameters
    ----------
    checkpoint_name : str
        The name of the checkpoint.

    Returns
    -------
    Analyzer
        Returns self for method chaining.
    """
    handler = self._datahandler
    handler.save_data_checkpoint(checkpoint_name)
    return self
[docs]
def remove_data_checkpoint(self, checkpoint_name: str) -> "Analyzer":
    """Removes a previously saved checkpoint.

    Parameters
    ----------
    checkpoint_name : str
        The name of the checkpoint to delete.

    Returns
    -------
    Analyzer
        Returns self for method chaining.
    """
    handler = self._datahandler
    handler.remove_data_checkpoint(checkpoint_name)
    return self
[docs]
def engineer_numeric_var(self, name: str, formula: str) -> "Analyzer":
    """Engineers a new numeric variable/feature based on a formula.

    The formula can only involve numeric variables. The work is delegated
    to the DataHandler's engineer_numeric_feature method.

    Parameters
    ----------
    name : str
        The name of the new variable engineered.
    formula : str
        Formula for the new feature. For example, "x1 + x2" would create
        a new feature that is the sum of the columns x1 and x2 in the
        DataFrame. All variables used must be numeric.
        Handles the following operations:

        - Addition (+)
        - Subtraction (-)
        - Multiplication (*)
        - Division (/)
        - Parentheses ()
        - Exponentiation (**)
        - Logarithm (log)
        - Exponential (exp)
        - Square root (sqrt)

        If the i-th unit is missing a value in any of the variables used
        in the formula, then the i-th unit of the new feature will be
        missing.

    Returns
    -------
    Analyzer
        Returns self for method chaining.

    Examples
    --------
    >>> analyzer = analyzer.engineer_numeric_var("x3", "x1 + x2")
    >>> "x3" in analyzer.vars()
    True
    >>> train = analyzer.df_train()
    >>> train["x3"].equals(train["x1"] + train["x2"])
    True
    """
    self._datahandler.engineer_numeric_feature(name, formula)
    return self
[docs]
def engineer_categorical_var(
    self,
    name: str,
    numeric_var: str,
    level_names: list[str],
    thresholds: list[float],
    leq: bool = False,
) -> "Analyzer":
    """Engineers a new categorical variable by binning a numeric variable
    on a list of thresholds.

    Parameters
    ----------
    name : str
        The name of the new variable engineered.
    numeric_var : str
        The name of the numeric variable.
    level_names : list[str]
        The names of the levels of the new categorical variable. The first
        level is the lowest level, and the last level is the highest level.
    thresholds : list[float]
        The (upper) thresholds for the levels of the new categorical
        variable. The thresholds must be in ascending order.
        For example, if thresholds = [0, 10, 20] and
        level_names = ["Low", "Medium", "High", "Very High"],
        then the new variable will have the following levels:

        - "Low" for values less than 0,
        - "Medium" for other values less than 10,
        - "High" for other values less than 20,
        - "Very High" for values greater than or equal to 20.
    leq : bool
        Default: False. If True, the thresholds are inclusive.

    Returns
    -------
    Analyzer
        Returns self for method chaining.
    """
    binning_args = (name, numeric_var, level_names, thresholds, leq)
    self._datahandler.engineer_categorical_feature(*binning_args)
    return self
[docs]
@ensure_arg_list_uniqueness()
def scale(
    self,
    include_vars: list[str] | None = None,
    exclude_vars: list[str] | None = None,
    strategy: Literal[
        "standardize",
        "minmax",
        "log",
        "log1p",
        "robust_standardize",
        "normal_quantile",
        "uniform_quantile",
    ] = "standardize",
) -> "Analyzer":
    """Scales variables by delegating to the DataHandler.

    Parameters
    ----------
    include_vars : list[str] | None
        Default: None. List of variables to scale.
        If None, scales values in all columns.
    exclude_vars : list[str] | None
        Default: None. List of variables to exclude from scaling.
        If None, no variables are excluded.
    strategy : str
        Default: 'standardize'. The scaling strategy; one of the literals
        listed in the signature.

    Returns
    -------
    Analyzer
        Returns self for method chaining.
    """
    scale_kwargs = {
        "include_vars": include_vars,
        "exclude_vars": exclude_vars,
        "strategy": strategy,
    }
    self._datahandler.scale(**scale_kwargs)
    return self
[docs]
@ensure_arg_list_uniqueness()
def impute(
    self,
    include_vars: list[str] | None = None,
    exclude_vars: list[str] | None = None,
    numeric_strategy: Literal["median", "mean", "5nn", "10nn"] = "median",
    categorical_strategy: Literal["most_frequent", "missing"] = "most_frequent",
) -> "Analyzer":
    """Imputes missing values by delegating to the DataHandler. The imputer
    is fit on the train DataFrame and transforms both train and test
    DataFrames.

    Parameters
    ----------
    include_vars : list[str] | None
        Default: None. List of variables to impute missing values.
        If None, imputes missing values in all columns.
    exclude_vars : list[str] | None
        Default: None. List of variables to exclude from imputing missing
        values. If None, no variables are excluded.
    numeric_strategy : Literal['median', 'mean', '5nn', '10nn']
        Default: 'median'. Strategy for imputing missing values in numeric
        variables.

        - 'median': impute with median.
        - 'mean': impute with mean.
        - '5nn': impute with 5-nearest neighbors.
        - '10nn': impute with 10-nearest neighbors.
    categorical_strategy : Literal['most_frequent', 'missing']
        Default: 'most_frequent'. Strategy for imputing missing values in
        categorical variables.

        - 'most_frequent': impute with most frequent value.
        - 'missing': impute with 'missing'.

    Returns
    -------
    Analyzer
        Returns self for method chaining.
    """
    impute_kwargs = {
        "include_vars": include_vars,
        "exclude_vars": exclude_vars,
        "numeric_strategy": numeric_strategy,
        "categorical_strategy": categorical_strategy,
    }
    self._datahandler.impute(**impute_kwargs)
    return self
[docs]
@ensure_arg_list_uniqueness()
def dropna(
    self,
    include_vars: list[str] | None = None,
    exclude_vars: list[str] | None = None,
) -> "Analyzer":
    """Drops observations (rows) with missing values on both the train and
    test DataFrames, by delegating to the DataHandler.

    Parameters
    ----------
    include_vars : list[str] | None
        Default: None. List of columns along which to drop rows with
        missing values. If None, drops rows with missing values in all
        columns.
    exclude_vars : list[str] | None
        Default: None. List of columns to exclude from dropping rows with
        missing values. If None, no variables are excluded.

    Returns
    -------
    Analyzer
        Returns self for method chaining.
    """
    dropna_kwargs = {"include_vars": include_vars, "exclude_vars": exclude_vars}
    self._datahandler.dropna(**dropna_kwargs)
    return self
[docs]
@ensure_arg_list_uniqueness()
def drop_highly_missing_vars(
    self,
    include_vars: list[str] | None = None,
    exclude_vars: list[str] | None = None,
    threshold: float = 0.5,
) -> "Analyzer":
    """Drops variables (columns) with missingness rate above a specified
    threshold, by delegating to the DataHandler.

    Parameters
    ----------
    include_vars : list[str] | None
        Default: None. If not None, only the specified variables are
        considered for dropping. Otherwise, all variables are considered.
    exclude_vars : list[str] | None
        Default: None. If not None, excludes the specified variables from
        the list of variables to drop (which is set to all variables by
        default).
    threshold : float
        Default: 0.5. Proportion of missing values above which a column is
        dropped. For example, if threshold = 0.2, then columns with more
        than 20% missing values are dropped.

    Returns
    -------
    Analyzer
        Returns self for method chaining.
    """
    handler = self._datahandler
    handler.drop_highly_missing_vars(include_vars, exclude_vars, threshold)
    return self
[docs]
@ensure_arg_list_uniqueness()
def onehot(
    self,
    include_vars: list[str] | None = None,
    exclude_vars: list[str] | None = None,
    dropfirst: bool = True,
    keep_original: bool = False,
) -> "Analyzer":
    """One-hot encodes the specified variables (columns), by delegating to
    the DataHandler.

    Parameters
    ----------
    include_vars : list[str] | None
        Default: None. List of variables to one-hot encode.
        If None, one-hot encodes all categorical variables.
    exclude_vars : list[str] | None
        Default: None. List of variables to exclude from one-hot encoding.
        If None, no variables are excluded.
    dropfirst : bool
        Default: True. If True, drops the first one-hot encoded column.
    keep_original : bool
        Default: False. If True, keeps the original variables in the
        DataFrame.

    Returns
    -------
    Analyzer
        Returns self for method chaining.
    """
    onehot_kwargs = {
        "include_vars": include_vars,
        "exclude_vars": exclude_vars,
        "dropfirst": dropfirst,
        "keep_original": keep_original,
    }
    self._datahandler.onehot(**onehot_kwargs)
    return self
[docs]
@ensure_arg_list_uniqueness()
def select_vars(
    self,
    include_vars: list[str] | None = None,
    exclude_vars: list[str] | None = None,
) -> "Analyzer":
    """Keeps only the specified variables in the working data.

    Parameters
    ----------
    include_vars : list[str] | None
        Default: None. List of variables to include.
        If None, includes all variables (in sorted order).
    exclude_vars : list[str] | None
        Default: None. List of variables to exclude.
        If None, no variables are excluded. When given, the remaining
        variables are de-duplicated and sorted.

    Returns
    -------
    Analyzer
        Returns self for method chaining.
    """
    handler = self._datahandler
    chosen = include_vars if include_vars is not None else sorted(handler.vars())
    if exclude_vars is not None:
        # Set difference drops the excluded names; sorting restores a
        # deterministic order.
        chosen = sorted(set(chosen).difference(exclude_vars))
    handler.select_vars(vars=chosen)
    return self
[docs]
@ensure_arg_list_uniqueness()
def force_numeric(self, vars: list[str]) -> "Analyzer":
    """Forces specified variables to numeric (float), by delegating to the
    DataHandler.

    Parameters
    ----------
    vars : list[str]
        Names of the variables to force to numeric.

    Returns
    -------
    Analyzer
        Returns self for method chaining.
    """
    handler = self._datahandler
    handler.force_numeric(vars)
    return self
[docs]
@ensure_arg_list_uniqueness()
def force_categorical(self, vars: list[str]) -> "Analyzer":
    """Forces specified variables (columns) to have categorical values, by
    delegating to the DataHandler. That is, the variables' values are
    converted to strings.

    Parameters
    ----------
    vars : list[str]
        Names of the variables to force to categorical.

    Returns
    -------
    Analyzer
        Returns self for method chaining.
    """
    handler = self._datahandler
    handler.force_categorical(vars)
    return self
[docs]
@ensure_arg_list_uniqueness()
def force_binary(
    self,
    var: str,
    pos_label: str | None = None,
    ignore_multiclass: bool = True,
    rename: bool = True,
) -> "Analyzer":
    """Forces a variable to be binary (a 0 and 1 valued numeric variable).

    The conversion itself is delegated to the DataHandler's force_binary
    method, called with a single variable and a single positive label.

    Parameters
    ----------
    var : str
        Name of the variable to force to binary.
    pos_label : str | None
        Default: None. The positive label. If None, the most common value
        of the variable in the combined (train and test) data is used as
        the positive class.
    ignore_multiclass : bool
        Default: True. Forwarded to the DataHandler. Presumably, if True,
        all classes except pos_label are labeled with zero when the
        variable has more than two classes - TODO confirm the behavior
        when False against DataHandler.force_binary.
    rename : bool
        Default: True. If True, the variable is renamed to
        {var}::{pos_label}.

    Returns
    -------
    Analyzer
        Returns self for method chaining.
    """
    if pos_label is None:
        # value_counts() sorts by descending frequency, so the first index
        # entry is the most common class.
        pos_label = self._datahandler.df_all()[var].value_counts().index[0]
    self._datahandler.force_binary(
        vars=[var],
        pos_labels=[pos_label],
        ignore_multiclass=ignore_multiclass,
        rename=rename,
    )
    return self
def datahandler(self) -> DataHandler:
    """Gives access to the underlying DataHandler.

    Returns
    -------
    DataHandler
        The DataHandler object takes care of data management and
        preprocessing.
    """
    handler = self._datahandler
    return handler
[docs]
def numeric_vars(self) -> list[str]:
    """Lists the numeric variables, as reported by the DataHandler.

    Returns
    -------
    list[str]
        The numeric variables.
    """
    handler = self._datahandler
    return handler.numeric_vars()
[docs]
def categorical_vars(self) -> list[str]:
    """Lists the categorical variables, as reported by the DataHandler.

    Returns
    -------
    list[str]
        The categorical variables.
    """
    handler = self._datahandler
    return handler.categorical_vars()
[docs]
def vars(self) -> list[str]:
    """Lists all variables, as reported by the DataHandler.

    Returns
    -------
    list[str]
        The variables.
    """
    handler = self._datahandler
    return handler.vars()
[docs]
def shape(self, dataset: Literal["train", "test"]) -> tuple[int, int]:
    """Reports the shape of the working train or test DataFrame.

    Parameters
    ----------
    dataset : Literal['train', 'test']
        The dataset to get the shape of.

    Returns
    -------
    tuple[int, int]
        The shape of the selected working DataFrame.

    Raises
    ------
    ValueError
        If dataset is neither 'train' nor 'test'.
    """
    handler = self._datahandler
    if dataset == "train":
        return handler._working_df_train.shape
    if dataset == "test":
        return handler._working_df_test.shape
    raise ValueError(f"Invalid input: dataset = {dataset}.")
[docs]
def value_counts(
    self, var: str, dataset: Literal["train", "test", "both", "all"] = "both"
) -> pd.Series:
    """Returns the value counts of a variable in the working train
    DataFrame, the working test DataFrame, or both combined.

    Parameters
    ----------
    var : str
        The variable to get the value counts of.
    dataset : Literal['train', 'test', 'both', 'all']
        Default: 'both'. The dataset to get the value counts of. 'all' is
        accepted as a synonym for 'both', matching the naming used by the
        other Analyzer methods.

    Returns
    -------
    pd.Series
        The value counts of the variable.

    Raises
    ------
    ValueError
        If dataset is not one of the accepted values.
    """
    handler = self._datahandler
    if dataset == "train":
        frame = handler._working_df_train
    elif dataset == "test":
        frame = handler._working_df_test
    elif dataset == "both" or dataset == "all":
        frame = handler.df_all()
    else:
        raise ValueError(f"Invalid input: dataset = {dataset}.")
    return frame[var].value_counts()
[docs]
def df_train(self) -> pd.DataFrame:
    """Fetches the working train DataFrame from the DataHandler.

    Returns
    -------
    pd.DataFrame
        The working train DataFrame.
    """
    handler = self._datahandler
    return handler.df_train()
[docs]
def df_test(self) -> pd.DataFrame:
    """Fetches the working test DataFrame from the DataHandler.

    Returns
    -------
    pd.DataFrame
        The working test DataFrame.
    """
    handler = self._datahandler
    return handler.df_test()
[docs]
def df_all(self) -> pd.DataFrame:
    """Fetches the combined working DataFrame from the DataHandler.

    Returns
    -------
    pd.DataFrame
        The working DataFrame.
    """
    handler = self._datahandler
    return handler.df_all()
def __len__(self) -> int:
    """Returns the length of the DataHandler, i.e. the number of units
    (rows) in the working train DataFrame."""
    handler = self._datahandler
    return len(handler)
def __str__(self) -> str:
    """Returns the DataHandler's metadata in string form."""
    handler = self._datahandler
    return handler.__str__()
def _repr_pretty_(self, p, cycle):
    """Pretty-printer hook: writes str(self) to the printer p.

    The cycle flag is accepted for signature compatibility but not used.
    """
    rendered = str(self)
    p.text(rendered)