# Source code for tablemage._src.analyzer

import pandas as pd
from typing import Literal
from sklearn.model_selection import train_test_split
from .ml.predict import (
    BaseR,
    MLRegressionReport,
    BaseC,
    MLClassificationReport,
)
from .ml.cluster import BaseClust, ClusterReport
from .feature_selection import BaseFSR, BaseFSC, VotingSelectionReport
from .linear import (
    OLSLinearModel,
    OLSReport,
    LogitLinearModel,
    LogitReport,
    MNLogitLinearModel,
    MNLogitReport,
)
from .exploratory import (
    EDAReport,
)
from .causal import CausalModel
from .display.print_utils import print_wrapped, quote_and_color
from .data.datahandler import DataHandler
from .utils import ensure_arg_list_uniqueness


class Analyzer:
    """High-level interface class of TableMage.

    An Analyzer can be initialized from a single DataFrame, which is then
    split into train and test DataFrames, or from pre-split train and test
    DataFrames. It can then be used to conduct a variety of analyses,
    including exploratory data analysis (``eda()``), regression analysis
    (``ols()`` and ``logit()``), and machine learning modeling
    (``classify()`` and ``regress()``).

    The Analyzer also exposes data preprocessing steps such as scaling,
    imputing missing values, dropping rows with missing values, one-hot
    encoding, and selecting variables. These methods return ``self`` so they
    can be chained. All preprocessing is delegated to an internal
    ``DataHandler``.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        df_test: pd.DataFrame | None = None,
        test_size: float = 0.0,
        split_seed: int = 42,
        id_column: str | None = None,
        verbose: bool = True,
        name: str = "Unnamed Dataset",
    ):
        """Initializes an Analyzer object.

        Parameters
        ----------
        df : pd.DataFrame
            The DataFrame to be analyzed. Must be in wide format, i.e. with
            shape (n_units, n_vars). If df_test is provided, df is treated as
            the train DataFrame. Otherwise, df is split into train and test
            DataFrames according to test_size.

        df_test : pd.DataFrame | None
            Default: None. If not None, df is treated as the train DataFrame.

        test_size : float
            Default: 0. Proportion of the DataFrame to withhold for testing.
            If test_size is not greater than 0, the train and test DataFrames
            are both built from the full input df.
            Ignored if df_test is provided.

        split_seed : int
            Default: 42. Used only for the train test split.
            Ignored if df_test is provided.

        id_column : str | None
            Default: None. The name of the column containing unique
            identifiers. If not None, the column is set as the index of both
            DataFrames. If None, the input index is kept.

        verbose : bool
            Default: True. If True, prints update messages for certain
            Analyzer function calls.

        name : str
            Default: 'Unnamed Dataset'. Name of the dataset.

        Raises
        ------
        ValueError
            If id_column is missing from the train or test data, or if the
            resulting train or test DataFrame is empty.
        """
        self._split_seed = split_seed
        self._test_size = test_size
        self._verbose = verbose
        self._name = name

        # Work on copies so the caller's DataFrames are never modified,
        # and force column names to str.
        df_train = df.copy()
        df_train.columns = df_train.columns.astype(str)

        if df_test is not None:
            df_test = df_test.copy()
            df_test.columns = df_test.columns.astype(str)
        elif test_size > 0:
            # train_test_split already returns DataFrames for DataFrame input.
            df_train, df_test = train_test_split(
                df_train,
                test_size=test_size,
                shuffle=True,
                random_state=split_seed,
            )
        else:
            if self._verbose:
                print_wrapped(
                    "No test dataset provided. The test dataset "
                    + "will be treated as a train dataset copy.",
                    type="NOTE",
                )
            # The reindex in the helper below yields two independent frames.
            df_test = df_train

        df_train, df_test = self._sort_columns_and_set_index(
            df_train, df_test, id_column
        )
        self._datahandler = DataHandler(
            df_train=df_train,
            df_test=df_test,
            verbose=self._verbose,
            name=name,
        )

        if len(self._datahandler.df_train()) == 0:
            raise ValueError("Train DataFrame is empty.")
        if len(self._datahandler.df_test()) == 0:
            raise ValueError("Test DataFrame is empty.")

        if self._verbose:
            print_wrapped(
                "Analyzer initialized for dataset "
                f"{quote_and_color(self._name, 'yellow')}.",
                type="UPDATE",
            )

    @staticmethod
    def _sort_columns_and_set_index(
        df_train: pd.DataFrame,
        df_test: pd.DataFrame,
        id_column: str | None,
    ) -> tuple[pd.DataFrame, pd.DataFrame]:
        """Sorts the columns of both frames and optionally sets the id index.

        Raises ValueError if id_column is given but missing from either frame.
        """
        df_train = df_train.reindex(sorted(df_train.columns), axis=1)
        df_test = df_test.reindex(sorted(df_test.columns), axis=1)
        if id_column is not None:
            if id_column not in df_train.columns:
                raise ValueError(f"ID column {id_column} not found in train data.")
            if id_column not in df_test.columns:
                raise ValueError(f"ID column {id_column} not found in test data.")
            df_train = df_train.set_index(id_column)
            df_test = df_test.set_index(id_column)
        return df_train, df_test

    def _resolve_predictors(
        self,
        target: str | None,
        predictors: list[str] | None,
        warn_on_removal: bool = False,
    ) -> list[str]:
        """Returns a fresh predictor list that does not contain the target.

        If predictors is None, all variables of the DataHandler are used.
        A new list is always built, so neither the caller's list nor the
        list returned by the DataHandler is mutated.
        """
        if predictors is None:
            candidates = list(self._datahandler.vars())
        else:
            candidates = list(predictors)
        if target in candidates:
            if warn_on_removal and self._verbose:
                print_wrapped(
                    f"Removing target variable {quote_and_color(target, 'yellow')} "
                    + "from predictors.",
                    type="WARNING",
                )
            candidates = [var for var in candidates if var != target]
        return candidates

    # --------------------------------------------------------------------------
    # EDA + FEATURE SELECTION + CAUSAL EFFECT ESTIMATION + REGRESSION ANALYSIS
    # --------------------------------------------------------------------------

    def eda(self, dataset: Literal["train", "test", "all"] = "all") -> EDAReport:
        """Constructs an EDAReport for the working train dataset, the working
        test dataset, or both datasets combined.

        Parameters
        ----------
        dataset : Literal['train', 'test', 'all']
            The dataset to be analyzed. By default, analyzes all data.

        Returns
        -------
        EDAReport
            Report object built from the selected DataFrame.

        Raises
        ------
        ValueError
            If dataset is not one of 'train', 'test', 'all'.
        """
        if dataset == "train":
            return EDAReport(self._datahandler.df_train())
        elif dataset == "test":
            return EDAReport(self._datahandler.df_test())
        elif dataset == "all":
            return EDAReport(self._datahandler.df_all())
        else:
            raise ValueError(f"Invalid input: dataset = {dataset}.")

    @ensure_arg_list_uniqueness()
    def causal(
        self,
        treatment: str,
        outcome: str,
        confounders: list[str],
        dataset: Literal["train", "test", "all"] = "all",
    ) -> CausalModel:
        """Returns a CausalModel object for estimating causal effects.

        Parameters
        ----------
        treatment : str
            The treatment variable. Must be binary numeric (0 or 1-valued).

        outcome : str
            The outcome variable.

        confounders : list[str]
            The confounding variables.

        dataset : Literal['train', 'test', 'all']
            The dataset to be analyzed. By default, analyzes all data.

        Returns
        -------
        CausalModel
            Object exposing the causal effect estimation methods.
        """
        return CausalModel(
            datahandler=self._datahandler,
            treatment=treatment,
            outcome=outcome,
            confounders=confounders,
            dataset=dataset,
        )

    @ensure_arg_list_uniqueness()
    def select_features(
        self,
        target: str,
        predictors: list[str] | None = None,
        feature_selectors: list[BaseFSR] | list[BaseFSC] | None = None,
        max_n_features: int | None = None,
    ) -> VotingSelectionReport:
        """Selects the most important features via voting over one or more
        feature selection methods (for regression or classification).

        Parameters
        ----------
        target : str
            The target variable.

        predictors : list[str] | None
            Default: None. The predictors to select from.
            If None, uses all variables except the target as predictors.
            The passed list is not modified.

        feature_selectors : list[BaseFSR] | list[BaseFSC] | None
            Default: None. The feature selection methods to use. None is
            forwarded unchanged to VotingSelectionReport.

        max_n_features : int | None
            Default: None. Maximum number of features to select.
            If None, then all features with at least 50% support are selected.

        Returns
        -------
        VotingSelectionReport
            Report object containing the results of the feature selection.

        Raises
        ------
        ValueError
            If the target is categorical and a selector is not a BaseFSC, or
            if a given predictor is not a variable of the data.
        """
        # Only validate selectors that were actually supplied; iterating over
        # the default None would raise TypeError.
        if (
            feature_selectors is not None
            and target in self._datahandler.categorical_vars()
        ):
            for selector in feature_selectors:
                if not isinstance(selector, BaseFSC):
                    raise ValueError(
                        "Feature selection methods for classification "
                        + "should be instances of BaseFSC."
                    )

        if predictors is not None:
            known_vars = set(self._datahandler.vars())
            for predictor in predictors:
                if predictor not in known_vars:
                    raise ValueError(f"Predictor {predictor} not found in data.")

        predictors = self._resolve_predictors(target, predictors)

        return VotingSelectionReport(
            selectors=feature_selectors,
            dataemitter=self._datahandler.train_test_emitter(
                y_var=target,
                X_vars=predictors,
            ),
            max_n_features=max_n_features,
        )

    @ensure_arg_list_uniqueness()
    def ols(
        self,
        target: str | None = None,
        predictors: list[str] | None = None,
        alpha: float = 0.0,
        l1_weight: float = 0.0,
    ) -> OLSReport:
        """Performs OLS regression. Units with missing data will be dropped.

        Parameters
        ----------
        target : str | None
            Default: None. The variable to be predicted. Must be one of the
            numeric variables.

        predictors : list[str] | None
            Default: None.
            If None, all variables except target will be used as predictors.
            The passed list is not modified.

        alpha : float
            Default: 0. Regularization strength. Must be non-negative.

        l1_weight : float
            Default: 0. The weight of the L1 penalty. Must be between 0 and 1.

        Returns
        -------
        OLSReport
            Report object for the fitted OLS model.

        Raises
        ------
        ValueError
            If target is not a numeric variable.
        """
        if target not in self._datahandler.numeric_vars():
            raise ValueError(
                f"Target variable {quote_and_color(target, 'yellow')} "
                + "is not numeric."
            )

        predictors = self._resolve_predictors(
            target, predictors, warn_on_removal=True
        )

        return OLSReport(
            OLSLinearModel(alpha=alpha, l1_weight=l1_weight),
            self._datahandler,
            target,
            predictors,
        )

    @ensure_arg_list_uniqueness()
    def logit(
        self,
        target: str | None = None,
        predictors: list[str] | None = None,
        alpha: float = 0.0,
        l1_weight: float = 0.0,
        threshold_strategy: Literal["f1", "roc"] | None = None,
    ) -> LogitReport | MNLogitReport:
        """Performs logistic regression. Units with missing data will be dropped.

        A binary logit is fit when the target has exactly two distinct
        non-missing values over all data; otherwise a multinomial logit is fit.

        Parameters
        ----------
        target : str | None
            Default: None. The variable to be predicted.

        predictors : list[str] | None
            Default: None.
            If None, all variables except target will be used as predictors.
            The passed list is not modified.

        alpha : float
            Default: 0. Regularization strength. Must be non-negative.

        l1_weight : float
            Default: 0. The weight of the L1 penalty. Must be between 0 and 1.

        threshold_strategy : Literal['f1', 'roc'] | None
            Default: None. The strategy for determining the threshold for
            binary classification. If None, the threshold is set to 0.5.

        Returns
        -------
        LogitReport | MNLogitReport
            The appropriate regression report object is returned.
        """
        predictors = self._resolve_predictors(
            target, predictors, warn_on_removal=True
        )

        # Decide between binary and multinomial logit.
        n_classes = self._datahandler.df_all()[target].nunique(dropna=True)
        if n_classes == 2:
            return LogitReport(
                LogitLinearModel(
                    alpha=alpha,
                    l1_weight=l1_weight,
                    threshold_strategy=threshold_strategy,
                ),
                self._datahandler,
                target,
                predictors,
            )
        return MNLogitReport(
            MNLogitLinearModel(
                alpha=alpha,
                l1_weight=l1_weight,
                threshold_strategy=threshold_strategy,
            ),
            self._datahandler,
            target,
            predictors,
        )

    # --------------------------------------------------------------------------
    # MACHINE LEARNING
    # --------------------------------------------------------------------------

    @ensure_arg_list_uniqueness()
    def regress(
        self,
        models: list[BaseR],
        target: str,
        predictors: list[str] | None = None,
        feature_selectors: list[BaseFSR] | None = None,
        max_n_features: int | None = None,
        outer_cv: int | None = None,
        outer_cv_seed: int = 42,
    ) -> MLRegressionReport:
        """Conducts a comprehensive regression ML model benchmarking exercise.
        Observations with missing data will be dropped.

        Parameters
        ----------
        models : list[BaseR]
            Models to be evaluated.

        target : str
            The variable to be predicted.

        predictors : list[str] | None
            Default: None.
            If None, uses all variables except target as predictors.
            The passed list is not modified.

        feature_selectors : list[BaseFSR] | None
            Default: None. The feature selectors for voting selection.
            Feature selectors can also be specified at the model level. If
            specified here, the same feature selectors will be used for all
            models.

        max_n_features : int | None
            Default: None. Maximum number of predictors to utilize.
            Ignored if feature_selectors is None.
            If None, then all features with at least 50% support are selected.

        outer_cv : int | None
            Default: None.
            If not None, reports training scores via nested k-fold CV.

        outer_cv_seed : int
            Default: 42. The random seed for the outer cross validation loop.

        Returns
        -------
        MLRegressionReport
        """
        predictors = self._resolve_predictors(target, predictors)

        return MLRegressionReport(
            models=models,
            datahandler=self._datahandler,
            target=target,
            predictors=predictors,
            feature_selectors=feature_selectors,
            max_n_features=max_n_features,
            outer_cv=outer_cv,
            outer_cv_seed=outer_cv_seed,
            verbose=self._verbose,
        )

    @ensure_arg_list_uniqueness()
    def classify(
        self,
        models: list[BaseC],
        target: str,
        predictors: list[str] | None = None,
        feature_selectors: list[BaseFSC] | None = None,
        max_n_features: int | None = None,
        outer_cv: int | None = None,
        outer_cv_seed: int = 42,
    ) -> MLClassificationReport:
        """Conducts a comprehensive classification ML model benchmarking
        exercise. Observations with missing data will be dropped.

        Parameters
        ----------
        models : list[BaseC]
            Models to be evaluated.

        target : str
            The variable to be predicted.

        predictors : list[str] | None
            Default: None.
            If None, uses all variables except target as predictors.
            The passed list is not modified.

        feature_selectors : list[BaseFSC] | None
            Default: None. The feature selectors for voting selection.
            Feature selectors can also be specified at the model level. If
            specified here, the same feature selectors will be used for all
            models.

        max_n_features : int | None
            Default: None. Maximum number of predictors to utilize.
            Ignored if feature_selectors is None.
            If None, then all features with at least 50% support are selected.

        outer_cv : int | None
            Default: None.
            If not None, reports training scores via nested k-fold CV.

        outer_cv_seed : int
            Default: 42. The random seed for the outer cross validation loop.

        Returns
        -------
        MLClassificationReport
        """
        predictors = self._resolve_predictors(target, predictors)

        return MLClassificationReport(
            models=models,
            datahandler=self._datahandler,
            target=target,
            predictors=predictors,
            feature_selectors=feature_selectors,
            max_n_features=max_n_features,
            outer_cv=outer_cv,
            outer_cv_seed=outer_cv_seed,
            verbose=self._verbose,
        )

    def cluster(
        self,
        models: list[BaseClust],
        features: list[str] | None = None,
        dataset: Literal["train", "all"] = "all",
    ) -> ClusterReport:
        """Conducts a clustering exercise.

        Parameters
        ----------
        models : list[BaseClust]
            Models to be evaluated.

        features : list[str] | None
            Default: None. The features to cluster on.
            If None, uses all the variables.

        dataset : Literal['train', 'all']
            Dataset to fit models on. If "train", only fits models on training
            data; cluster predictions can then be made on test data.
            If "all", fits models on all data (the default).

        Returns
        -------
        ClusterReport
        """
        if features is None:
            features = self._datahandler.vars()

        return ClusterReport(
            models=models,
            datahandler=self._datahandler,
            features=features,
            dataset=dataset,
        )

    # --------------------------------------------------------------------------
    # DATAHANDLER METHODS
    # --------------------------------------------------------------------------

    def load_data_checkpoint(self, checkpoint_name: str | None = None) -> "Analyzer":
        """Loads a saved data checkpoint.

        Parameters
        ----------
        checkpoint_name : str | None
            Default: None. The name of the checkpoint to load.
            If None, loads the original train and test DataFrames.

        Returns
        -------
        Analyzer
            Returns self for method chaining.
        """
        self._datahandler.load_data_checkpoint(checkpoint_name)
        return self

    def save_data_checkpoint(self, checkpoint_name: str) -> "Analyzer":
        """Saves the current train and test DataFrames.

        Parameters
        ----------
        checkpoint_name : str
            The name of the checkpoint.

        Returns
        -------
        Analyzer
            Returns self for method chaining.
        """
        self._datahandler.save_data_checkpoint(checkpoint_name)
        return self

    def remove_data_checkpoint(self, checkpoint_name: str) -> "Analyzer":
        """Deletes a saved checkpoint.

        Parameters
        ----------
        checkpoint_name : str
            The name of the checkpoint to delete.

        Returns
        -------
        Analyzer
            Returns self for method chaining.
        """
        self._datahandler.remove_data_checkpoint(checkpoint_name)
        return self

    def engineer_numeric_var(self, name: str, formula: str) -> "Analyzer":
        """Engineers a new numeric variable/feature based on a formula.
        The formula can only involve numeric variables.

        Parameters
        ----------
        name : str
            The name of the new variable engineered.

        formula : str
            Formula for the new feature. For example, "x1 + x2" would create
            a new feature that is the sum of the columns x1 and x2.
            All variables used must be numeric.
            Handles the following operations:

            - Addition (+)
            - Subtraction (-)
            - Multiplication (*)
            - Division (/)
            - Parentheses ()
            - Exponentiation (**)
            - Logarithm (log)
            - Exponential (exp)
            - Square root (sqrt)

            If the i-th unit is missing a value in any of the variables used
            in the formula, then the i-th unit of the new feature will be
            missing.

        Examples
        --------
        >>> _ = analyzer.engineer_numeric_var("x3", "x1 + x2")
        >>> "x3" in analyzer.vars()
        True
        >>> analyzer.df_train()["x3"].equals(
        ...     analyzer.df_train()["x1"] + analyzer.df_train()["x2"]
        ... )
        True

        Returns
        -------
        Analyzer
            Returns self for method chaining.
        """
        self._datahandler.engineer_numeric_feature(name, formula)
        return self

    def engineer_categorical_var(
        self,
        name: str,
        numeric_var: str,
        level_names: list[str],
        thresholds: list[float],
        leq: bool = False,
    ) -> "Analyzer":
        """Engineers a new categorical variable/feature based on a list of
        thresholds applied to a numeric variable.

        Parameters
        ----------
        name : str
            The name of the new variable engineered.

        numeric_var : str
            The name of the numeric variable.

        level_names : list[str]
            The names of the levels of the new categorical variable.
            The first level is the lowest level, and the last level is the
            highest level.

        thresholds : list[float]
            The (upper) thresholds for the levels of the new categorical
            variable. The thresholds must be in ascending order.
            For example, if thresholds = [0, 10, 20]
            and level_names = ["Low", "Medium", "High", "Very High"],
            then the new variable will have the following levels:

            - "Low" for values less than 0,
            - "Medium" for other values less than 10,
            - "High" for other values less than 20,
            - "Very High" for values greater than or equal to 20.

        leq : bool
            Default: False. If True, the thresholds are inclusive.

        Returns
        -------
        Analyzer
            Returns self for method chaining.
        """
        self._datahandler.engineer_categorical_feature(
            name, numeric_var, level_names, thresholds, leq
        )
        return self

    @ensure_arg_list_uniqueness()
    def scale(
        self,
        include_vars: list[str] | None = None,
        exclude_vars: list[str] | None = None,
        strategy: Literal[
            "standardize",
            "minmax",
            "log",
            "log1p",
            "robust_standardize",
            "normal_quantile",
            "uniform_quantile",
        ] = "standardize",
    ) -> "Analyzer":
        """Scales the variables.

        Parameters
        ----------
        include_vars : list[str] | None
            Default: None. List of variables to scale.
            If None, scales values in all columns.

        exclude_vars : list[str] | None
            Default: None. List of variables to exclude from scaling.
            If None, no variables are excluded.

        strategy : str
            Default: 'standardize'. The scaling strategy.

        Returns
        -------
        Analyzer
            Returns self for method chaining.
        """
        self._datahandler.scale(
            include_vars=include_vars,
            exclude_vars=exclude_vars,
            strategy=strategy,
        )
        return self

    @ensure_arg_list_uniqueness()
    def impute(
        self,
        include_vars: list[str] | None = None,
        exclude_vars: list[str] | None = None,
        numeric_strategy: Literal["median", "mean", "5nn", "10nn"] = "median",
        categorical_strategy: Literal["most_frequent", "missing"] = "most_frequent",
    ) -> "Analyzer":
        """Imputes missing values. The imputer is fit on the train DataFrame
        and transforms both train and test DataFrames.

        Parameters
        ----------
        include_vars : list[str] | None
            Default: None. List of variables to impute missing values.
            If None, imputes missing values in all columns.

        exclude_vars : list[str] | None
            Default: None. List of variables to exclude from imputing.
            If None, no variables are excluded.

        numeric_strategy : Literal['median', 'mean', '5nn', '10nn']
            Default: 'median'.
            Strategy for imputing missing values in numeric variables.

            - 'median': impute with median.
            - 'mean': impute with mean.
            - '5nn': impute with 5-nearest neighbors.
            - '10nn': impute with 10-nearest neighbors.

        categorical_strategy : Literal['most_frequent', 'missing']
            Default: 'most_frequent'.
            Strategy for imputing missing values in categorical variables.

            - 'most_frequent': impute with most frequent value.
            - 'missing': impute with 'missing'.

        Returns
        -------
        Analyzer
            Returns self for method chaining.
        """
        self._datahandler.impute(
            include_vars=include_vars,
            exclude_vars=exclude_vars,
            numeric_strategy=numeric_strategy,
            categorical_strategy=categorical_strategy,
        )
        return self

    @ensure_arg_list_uniqueness()
    def dropna(
        self,
        include_vars: list[str] | None = None,
        exclude_vars: list[str] | None = None,
    ) -> "Analyzer":
        """Drops observations (rows) with missing values on both the train
        and test DataFrames.

        Parameters
        ----------
        include_vars : list[str] | None
            Default: None.
            List of columns along which to drop rows with missing values.
            If None, drops rows with missing values in all columns.

        exclude_vars : list[str] | None
            Default: None.
            List of columns to exclude from dropping rows with missing
            values. If None, no variables are excluded.

        Returns
        -------
        Analyzer
            Returns self for method chaining.
        """
        self._datahandler.dropna(
            include_vars=include_vars,
            exclude_vars=exclude_vars,
        )
        return self

    @ensure_arg_list_uniqueness()
    def drop_highly_missing_vars(
        self,
        include_vars: list[str] | None = None,
        exclude_vars: list[str] | None = None,
        threshold: float = 0.5,
    ) -> "Analyzer":
        """Drops variables (columns) with missingness rate above a specified
        threshold.

        Parameters
        ----------
        include_vars : list[str] | None
            Default: None. If not None, only the specified variables are
            candidates for dropping. Otherwise, all variables are candidates.

        exclude_vars : list[str] | None
            Default: None. If not None, excludes the specified variables from
            the list of candidate variables.

        threshold : float
            Default: 0.5. Proportion of missing values above which a column
            is dropped. For example, if threshold = 0.2, then columns with
            more than 20% missing values are dropped.

        Returns
        -------
        Analyzer
            Returns self for method chaining.
        """
        self._datahandler.drop_highly_missing_vars(
            include_vars, exclude_vars, threshold
        )
        return self

    @ensure_arg_list_uniqueness()
    def onehot(
        self,
        include_vars: list[str] | None = None,
        exclude_vars: list[str] | None = None,
        dropfirst: bool = True,
        keep_original: bool = False,
    ) -> "Analyzer":
        """One-hot encodes the specified variables (columns).

        Parameters
        ----------
        include_vars : list[str] | None
            Default: None. List of variables to one-hot encode.
            If None, one-hot encodes all categorical variables.

        exclude_vars : list[str] | None
            Default: None. List of variables to exclude from one-hot encoding.
            If None, no variables are excluded.

        dropfirst : bool
            Default: True. If True, drops the first one-hot encoded column.

        keep_original : bool
            Default: False.
            If True, keeps the original variables in the DataFrame.

        Returns
        -------
        Analyzer
            Returns self for method chaining.
        """
        self._datahandler.onehot(
            include_vars=include_vars,
            exclude_vars=exclude_vars,
            dropfirst=dropfirst,
            keep_original=keep_original,
        )
        return self

    @ensure_arg_list_uniqueness()
    def select_vars(
        self,
        include_vars: list[str] | None = None,
        exclude_vars: list[str] | None = None,
    ) -> "Analyzer":
        """Selects the specified variables.

        Parameters
        ----------
        include_vars : list[str] | None
            Default: None. List of variables to include.
            If None, includes all variables (in sorted order).

        exclude_vars : list[str] | None
            Default: None. List of variables to exclude.
            If None, no variables are excluded. If given, the remaining
            variables are passed on in sorted order.

        Returns
        -------
        Analyzer
            Returns self for method chaining.
        """
        if include_vars is None:
            include_vars = sorted(self._datahandler.vars())
        if exclude_vars is not None:
            include_vars = sorted(set(include_vars) - set(exclude_vars))
        self._datahandler.select_vars(vars=include_vars)
        return self

    @ensure_arg_list_uniqueness()
    def force_numeric(self, vars: list[str]) -> "Analyzer":
        """Forces specified variables to numeric (float).

        Parameters
        ----------
        vars : list[str]
            Names of the variables to force to numeric.

        Returns
        -------
        Analyzer
            Returns self for method chaining.
        """
        self._datahandler.force_numeric(vars)
        return self

    @ensure_arg_list_uniqueness()
    def force_categorical(self, vars: list[str]) -> "Analyzer":
        """Forces specified variables (columns) to have categorical values.
        That is, the variables' values are converted to strings.

        Parameters
        ----------
        vars : list[str]
            Names of the variables to force to categorical.

        Returns
        -------
        Analyzer
            Returns self for method chaining.
        """
        self._datahandler.force_categorical(vars)
        return self

    @ensure_arg_list_uniqueness()
    def force_binary(
        self,
        var: str,
        pos_label: str | None = None,
        ignore_multiclass: bool = True,
        rename: bool = True,
    ) -> "Analyzer":
        """Forces a variable to be binary (0 and 1 valued numeric variable).

        Parameters
        ----------
        var : str
            Name of the variable to force to binary.

        pos_label : str | None
            Default: None. The positive label.
            If None, the most common class over all data is used as the
            positive class.

        ignore_multiclass : bool
            Default: True. If True, all classes except pos_label are labeled
            with zero. Handling of data with more than two classes when False
            is delegated to DataHandler.force_binary.
            NOTE(review): earlier documentation mentioned both "does nothing"
            and "raises ValueError" for that case -- confirm against
            DataHandler.

        rename : bool
            Default: True.
            If True, the variable is renamed to {var}::{pos_label}.

        Returns
        -------
        Analyzer
            Returns self for method chaining.
        """
        if pos_label is None:
            # value_counts() sorts by frequency, so index[0] is the most
            # common class.
            pos_label = self._datahandler.df_all()[var].value_counts().index[0]

        self._datahandler.force_binary(
            vars=[var],
            pos_labels=[pos_label],
            ignore_multiclass=ignore_multiclass,
            rename=rename,
        )
        return self

    def datahandler(self) -> DataHandler:
        """Returns the DataHandler.

        Returns
        -------
        DataHandler
            The DataHandler object takes care of data management and
            preprocessing.
        """
        return self._datahandler

    def numeric_vars(self) -> list[str]:
        """Returns the numeric variables in the working train DataFrame.

        Returns
        -------
        list[str]
            The numeric variables.
        """
        return self._datahandler.numeric_vars()

    def categorical_vars(self) -> list[str]:
        """Returns the categorical variables in the working train DataFrame.

        Returns
        -------
        list[str]
            The categorical variables.
        """
        return self._datahandler.categorical_vars()

    def vars(self) -> list[str]:
        """Returns the variables in the working train DataFrame.

        Returns
        -------
        list[str]
            The variables.
        """
        return self._datahandler.vars()

    def shape(self, dataset: Literal["train", "test"]) -> tuple[int, int]:
        """Returns the shape of the working train or test DataFrame.

        Parameters
        ----------
        dataset : Literal['train', 'test']
            The dataset to get the shape of.

        Returns
        -------
        tuple[int, int]
            The shape of the selected working DataFrame.

        Raises
        ------
        ValueError
            If dataset is neither 'train' nor 'test'.
        """
        if dataset == "train":
            return self._datahandler._working_df_train.shape
        elif dataset == "test":
            return self._datahandler._working_df_test.shape
        else:
            raise ValueError(f"Invalid input: dataset = {dataset}.")

    def value_counts(
        self,
        var: str,
        dataset: Literal["train", "test", "both", "all"] = "both",
    ) -> pd.Series:
        """Returns the value counts of a variable in the working train
        DataFrame, the working test DataFrame, or both combined.

        Parameters
        ----------
        var : str
            The variable to get the value counts of.

        dataset : Literal['train', 'test', 'both', 'all']
            Default: 'both'. The dataset to get the value counts of.
            'all' is accepted as a synonym of 'both', matching the naming
            used by eda(), causal() and cluster().

        Returns
        -------
        pd.Series
            The value counts of the variable.

        Raises
        ------
        ValueError
            If dataset is not one of the accepted values.
        """
        if dataset == "train":
            return self._datahandler._working_df_train[var].value_counts()
        elif dataset == "test":
            return self._datahandler._working_df_test[var].value_counts()
        elif dataset in ("both", "all"):
            return self._datahandler.df_all()[var].value_counts()
        else:
            raise ValueError(f"Invalid input: dataset = {dataset}.")

    def df_train(self) -> pd.DataFrame:
        """Returns the working train DataFrame.

        Returns
        -------
        pd.DataFrame
            The working train DataFrame.
        """
        return self._datahandler.df_train()

    def df_test(self) -> pd.DataFrame:
        """Returns the working test DataFrame.

        Returns
        -------
        pd.DataFrame
            The working test DataFrame.
        """
        return self._datahandler.df_test()

    def df_all(self) -> pd.DataFrame:
        """Returns the working DataFrame (train and test combined).

        Returns
        -------
        pd.DataFrame
            The working DataFrame.
        """
        return self._datahandler.df_all()

    def __len__(self) -> int:
        """Returns the number of units (rows) in working train DataFrame."""
        return len(self._datahandler)

    def __str__(self) -> str:
        """Returns metadata in string form."""
        return self._datahandler.__str__()

    def _repr_pretty_(self, p, cycle):
        # IPython pretty-printer hook: display the same text as str(self).
        p.text(str(self))