Source code for tablemage._src.feature_selection.voteselect_report

import pandas as pd
from . import BaseFS
from ..data.datahandler import DataEmitter
from ..display.print_utils import (
    print_wrapped,
    quote_and_color,
    color_text,
    bold_text,
    fill_ignore_format,
    list_to_string,
)
from ..display.print_options import print_options


class VotingSelectionReport:
    """Class for generating feature selection-relevant tables.

    Every supplied selector is fitted on the same DataEmitter. Each selector's
    support is recorded as one row of a votes table, the votes are summed per
    feature, and the winning features are stored as the "top features".
    """

    def __init__(
        self,
        selectors: list[BaseFS],
        dataemitter: DataEmitter,
        max_n_features: int | None = None,
        verbose: bool = True,
    ):
        """Initializes a VotingSelectionReport object.
        VotingSelectionReport selects features via voting selection.

        Parameters
        ----------
        selectors : list[BaseFS]
            Each selector decides on a maximum of max_n_features.
            Must contain at least one selector.

        dataemitter : DataEmitter
            The DataEmitter object that contains the data.

        max_n_features : int | None
            Default: None.
            Number of desired features. 0 < max_n_features < n_predictors.
            If None, then all features with at least 50% support are selected.

        verbose : bool
            Default: True. If True, prints progress.

        Raises
        ------
        ValueError
            If no selectors are given, if max_n_features is not positive,
            or if the emitted training data has no rows.
        """
        # Without at least one selector there is no feature list and no votes
        # to tally; fail with a clear message instead of an unbound-name error.
        if len(selectors) == 0:
            raise ValueError("At least one feature selector must be provided.")

        # A non-positive cap would silently slice from the wrong end
        # (e.g. [:-1] drops the least-voted feature), so reject it before
        # any selector is fitted.
        if max_n_features is not None and max_n_features <= 0:
            raise ValueError("max_n_features must be a positive integer or None.")

        self._selector_to_support = {}
        self._emitter = dataemitter
        self._y_var = self._emitter.y_var()
        self._predictors = self._emitter.X_vars()

        X_train_df = self._emitter.emit_train_X(verbose=False)
        if len(X_train_df) == 0:
            raise ValueError(
                "No data was emitted. All rows with missing values were dropped. "
                + "This may have resulted in an empty dataset. "
                + "Please consider removing highly missing variables "
                + "or imputing missing values."
            )

        self._selectors = selectors
        for selector in selectors:
            if verbose:
                print_wrapped(f"Fitting {quote_and_color(selector)}.", type="PROGRESS")
            features, _, support = selector.select(self._emitter)
            self._selector_to_support[str(selector)] = support
            self._all_features = features

        # One row per selector, one column per feature. The column labels come
        # from the last selector fitted.
        # NOTE(review): assumes every selector reports the same features in the
        # same order -- confirm against the BaseFS.select contract.
        self._votes_df = pd.DataFrame.from_dict(
            self._selector_to_support, orient="index", columns=features
        )
        # Number of selectors supporting each feature.
        self._vote_counts_series = self._votes_df.sum(axis=0)

        self._selector_dict_indexable_by_str = {
            str(selector): selector for selector in selectors
        }

        if max_n_features is not None:
            # Keep the max_n_features most-voted features. sort_values is called
            # without `kind`, so the order among equally-voted features is
            # whatever pandas' default sort yields.
            self._top_features = self._vote_counts_series.sort_values(
                ascending=False
            ).index.to_list()[:max_n_features]
        else:
            # Keep every feature supported by at least half of the selectors.
            self._top_features = self._vote_counts_series[
                self._vote_counts_series >= len(selectors) / 2
            ].index.to_list()

    def top_features(self) -> list:
        """Returns a list of top features determined by the voting selectors.

        Returns
        -------
        list
            Top features.
        """
        return self._top_features

    def all_features(self) -> list:
        """Returns a list of all features considered by the voting selectors.

        Returns
        -------
        list
            All features.
        """
        return self._all_features

    def votes(self) -> pd.DataFrame:
        """Returns a DataFrame that describes the distribution of
        votes among selectors.

        Returns
        -------
        pd.DataFrame
            Votes DataFrame, transposed so that rows are features and
            columns are selectors.
        """
        return self._votes_df.T

    def _emit_train_X(
        self, dropfirst: bool = True, verbose: bool = True
    ) -> pd.DataFrame:
        """Returns the training DataFrame with only the top features.

        Takes the first element of the emitter's ``emit_train_Xy`` result and
        restricts it to the selected feature columns.
        """
        return self._emitter.emit_train_Xy(dropfirst=dropfirst, verbose=verbose)[0][
            self._top_features
        ]

    def _emit_test_X(
        self, dropfirst: bool = True, verbose: bool = True
    ) -> pd.DataFrame:
        """Returns the test DataFrame with only the top features.

        Takes the first element of the emitter's ``emit_test_Xy`` result and
        restricts it to the selected feature columns.
        """
        return self._emitter.emit_test_Xy(dropfirst=dropfirst, verbose=verbose)[0][
            self._top_features
        ]

    def __getitem__(self, index: str) -> BaseFS:
        """Returns the selector whose ``str()`` representation equals index.

        Raises
        ------
        KeyError
            If no selector with that string representation was supplied.
        """
        return self._selector_dict_indexable_by_str[index]

    def __str__(self) -> str:
        """Returns the formatted, human-readable voting selection report."""
        max_width = print_options._max_line_width

        top_divider = color_text("=" * max_width, "none") + "\n"
        bottom_divider = "\n" + color_text("=" * max_width, "none")
        divider = "\n" + color_text("-" * max_width, "none") + "\n"
        divider_invisible = "\n" + " " * max_width + "\n"

        title_message = bold_text("Voting Selection Report")

        target_var = "'" + self._y_var + "'"
        target_message = f"{bold_text('Target variable:')}\n"
        target_message += fill_ignore_format(
            color_text(target_var, "purple"),
            width=max_width,
            initial_indent=2,
            subsequent_indent=2,
        )

        predictors_message = f"{bold_text('Candidate predictor variables:')}\n"
        predictors_message += fill_ignore_format(
            list_to_string(self._predictors),
            width=max_width,
            initial_indent=2,
            subsequent_indent=2,
        )

        models_str = list_to_string(
            [model._name for model in self._selectors],
            color="blue",
        )
        models_message = f"{bold_text('Feature selectors:')}\n"
        models_message += fill_ignore_format(
            models_str,
            width=max_width,
            initial_indent=2,
            subsequent_indent=2,
        )

        selected_features_message = f"{bold_text('Selected features:')}\n"
        selected_features_message += fill_ignore_format(
            list_to_string(self._top_features, color="purple"),
            width=max_width,
            initial_indent=2,
            subsequent_indent=2,
        )

        final_message = (
            top_divider
            + title_message
            + divider
            + target_message
            + divider_invisible
            + predictors_message
            + divider_invisible
            + models_message
            + divider
            + selected_features_message
            + bottom_divider
        )

        return final_message

    def _to_dict(self) -> dict:
        """Returns the object as a dictionary."""
        return {
            "target": self._y_var,
            "candidate_predictors": self._predictors,
            "feature_selectors": [str(selector) for selector in self._selectors],
            "selected_features": self._top_features,
        }

    def _repr_pretty_(self, p, cycle):
        """Pretty-printer hook: writes the ``str()`` report to printer ``p``."""
        p.text(str(self))