Source code for tablemage._src.feature_selection.classification_feature_selection

from sklearn.feature_selection import (
    SelectKBest,
    SelectFromModel,
    f_classif,
    mutual_info_classif,
    chi2,
)
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from typing import Literal
import numpy as np

from ..data.datahandler import DataEmitter
from .base_feature_selection import BaseFSC


class KBestFSC(BaseFSC):
    """Selects the k best features by a univariate classification score.

    The score is computed with one of scikit-learn's ``f_classif``,
    ``mutual_info_classif`` or ``chi2`` functions, and the top-k ranking is
    delegated to ``SelectKBest``.
    """

    def __init__(
        self,
        scorer: Literal["f_classif", "mutual_info_classif", "chi2"],
        k: int,
        name: str | None = None,
    ):
        """Initializes a KBestFSC object.

        Parameters
        ----------
        scorer : Literal['f_classif', 'mutual_info_classif', 'chi2']
            Name of the scoring function. The name is only resolved when
            ``select`` is called, so an unsupported value is rejected there
            rather than here.

        k : int
            Number of desired features, < n_predictors.

        name : str | None
            Default: None. If None, then the name defaults to
            ``f"KBestFSC({scorer})"``.
        """
        if name is None:
            name = f"KBestFSC({scorer})"
        super().__init__(name)
        self._scorer = scorer
        self._k = k

    def select(
        self, dataemitter: DataEmitter
    ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Selects the top k features based on the training data.

        Besides returning the result, this stores the fitted selector's
        scores on the instance (``_all_feature_scores`` and
        ``_selected_feature_scores``) along with ``_all_features``,
        ``_selected_features`` and ``_support``.

        Parameters
        ----------
        dataemitter : DataEmitter
            Source of the training data, obtained via ``emit_train_Xy()``.

        Returns
        -------
        np.ndarray ~ (n_in_features)
            All features (variable names).

        np.ndarray ~ (n_out_features)
            Selected features.

        np.ndarray ~ (n_in_features)
            Boolean mask, the support for selected features.

        Raises
        ------
        ValueError
            If the scorer name given at construction is not one of
            'f_classif', 'mutual_info_classif' or 'chi2'.
        """
        score_funcs = {
            "f_classif": f_classif,
            "mutual_info_classif": mutual_info_classif,
            "chi2": chi2,
        }
        try:
            score_func = score_funcs[self._scorer]
        except (KeyError, TypeError):
            # TypeError covers unhashable values, which the original
            # equality chain also rejected with ValueError.
            raise ValueError(f"Invalid scorer: {self._scorer}") from None

        selector = SelectKBest(score_func, k=self._k)
        X_train, y_train = dataemitter.emit_train_Xy()

        # Return an ndarray, as the signature declares and as
        # LassoFSC.select does, instead of a plain list.
        self._all_features = X_train.columns.to_numpy()

        selector.fit(X=X_train, y=y_train)
        self._selected_features = selector.get_feature_names_out()
        self._all_feature_scores = selector.scores_
        self._support = selector.get_support()
        self._selected_feature_scores = selector.scores_[self._support]

        return self._all_features, self._selected_features, self._support
class LassoFSC(BaseFSC):
    """Selects at most ``max_n_features`` features using the coefficients of
    an L1-penalized logistic regression (model-inherent feature selection)."""

    def __init__(
        self,
        max_n_features: int,
        c: float | None = None,
        name: str | None = None,
    ):
        """Constructs a LassoFSC.

        Parameters
        ----------
        max_n_features : int
            Number of desired features, < n_predictors.

        c : float | None
            Default: None. Inverse of regularization strength. If None, a
            ``LogisticRegressionCV`` with five-fold cross validation is used
            to choose it; with scikit-learn's default ``Cs`` that is a grid
            of 10 candidate values on a log scale from 1e-4 to 1e4.

        name : str | None
            Default: None. If None, then the name is set to "LassoFSC".
        """
        super().__init__("LassoFSC" if name is None else name)

        # Both estimator variants share the same L1 / liblinear configuration.
        l1_options = {"penalty": "l1", "solver": "liblinear"}
        if c is not None:
            self._model = LogisticRegression(C=c, **l1_options)
        else:
            self._model = LogisticRegressionCV(cv=5, **l1_options)

        self._max_n_features = max_n_features

    def select(
        self, dataemitter: DataEmitter
    ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Selects (at most) the top max_n_features features based on the
        training data.

        The L1-penalized model is fitted on the training arrays and then
        handed to ``SelectFromModel`` as a prefit estimator capped at
        ``max_n_features``. The method stores ``_all_features``,
        ``_selected_features``, ``_support`` and ``_all_feature_scores``
        (the estimator's ``coef_``) on the instance.

        NOTE(review): unlike KBestFSC.select, this method does not set
        ``_selected_feature_scores`` -- confirm whether BaseFSC relies on it.

        Parameters
        ----------
        dataemitter : DataEmitter
            Source of the training data, obtained via ``emit_train_Xy()``.

        Returns
        -------
        np.ndarray ~ (n_in_features)
            All features (variable names).

        np.ndarray ~ (n_out_features)
            Selected features.

        np.ndarray ~ (n_in_features)
            Boolean mask, the support for selected features.
        """
        train_X, train_y = dataemitter.emit_train_Xy()
        self._all_features = train_X.columns.to_numpy()

        # The model itself is trained on bare arrays.
        self._model.fit(X=train_X.to_numpy(), y=train_y.to_numpy())

        from_model = SelectFromModel(
            estimator=self._model,
            prefit=True,
            max_features=self._max_n_features,
        )
        # Presumably this fit on the original X/y objects exists so that
        # get_feature_names_out() can report the column names -- confirm.
        from_model.fit(X=train_X, y=train_y)

        self._support = from_model.get_support()
        self._selected_features = from_model.get_feature_names_out()
        self._all_feature_scores = from_model.estimator_.coef_

        return self._all_features, self._selected_features, self._support