Source code for tablemage._src.feature_selection.boruta_feature_selection

from typing import Literal
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from xgboost import XGBClassifier, XGBRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import numpy as np

from .base_feature_selection import BaseFSC, BaseFSR
from .BorutaPy import BorutaPy

from ..data.datahandler import DataEmitter
from ..display.print_utils import print_wrapped


class BorutaFSR(BaseFSR):
    """Boruta feature selector backed by a tree-ensemble regressor."""

    def __init__(
        self,
        estimator: Literal["random_forest", "xgboost"] = "random_forest",
        n_estimators: int = 100,
        max_depth: int = 5,
        model_random_state: int = 42,
        n_jobs: int = -1,
        name: str | None = None,
    ):
        """
        Constructs a BorutaFSR.

        Parameters
        ----------
        estimator : Literal["random_forest", "xgboost"]
            Default: "random_forest". Which regressor Boruta wraps. Only
            ``max_depth``, ``random_state`` and ``n_jobs`` are set on it here;
            every other hyperparameter keeps the library default.
        n_estimators : int
            Default: 100. Forwarded to ``BorutaPy`` as its ``n_estimators``.
        max_depth : int
            Default: 5. The maximum depth of the trees in the ensemble.
        model_random_state : int
            Default: 42. Random state given to both the wrapped regressor
            and ``BorutaPy``.
        n_jobs : int
            Default: -1. The number of jobs the wrapped regressor runs in
            parallel.
        name : str | None
            Default: None. If None, the name "BorutaFSR" is used.

        Raises
        ------
        ValueError
            If ``estimator`` is neither "random_forest" nor "xgboost".
        """
        super().__init__("BorutaFSR" if name is None else name)

        # Both supported regressors take the same three keyword arguments.
        shared_kwargs = {
            "max_depth": max_depth,
            "random_state": model_random_state,
            "n_jobs": n_jobs,
        }
        if estimator == "random_forest":
            base_model = RandomForestRegressor(**shared_kwargs)
        elif estimator == "xgboost":
            base_model = XGBRegressor(**shared_kwargs)
        else:
            raise ValueError(
                f"estimator must be one of 'random_forest', "
                f"or 'xgboost'. Got: {estimator}"
            )

        self._selector = BorutaPy(
            estimator=base_model,
            n_estimators=n_estimators,
            random_state=model_random_state,
        )

    def select(
        self, dataemitter: DataEmitter
    ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Fits Boruta on the emitted training data and reports its selection.

        The selector's ``ranking_`` is not returned; it is stored on
        ``self._all_feature_scores``.

        Parameters
        ----------
        dataemitter : DataEmitter
            Source of the training split, read via ``emit_train_Xy()``.

        Returns
        -------
        np.ndarray
            Every input feature name, in training-column order.
        np.ndarray
            The subset of feature names picked out by the support.
        np.ndarray
            The selector's ``support_``, used to index the feature names.
        """
        features_df, target = dataemitter.emit_train_Xy()
        self._all_features = features_df.columns.to_numpy()

        boruta = self._selector
        boruta.fit(X=features_df.to_numpy(), y=target.to_numpy())

        self._support = boruta.support_
        self._all_feature_scores = boruta.ranking_
        self._selected_features = self._all_features[self._support]

        return self._all_features, self._selected_features, self._support
class BorutaFSC(BaseFSC):
    """Boruta feature selector backed by a tree-ensemble classifier."""

    def __init__(
        self,
        estimator: Literal["random_forest", "xgboost"] = "random_forest",
        n_estimators: int = 100,
        max_depth: int = 5,
        model_random_state: int = 42,
        name: str | None = None,
        *,
        n_jobs: int | None = None,
    ):
        """
        Constructs a BorutaFSC.

        Parameters
        ----------
        estimator : Literal["random_forest", "xgboost"]
            Default: "random_forest". Which classifier Boruta wraps. Apart
            from the arguments below (and ``class_weight="balanced"`` for the
            random forest), hyperparameters keep the library defaults.
        n_estimators : int
            Default: 100. Forwarded to ``BorutaPy`` as its ``n_estimators``.
        max_depth : int
            Default: 5. The maximum depth of the trees in the ensemble.
        model_random_state : int
            Default: 42. Random state given to both the wrapped classifier
            and ``BorutaPy``.
        name : str | None
            Default: None. If None, the name "BorutaFSC" is used.
        n_jobs : int | None
            Default: None. Keyword-only. The number of jobs the wrapped
            classifier runs in parallel. None is the default value of this
            argument in both supported classifiers, so omitting it leaves
            their behavior unchanged.

        Raises
        ------
        ValueError
            If ``estimator`` is neither "random_forest" nor "xgboost".
        """
        if name is None:
            name = "BorutaFSC"
        super().__init__(name)

        if estimator == "random_forest":
            sk_estimator = RandomForestClassifier(
                max_depth=max_depth,
                random_state=model_random_state,
                class_weight="balanced",
                n_jobs=n_jobs,
            )
        elif estimator == "xgboost":
            sk_estimator = XGBClassifier(
                max_depth=max_depth,
                random_state=model_random_state,
                n_jobs=n_jobs,
            )
        else:
            raise ValueError(
                f"estimator must be one of 'random_forest', "
                f"or 'xgboost'. Got: {estimator}"
            )

        self._selector = BorutaPy(
            estimator=sk_estimator,
            n_estimators=n_estimators,
            random_state=model_random_state,
        )

    def select(
        self, dataemitter: DataEmitter
    ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Fits Boruta on the emitted training data and reports its selection.

        The target is label-encoded before fitting. The selector's
        ``ranking_`` is not returned; it is stored on
        ``self._all_feature_scores``.

        Parameters
        ----------
        dataemitter : DataEmitter
            Source of the training split, read via ``emit_train_Xy()``.

        Returns
        -------
        np.ndarray
            Every input feature name, in training-column order.
        np.ndarray
            The subset of feature names picked out by the support. May be
            empty, in which case a warning is printed.
        np.ndarray
            The selector's ``support_``, used to index the feature names.
        """
        X_train, y_train = dataemitter.emit_train_Xy()
        y_train = LabelEncoder().fit_transform(y_train)

        self._all_features = X_train.columns.to_numpy()
        self._selector.fit(X=X_train.to_numpy(), y=y_train)

        self._support = self._selector.support_
        self._all_feature_scores = self._selector.ranking_
        self._selected_features = self._all_features[self._support]

        if self._selected_features.size == 0:
            # NOTE(review): the message says Boruta "will vote for all
            # features", but this method returns the empty selection and the
            # support unchanged. Presumably the caller applies that fallback;
            # confirm, or widen the support here.
            print_wrapped(
                "Boruta did not select any features. "
                "Boruta will vote for all features.",
                type="WARNING",
            )

        return self._all_features, self._selected_features, self._support