Source code for tablemage._src.feature_selection.boruta_feature_selection

from typing import Literal
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from xgboost import XGBClassifier, XGBRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import numpy as np

from .base_feature_selection import BaseFSC, BaseFSR
from .BorutaPy import BorutaPy

from ..data.datahandler import DataEmitter
from ..display.print_utils import print_wrapped



[docs]
class BorutaFSR(BaseFSR):
    """Boruta feature selector for regression targets.

    Wraps a ``BorutaPy`` selector around a tree-ensemble regressor
    (scikit-learn random forest or XGBoost).
    """

    def __init__(
        self,
        estimator: Literal["random_forest", "xgboost"] = "random_forest",
        n_estimators: int = 100,
        max_depth: int = 5,
        model_random_state: int = 42,
        n_jobs: int = -1,
        name: str | None = None,
    ):
        """
        Constructs a BorutaFSR.

        Parameters
        ----------
        estimator : Literal["random_forest", "xgboost"]
            Default: "random_forest". The estimator to use for Boruta. Only
            max_depth, the random state and n_jobs are set explicitly on the
            estimator; every other hyperparameter keeps the library default.

        n_estimators : int
            Default: 100. The number of estimators to use for Boruta. This
            value is handed to the Boruta selector, not to the estimator
            constructor.

        max_depth : int
            Default: 5. The maximum depth of the trees in the ensemble.

        model_random_state : int
            Default: 42. The random state to use for both the estimator and
            the Boruta selector.

        n_jobs : int
            Default: -1. The number of jobs to run in parallel.

        name : str | None
            Default: None. If None, then the default name "BorutaFSR" is used.

        Raises
        ------
        ValueError
            If estimator is neither "random_forest" nor "xgboost".
        """
        super().__init__("BorutaFSR" if name is None else name)

        if estimator == "random_forest":
            sk_estimator = RandomForestRegressor(
                max_depth=max_depth,
                random_state=model_random_state,
                n_jobs=n_jobs,
            )
        elif estimator == "xgboost":
            sk_estimator = XGBRegressor(
                max_depth=max_depth,
                random_state=model_random_state,
                n_jobs=n_jobs,
            )
        else:
            raise ValueError(
                f"estimator must be one of 'random_forest', "
                f"or 'xgboost'. Got: {estimator}"
            )

        self._selector = BorutaPy(
            estimator=sk_estimator,
            n_estimators=n_estimators,
            random_state=model_random_state,
        )

    def select(
        self, dataemitter: DataEmitter
    ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Fits Boruta on the training data and reports which features
        it selected.

        Parameters
        ----------
        dataemitter : DataEmitter
            Source of the training data; its emit_train_Xy() output is used.

        Returns
        -------
        np.ndarray ~ (n_in_features)
            All input feature names, in training-column order.

        np.ndarray ~ (n_selected_features)
            The names of the selected features.

        np.ndarray ~ (n_in_features)
            The support of the selected features, i.e. the fitted
            selector's ``support_``, aligned with the first array.
        """
        X_train, y_train = dataemitter.emit_train_Xy()
        self._all_features = X_train.columns.to_numpy()

        self._selector.fit(X=X_train.to_numpy(), y=y_train.to_numpy())
        self._support = self._selector.support_
        # The Boruta ranking is what gets stored as the per-feature "score".
        self._all_feature_scores = self._selector.ranking_
        self._selected_features = self._all_features[self._support]

        # Same empty-selection warning (and wording) as BorutaFSC.select.
        # NOTE(review): the support mask is returned unchanged here; the
        # fallback described in the message is not implemented in this
        # method -- confirm that the caller handles it.
        if self._selected_features.size == 0:
            print_wrapped(
                "Boruta did not select any features. "
                "Boruta will vote for all features.",
                type="WARNING",
            )

        return self._all_features, self._selected_features, self._support




[docs]
class BorutaFSC(BaseFSC):
    """Boruta feature selector for classification targets.

    Wraps a ``BorutaPy`` selector around a tree-ensemble classifier
    (scikit-learn random forest or XGBoost).
    """

    def __init__(
        self,
        estimator: Literal["random_forest", "xgboost"] = "random_forest",
        n_estimators: int = 100,
        max_depth: int = 5,
        model_random_state: int = 42,
        name: str | None = None,
        n_jobs: int | None = None,
    ):
        """
        Constructs a BorutaFSC.

        Parameters
        ----------
        estimator : Literal["random_forest", "xgboost"]
            Default: "random_forest". The estimator to use for Boruta. Only
            max_depth, the random state and n_jobs are set explicitly on the
            estimator (plus class_weight="balanced" for the random forest);
            every other hyperparameter keeps the library default.

        n_estimators : int
            Default: 100. The number of estimators to use for Boruta's
            estimator. This value is handed to the Boruta selector, not to
            the estimator constructor.

        max_depth : int
            Default: 5. The maximum depth of the trees in the ensemble.

        model_random_state : int
            Default: 42. The random state to use for both the estimator and
            the Boruta selector.

        name : str | None
            Default: None. If None, then the default name "BorutaFSC" is used.

        n_jobs : int | None
            Default: None. The number of jobs to run in parallel. None
            leaves the estimator's own default in place; pass -1 to match
            the BorutaFSR default. Placed last so that existing positional
            calls keep their meaning.

        Raises
        ------
        ValueError
            If estimator is neither "random_forest" nor "xgboost".
        """
        super().__init__("BorutaFSC" if name is None else name)

        if estimator == "random_forest":
            sk_estimator = RandomForestClassifier(
                max_depth=max_depth,
                random_state=model_random_state,
                class_weight="balanced",
                n_jobs=n_jobs,
            )
        elif estimator == "xgboost":
            sk_estimator = XGBClassifier(
                max_depth=max_depth,
                random_state=model_random_state,
                n_jobs=n_jobs,
            )
        else:
            raise ValueError(
                f"estimator must be one of 'random_forest', "
                f"or 'xgboost'. Got: {estimator}"
            )

        self._selector = BorutaPy(
            estimator=sk_estimator,
            n_estimators=n_estimators,
            random_state=model_random_state,
        )

    def select(
        self, dataemitter: DataEmitter
    ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Fits Boruta on the training data and reports which features
        it selected.

        Parameters
        ----------
        dataemitter : DataEmitter
            Source of the training data; its emit_train_Xy() output is used.

        Returns
        -------
        np.ndarray ~ (n_in_features)
            All input feature names, in training-column order.

        np.ndarray ~ (n_selected_features)
            The names of the selected features.

        np.ndarray ~ (n_in_features)
            The support of the selected features, i.e. the fitted
            selector's ``support_``, aligned with the first array.
        """
        X_train, y_train = dataemitter.emit_train_Xy()
        # Class labels are converted to integer codes before fitting.
        y_train = LabelEncoder().fit_transform(y_train)
        self._all_features = X_train.columns.to_numpy()

        self._selector.fit(X=X_train.to_numpy(), y=y_train)
        self._support = self._selector.support_
        # The Boruta ranking is what gets stored as the per-feature "score".
        self._all_feature_scores = self._selector.ranking_
        self._selected_features = self._all_features[self._support]

        # NOTE(review): the support mask is returned unchanged here; the
        # fallback described in the message is not implemented in this
        # method -- confirm that the caller handles it.
        if self._selected_features.size == 0:
            print_wrapped(
                "Boruta did not select any features. "
                "Boruta will vote for all features.",
                type="WARNING",
            )

        return self._all_features, self._selected_features, self._support