当前位置：首页 > ops >正文

DAY 19 常见的特征筛选算法

ops 2025/6/23 9:13:42

@浙大疏锦行
1.方差筛选
2.皮尔逊相关系数筛选
3.lasso筛选
4.树模型重要性
5.shap重要性
6.递归特征消除RFE

1.方差筛选

# 输入：特征矩阵 X，方差阈值 threshold
# 输出：筛选后的特征矩阵 X_selected
import numpy as np


def variance_selection(X, threshold):
    """Keep only the features whose variance exceeds ``threshold``.

    Args:
        X: 2-D feature matrix of shape (n_samples, n_features). Anything
            ``np.asarray`` can convert (ndarray, nested lists) is accepted.
        threshold: Variance cut-off. A feature is kept only when its
            variance is strictly greater than this value.

    Returns:
        A 2-D ndarray with the selected columns of ``X`` in their original
        order. It has zero columns when no feature passes the threshold.
    """
    # Coerce once so column indexing below works for list input as well.
    X = np.asarray(X)

    # Population variance (ddof=0) of every column.
    variances = np.var(X, axis=0)

    # A boolean mask keeps the surviving columns in their original order.
    keep_mask = variances > threshold
    return X[:, keep_mask]

皮尔逊相关系数筛选

# 输入：特征矩阵 X，目标变量 y，相关系数阈值 threshold
# 输出：筛选后的特征矩阵 X_selected
import numpy as np


def pearson_correlation_selection(X, y, threshold):
    """Keep the features whose |Pearson r| with ``y`` exceeds ``threshold``.

    Args:
        X: 2-D feature matrix of shape (n_samples, n_features). Anything
            ``np.asarray`` can convert is accepted.
        y: 1-D target vector with one entry per row of ``X``.
        threshold: Cut-off applied to the absolute correlation. A feature is
            kept only when ``abs(r)`` is strictly greater than this value.

    Returns:
        A 2-D ndarray with the selected columns of ``X`` in their original
        order. Features with an undefined correlation (for example a
        constant column, which yields NaN) are never selected.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    # A constant column has zero standard deviation, so corrcoef divides by
    # zero and returns NaN. That is expected here; silence the warning.
    with np.errstate(divide="ignore", invalid="ignore"):
        correlations = np.array(
            [
                np.abs(np.corrcoef(X[:, col], y)[0, 1])
                for col in range(X.shape[1])
            ],
            dtype=float,
        )

    # NaN > threshold evaluates to False, which drops undefined correlations.
    keep_mask = correlations > threshold
    return X[:, keep_mask]

Lasso 筛选

# 输入：特征矩阵 X，目标变量 y，正则化系数 alpha，选择的特征数量 k
# 输出：筛选后的特征矩阵 X_selected
from sklearn.linear_model import Lasso
import numpy as np


def lasso_selection(X, y, alpha, k):
    """Select the ``k`` features with the largest absolute Lasso coefficients.

    Args:
        X: 2-D feature matrix of shape (n_samples, n_features) that supports
            ``X[:, indices]`` column indexing.
        y: Target values passed to ``Lasso.fit``.
        alpha: L1 regularisation strength passed to ``Lasso``.
        k: Number of features to keep. Values larger than the number of
            features keep every feature; ``0`` keeps none.

    Returns:
        The selected columns of ``X``, ordered by ascending absolute
        coefficient (the strongest feature is the last column).

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    # Fit the L1-regularised linear model and read its coefficients.
    lasso = Lasso(alpha=alpha)
    lasso.fit(X, y)
    coefficients = lasso.coef_

    # Rank features by |coefficient|, weakest first.
    ranking = np.argsort(np.abs(coefficients))

    # Use an explicit start index instead of ``ranking[-k:]``: ``[-0:]`` is
    # the full slice, so k == 0 would wrongly return every feature.
    start = max(ranking.size - k, 0)
    top_k_indices = ranking[start:]

    return X[:, top_k_indices]

树模型重要性

# 输入：特征矩阵 X，目标变量 y，选择的特征数量 k
# 输出：筛选后的特征矩阵 X_selected
from sklearn.ensemble import RandomForestClassifier
import numpy as np


def tree_importance_selection(X, y, k, random_state=None):
    """Select the ``k`` features with the highest random-forest importance.

    Args:
        X: 2-D feature matrix of shape (n_samples, n_features) that supports
            ``X[:, indices]`` column indexing.
        y: Class labels passed to ``RandomForestClassifier.fit``.
        k: Number of features to keep. Values larger than the number of
            features keep every feature; ``0`` keeps none.
        random_state: Optional seed forwarded to ``RandomForestClassifier``.
            The default ``None`` leaves the forest unseeded, so the selection
            may vary between runs; pass an int for reproducible results.

    Returns:
        The selected columns of ``X``, ordered by ascending importance
        (the most important feature is the last column).

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    # Fit the forest and read its impurity-based importance scores.
    model = RandomForestClassifier(random_state=random_state)
    model.fit(X, y)
    importances = model.feature_importances_

    # Rank features by importance, weakest first.
    ranking = np.argsort(importances)

    # Use an explicit start index instead of ``ranking[-k:]``: ``[-0:]`` is
    # the full slice, so k == 0 would wrongly return every feature.
    start = max(ranking.size - k, 0)
    top_k_indices = ranking[start:]

    return X[:, top_k_indices]

SHAP 重要性

# 输入：特征矩阵 X，目标变量 y，选择的特征数量 k
# 输出：筛选后的特征矩阵 X_selected
import shap
from sklearn.ensemble import RandomForestClassifier
import numpy as np


def shap_importance_selection(X, y, k, random_state=None):
    """Select the ``k`` features with the largest mean absolute SHAP value.

    Args:
        X: 2-D feature matrix of shape (n_samples, n_features) that supports
            ``X[:, indices]`` column indexing.
        y: Class labels passed to ``RandomForestClassifier.fit``.
        k: Number of features to keep. Values larger than the number of
            features keep every feature; ``0`` keeps none.
        random_state: Optional seed forwarded to ``RandomForestClassifier``.
            The default ``None`` leaves the forest unseeded, so the selection
            may vary between runs; pass an int for reproducible results.

    Returns:
        The selected columns of ``X``, ordered by ascending mean |SHAP|
        (the most important feature is the last column).

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    # Fit the forest that the SHAP explainer will explain.
    model = RandomForestClassifier(random_state=random_state)
    model.fit(X, y)

    # Compute SHAP values for every sample in X.
    explainer = shap.Explainer(model)
    shap_values = explainer(X)

    # Average |SHAP| over samples (axis 0), leaving one score per feature.
    mean_abs_shap = np.abs(shap_values.values).mean(axis=0)

    # NOTE(review): for classifiers, recent shap releases appear to return
    # values shaped (n_samples, n_features, n_classes) - confirm against the
    # installed version. Collapse any trailing axes so the ranking below is
    # 1-D; otherwise argsort and column indexing give a wrongly shaped result.
    if mean_abs_shap.ndim > 1:
        trailing_axes = tuple(range(1, mean_abs_shap.ndim))
        mean_abs_shap = mean_abs_shap.mean(axis=trailing_axes)

    # Rank features by mean |SHAP|, weakest first.
    ranking = np.argsort(mean_abs_shap)

    # Use an explicit start index instead of ``ranking[-k:]``: ``[-0:]`` is
    # the full slice, so k == 0 would wrongly return every feature.
    start = max(ranking.size - k, 0)
    top_k_indices = ranking[start:]

    return X[:, top_k_indices]

递归特征消除（RFE）

# 输入：特征矩阵 X，目标变量 y，模型 estimator，要保留的特征数量 n_features_to_select
# 输出：筛选后的特征矩阵 X_selected
from sklearn.feature_selection import RFE


def rfe_selection(X, y, estimator, n_features_to_select):
    """Select features with recursive feature elimination (RFE).

    Args:
        X: Feature matrix passed to ``RFE.fit`` and ``RFE.transform``.
        y: Target values passed to ``RFE.fit``.
        estimator: Model instance handed to ``RFE``, which uses it to rank
            the features.
        n_features_to_select: Number of features to retain, forwarded to
            ``RFE`` unchanged.

    Returns:
        The result of ``selector.transform(X)``: ``X`` reduced to the
        features the fitted selector retained.
    """
    # Build the selector around the caller-supplied estimator.
    selector = RFE(estimator, n_features_to_select=n_features_to_select)

    # Fit on the data, then project X onto the retained features.
    selector = selector.fit(X, y)
    return selector.transform(X)