Model | Characteristics | Typical use cases | sklearn class |
---|---|---|---|
Logistic Regression | Linear classifier, outputs class probabilities, supports regularization against overfitting | Binary/multiclass problems with (roughly) linearly separable features | linear_model.LogisticRegression |
Perceptron | Simple linear classifier, no probability output | Linearly separable data | linear_model.Perceptron |
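The "outputs probabilities" distinction in the table follows directly from the model form (standard definitions, not specific to this dataset): logistic regression passes the linear score through a sigmoid,

$$P(y=1 \mid x) = \sigma(w^\top x + b) = \frac{1}{1 + e^{-(w^\top x + b)}},$$

while the perceptron only returns the sign of the linear score,

$$\hat{y} = \operatorname{sign}(w^\top x + b),$$

so it yields hard labels and no calibrated probability. This is why probability-based metrics such as ROC AUC are natural for logistic regression, whereas the perceptron only exposes a signed decision score.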
1. Logistic Regression
# Import the logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats

# Display options
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
pd.set_option('display.max_rows', None)      # show all rows
pd.set_option('display.max_columns', None)   # show all columns
pd.set_option('display.max_colwidth', None)  # no limit on column width
pd.set_option('display.width', None)         # no limit on display width

# Load the data
data = pd.read_excel("股票客户流失.xlsx")

# Data preprocessing
# 4.1 Fill missing values with the column mean
print("Missing value counts:\n", data.isnull().sum())
data = data.apply(lambda col: col.fillna(col.mean()), axis=0)  # assumes all columns are numeric

# 4.2 Handle outliers
numeric_data = data.select_dtypes(include=[np.number])
z_scores = np.abs(stats.zscore(numeric_data))  # compute Z-scores on numeric columns only
threshold = 3                                  # Z-score threshold: 3 standard deviations
outliers = (z_scores > threshold).any(axis=1)  # flag rows containing at least one outlier
print("Row indices of detected outliers:\n", data[outliers].index.tolist())
print(data[outliers])
data = data[~outliers]  # drop the outlier rows

# Split features and target variable
X = data.drop("是否流失", axis=1)
y = data["是否流失"]# 4.3 将数据划分为训练集与测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)# 4.4 创建标准化训练集与测试集
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build the logistic regression model
model = LogisticRegression(max_iter=1000, random_state=42)

# Parameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # inverse of regularization strength
    'penalty': ['l1', 'l2'],       # regularization type
    'solver': ['liblinear']        # solver suited to small datasets; supports both l1 and l2
}

# Grid search
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=KFold(n_splits=5, random_state=42, shuffle=True),
    scoring='accuracy',  # use accuracy as the evaluation metric
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

# Output the best parameter combination
print("Best parameter combination:", grid_search.best_params_)

# Use the best model for prediction
best_model = grid_search.best_estimator_
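# Optional sanity check: best_score_ holds the mean cross-validated accuracy of the best parameter combination
print("Best CV accuracy:", grid_search.best_score_)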
y_pred = best_model.predict(X_test)

# Evaluation metrics on the test set
print("Test set accuracy:", accuracy_score(y_test, y_pred))
print("Test set F1 score:", f1_score(y_test, y_pred))
print("测试集 ROC AUC Score:", roc_auc_score(y_test, y_pred))
print("混淆矩阵:\n", confusion_matrix(y_test, y_pred))
print("分类报告:\n", classification_report(y_test, y_pred))# 交叉验证
cv_scores = cross_val_score(best_model, X_train, y_train, cv=KFold(n_splits=5, random_state=42, shuffle=True), scoring='accuracy')
print(f"5折交叉验证准确率: {np.mean(cv_scores):.4f} (±{np.std(cv_scores):.4f})")
2. Perceptron
# Import the perceptron model
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats

# Display options
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
pd.set_option('display.max_rows', None)      # show all rows
pd.set_option('display.max_columns', None)   # show all columns
pd.set_option('display.max_colwidth', None)  # no limit on column width
pd.set_option('display.width', None)         # no limit on display width

# Load the data
data = pd.read_excel("股票客户流失.xlsx")

# Data preprocessing
# 4.1 Fill missing values with the column mean
print("Missing value counts:\n", data.isnull().sum())
data = data.apply(lambda col: col.fillna(col.mean()), axis=0)  # assumes all columns are numeric

# 4.2 Handle outliers
numeric_data = data.select_dtypes(include=[np.number])
z_scores = np.abs(stats.zscore(numeric_data))  # compute Z-scores on numeric columns only
threshold = 3                                  # Z-score threshold: 3 standard deviations
outliers = (z_scores > threshold).any(axis=1)  # flag rows containing at least one outlier
print("Row indices of detected outliers:\n", data[outliers].index.tolist())
print(data[outliers])
data = data[~outliers]  # drop the outlier rows

# Split features and target variable
X = data.drop("是否流失", axis=1)
y = data["是否流失"]# 4.3 将数据划分为训练集与测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)# 4.4 创建标准化训练集与测试集
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build the perceptron model
model = Perceptron(random_state=42)

# Parameter grid
param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1],         # regularization strength
    'max_iter': [100, 500, 1000],                # maximum number of iterations
    'eta0': [0.01, 0.1, 1.0],                    # learning rate
    'penalty': ['l1', 'l2', 'elasticnet', None]  # regularization type
}

# Grid search
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=KFold(n_splits=5, random_state=42, shuffle=True),
    scoring='accuracy',  # use accuracy as the evaluation metric
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

# Output the best parameter combination
print("Best parameter combination:", grid_search.best_params_)

# Use the best model for prediction
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluation metrics on the test set
print("Test set accuracy:", accuracy_score(y_test, y_pred))
print("Test set F1 score:", f1_score(y_test, y_pred))
print("测试集 ROC AUC Score:", roc_auc_score(y_test, y_pred))
print("混淆矩阵:\n", confusion_matrix(y_test, y_pred))
print("分类报告:\n", classification_report(y_test, y_pred))# 交叉验证
cv_scores = cross_val_score(best_model, X_train, y_train, cv=KFold(n_splits=5, random_state=42, shuffle=True), scoring='accuracy')
print(f"5折交叉验证准确率: {np.mean(cv_scores):.4f} (±{np.std(cv_scores):.4f})")