6.1. Introduction
When the data exhibit multicollinearity or the number of features is large, ordinary linear regression is prone to overfitting. Ridge regression and LASSO (Least Absolute Shrinkage and Selection Operator) regression are two important regularization methods that effectively address these problems.
6.2. Key Points
- Principles of regularized regression
- Ridge regression
- LASSO regression
- Elastic Net
- Choosing the regularization parameter
- Feature selection and dimensionality reduction
# Import the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Note: load_boston was removed in scikit-learn 1.2; run this notebook with
# scikit-learn < 1.2, or substitute e.g. fetch_california_housing
from sklearn.datasets import make_regression, load_boston
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')
# Font settings (SimHei renders CJK glyphs; unicode_minus fixes the minus sign)
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
print("Libraries imported successfully!")
6.3. Principles of Regularized Regression
6.3.1. Ordinary Linear Regression
The objective function of ordinary linear regression is:
J(\theta) = \frac{1}{2m}\sum_{i=1}^{m}(h_\theta(x^{(i)}) - y^{(i)})^2

6.3.2. Ridge Regression
Ridge regression adds an L2 penalty term to the objective:
J(\theta) = \frac{1}{2m}\sum_{i=1}^{m}(h_\theta(x^{(i)}) - y^{(i)})^2 + \alpha\sum_{j=1}^{n}\theta_j^2
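Unlike LASSO below, ridge has a closed-form solution, θ = (XᵀX + αI)⁻¹Xᵀy. The following is a minimal sketch on synthetic make_regression data (not the lab dataset) that checks the closed form against scikit-learn's Ridge; note that scikit-learn's objective omits the 1/(2m) factor, so its alpha is scaled differently from the α in the formula above.
# Verify ridge's closed form theta = (X^T X + alpha*I)^(-1) X^T y against sklearn
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge

Xd, yd = make_regression(n_samples=100, n_features=5, noise=10.0, random_state=0)
alpha = 1.0
# Center features and target so the intercept can be handled separately,
# mirroring what Ridge(fit_intercept=True) does internally
Xc = Xd - Xd.mean(axis=0)
yc = yd - yd.mean()
theta = np.linalg.solve(Xc.T @ Xc + alpha * np.eye(Xc.shape[1]), Xc.T @ yc)
ridge_check = Ridge(alpha=alpha).fit(Xd, yd)
print(np.allclose(theta, ridge_check.coef_, atol=1e-6))  # True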
6.3.3. LASSO Regression
LASSO regression adds an L1 penalty term to the objective:
J(\theta) = \frac{1}{2m}\sum_{i=1}^{m}(h_\theta(x^{(i)}) - y^{(i)})^2 + \alpha\sum_{j=1}^{n}|\theta_j|
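LASSO has no closed-form solution; solvers such as coordinate descent repeatedly apply the soft-thresholding operator S(z, λ) = sign(z)·max(|z| − λ, 0). A minimal sketch of that operator, which is what drives coefficients exactly to zero (ridge's L2 penalty only shrinks them):
import numpy as np

def soft_threshold(z, lam):
    # S(z, lam) = sign(z) * max(|z| - lam, 0): shrinks z toward zero and
    # returns exactly 0 whenever |z| <= lam
    return np.sign(z) * np.maximum(np.abs(z) - lam, 0.0)

print(soft_threshold(np.array([-2.0, -0.5, 0.0, 0.3, 1.5]), 1.0))
# -> [-1. -0.  0.  0.  0.5]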
6.3.4. Elastic Net
Elastic Net combines the L1 and L2 penalties, with ρ controlling the mix:
J(\theta) = \frac{1}{2m}\sum_{i=1}^{m}(h_\theta(x^{(i)}) - y^{(i)})^2 + \alpha\rho\sum_{j=1}^{n}|\theta_j| + \frac{\alpha(1-\rho)}{2}\sum_{j=1}^{n}\theta_j^2
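In scikit-learn's parametrization, l1_ratio plays the role of ρ: l1_ratio=1 recovers the LASSO objective and l1_ratio=0 a ridge-like one. A quick sanity check of the l1_ratio=1 endpoint on synthetic data:
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import ElasticNet, Lasso

Xe, ye = make_regression(n_samples=200, n_features=10, noise=5.0, random_state=0)
# l1_ratio=1.0 reduces the Elastic Net objective to the LASSO objective,
# so the fitted coefficients should agree up to solver tolerance
enet = ElasticNet(alpha=0.5, l1_ratio=1.0, max_iter=10000).fit(Xe, ye)
lasso_ref = Lasso(alpha=0.5, max_iter=10000).fit(Xe, ye)
print(np.allclose(enet.coef_, lasso_ref.coef_, atol=1e-4))  # True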
6.4. Data Preparation
We use the Boston housing dataset to compare the different regression methods:
# Load the Boston housing dataset (requires scikit-learn < 1.2; see note above)
boston = load_boston()
X = boston.data
y = boston.target
print(f"Dataset shape: {X.shape}")
print(f"Feature names: {boston.feature_names}")
# Build a DataFrame
df = pd.DataFrame(X, columns=boston.feature_names)
df['PRICE'] = y
# Basic information about the dataset
print("\nDataset summary:")
print(df.describe())
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print(f"\nTraining set size: {X_train_scaled.shape}")
print(f"Test set size: {X_test_scaled.shape}")
6.5. Comparing the Regression Methods
Now let's compare the performance of ordinary linear regression, ridge regression, LASSO regression, and Elastic Net:
# Create the regression models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'LASSO': Lasso(alpha=0.1),
    'Elastic Net': ElasticNet(alpha=0.1, l1_ratio=0.5)
}
# Train and evaluate each model
results = {}
for name, model in models.items():
    # Train the model
    model.fit(X_train_scaled, y_train)
    # Predict on the test set
    y_pred = model.predict(X_test_scaled)
    # Evaluation metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    # Cross-validation on the training set
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error')
    cv_rmse = np.sqrt(-cv_scores.mean())
    results[name] = {
        'model': model,
        'mse': mse,
        'rmse': rmse,
        'r2': r2,
        'cv_rmse': cv_rmse,
        'predictions': y_pred
    }
    print(f"{name}:")
    print(f"  RMSE: {rmse:.3f}")
    print(f"  R²: {r2:.3f}")
    print(f"  CV RMSE: {cv_rmse:.3f}")
    print()
# Build a comparison table
comparison_df = pd.DataFrame({
    name: [results[name]['rmse'], results[name]['r2'], results[name]['cv_rmse']]
    for name in results.keys()
}, index=['RMSE', 'R²', 'CV RMSE'])
print("Model performance comparison:")
print(comparison_df.round(3))
# Visualize the predictions
plt.figure(figsize=(15, 10))
# Predicted vs. actual values, one panel per model
for i, (name, result) in enumerate(results.items()):
    plt.subplot(2, 3, i+1)
    plt.scatter(y_test, result['predictions'], alpha=0.6)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
    plt.xlabel('Actual price')
    plt.ylabel('Predicted price')
    plt.title(f'{name}\nR² = {result["r2"]:.3f}')
    plt.grid(True, alpha=0.3)
# Residual analysis
plt.subplot(2, 3, 5)
for name, result in results.items():
    residuals = y_test - result['predictions']
    plt.scatter(result['predictions'], residuals, alpha=0.6, label=name)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted price')
plt.ylabel('Residual')
plt.title('Residual Analysis')
plt.legend()
plt.grid(True, alpha=0.3)
# Overall performance comparison (twin-axis bar chart in the last panel)
ax1 = plt.subplot(2, 3, 6)
ax2 = ax1.twinx()
models_names = list(results.keys())
rmse_values = [results[name]['rmse'] for name in models_names]
r2_values = [results[name]['r2'] for name in models_names]
x = np.arange(len(models_names))
width = 0.35
bars1 = ax1.bar(x - width/2, rmse_values, width, label='RMSE', alpha=0.8)
bars2 = ax2.bar(x + width/2, r2_values, width, label='R²', alpha=0.8, color='orange')
ax1.set_xlabel('Model')
ax1.set_ylabel('RMSE', color='blue')
ax2.set_ylabel('R²', color='orange')
ax1.set_title('Model Performance')
ax1.set_xticks(x)
ax1.set_xticklabels(models_names, rotation=45)
ax1.legend(loc='upper left')
ax2.legend(loc='upper right')
plt.tight_layout()
plt.show()
6.6. Tuning the Regularization Parameter
The choice of the regularization parameter α strongly affects model performance; we use cross-validation to find the optimal value:
# Tune the regularization parameter
from sklearn.model_selection import GridSearchCV
# Define the parameter grids
param_grids = {
    'Ridge': {'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]},
    'Lasso': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0]},
    'ElasticNet': {'alpha': [0.001, 0.01, 0.1, 1.0], 'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]}
}
# Grid search
best_models = {}
for name, param_grid in param_grids.items():
    if name == 'Ridge':
        model = Ridge()
    elif name == 'Lasso':
        model = Lasso(max_iter=10000)
    else:  # ElasticNet
        model = ElasticNet(max_iter=10000)
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train_scaled, y_train)
    best_models[name] = grid_search.best_estimator_
    print(f"{name} best parameters: {grid_search.best_params_}")
    print(f"{name} best CV MSE: {-grid_search.best_score_:.3f}")
    print()
# Evaluate the tuned models on the test set
print("Performance with optimal parameters:")
for name, model in best_models.items():
    y_pred = model.predict(X_test_scaled)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    print(f"{name}: RMSE = {rmse:.3f}, R² = {r2:.3f}")
# Visualize the regularization paths: refit over a range of alphas
alphas = np.logspace(-4, 2, 50)
# Ridge path
ridge_coefs = []
for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train_scaled, y_train)
    ridge_coefs.append(ridge.coef_)
# LASSO path
lasso_coefs = []
for alpha in alphas:
    lasso = Lasso(alpha=alpha, max_iter=10000)
    lasso.fit(X_train_scaled, y_train)
    lasso_coefs.append(lasso.coef_)
# Plot the regularization paths
plt.figure(figsize=(15, 5))
# Ridge path
plt.subplot(1, 2, 1)
ridge_coefs = np.array(ridge_coefs)
for i in range(ridge_coefs.shape[1]):
    plt.plot(alphas, ridge_coefs[:, i], label=boston.feature_names[i])
plt.xscale('log')
plt.xlabel('Regularization parameter α')
plt.ylabel('Coefficient value')
plt.title('Ridge Regularization Path')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, alpha=0.3)
# LASSO path
plt.subplot(1, 2, 2)
lasso_coefs = np.array(lasso_coefs)
for i in range(lasso_coefs.shape[1]):
    plt.plot(alphas, lasso_coefs[:, i], label=boston.feature_names[i])
plt.xscale('log')
plt.xlabel('Regularization parameter α')
plt.ylabel('Coefficient value')
plt.title('LASSO Regularization Path')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
Note how the LASSO paths hit exactly zero as α grows, while the ridge paths only shrink smoothly toward zero.
6.7. Feature Selection Analysis
An important property of LASSO regression is that it performs feature selection. Let's examine which features it keeps:
# Feature selection analysis
lasso_best = best_models['Lasso']
ridge_best = best_models['Ridge']
# Extract the coefficients
lasso_coef = lasso_best.coef_
ridge_coef = ridge_best.coef_
# Build a feature-importance DataFrame
feature_importance = pd.DataFrame({
    'Feature': boston.feature_names,
    'Ridge coef': ridge_coef,
    'LASSO coef': lasso_coef,
    '|Ridge coef|': np.abs(ridge_coef),
    '|LASSO coef|': np.abs(lasso_coef)
})
# Sort by the absolute LASSO coefficient
feature_importance = feature_importance.sort_values('|LASSO coef|', ascending=False)
print("Feature importance comparison:")
print(feature_importance.round(3))
# Count the non-zero coefficients
lasso_nonzero = np.sum(lasso_coef != 0)
ridge_nonzero = np.sum(ridge_coef != 0)
print(f"\nRidge non-zero features: {ridge_nonzero}")
print(f"LASSO non-zero features: {lasso_nonzero}")
# Visualize feature importance
plt.figure(figsize=(15, 6))
# Ridge feature importance
plt.subplot(1, 2, 1)
ridge_importance = feature_importance.sort_values('|Ridge coef|', ascending=True)
plt.barh(range(len(ridge_importance)), ridge_importance['|Ridge coef|'])
plt.yticks(range(len(ridge_importance)), ridge_importance['Feature'])
plt.xlabel('Absolute coefficient')
plt.title('Ridge Feature Importance')
plt.grid(True, alpha=0.3)
# LASSO feature importance
plt.subplot(1, 2, 2)
lasso_importance = feature_importance.sort_values('|LASSO coef|', ascending=True)
plt.barh(range(len(lasso_importance)), lasso_importance['|LASSO coef|'])
plt.yticks(range(len(lasso_importance)), lasso_importance['Feature'])
plt.xlabel('Absolute coefficient')
plt.title('LASSO Feature Importance')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
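To actually drop the zeroed-out features from a dataset, scikit-learn's SelectFromModel can wrap the fitted LASSO. A minimal sketch, reusing lasso_best from the grid search above:
from sklearn.feature_selection import SelectFromModel

# Keep only the features whose fitted LASSO coefficients exceed the threshold
# (prefit=True reuses lasso_best without refitting)
selector = SelectFromModel(lasso_best, prefit=True, threshold=1e-5)
X_train_selected = selector.transform(X_train_scaled)
print(f"Features kept: {X_train_selected.shape[1]} of {X_train_scaled.shape[1]}")
print(f"Selected features: {boston.feature_names[selector.get_support()]}")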
6.8. Summary
In this experiment we covered:
6.8.1. Regularized Regression Methods
- Ridge regression: uses an L2 penalty to curb overfitting; all features are retained
- LASSO regression: uses an L1 penalty and performs feature selection by shrinking the coefficients of unimportant features exactly to zero
- Elastic Net: combines L1 and L2 penalties, balancing feature selection against smooth shrinkage
6.8.2. Key Findings
- Ridge: well suited to multicollinearity; every feature contributes to the prediction
- LASSO: selects features automatically, which suits high-dimensional data
- Elastic Net: combines the strengths of both, trading off sparsity against smoothness
6.8.3. Parameter Tuning
- The choice of the regularization parameter α is critical to model performance
- Grid search with cross-validation is a practical way to find the optimal value
- Regularization path plots help show how α affects the coefficients
6.8.4. When to Use Which
- Ridge: a moderate number of features with multicollinearity present
- LASSO: high-dimensional data where feature selection is needed
- Elastic Net: many features, with a need to balance selection and smooth shrinkage
These regularization methods are powerful tools for difficult regression problems; in practice, choose the method that matches the characteristics of your data.