9.1. 介绍¶
本实验将综合运用前面学到的各种回归方法,包括线性回归、多项式回归、岭回归、LASSO回归等,通过一个完整的项目来展示回归分析的全流程。我们将使用真实的房价数据集,从数据探索、特征工程、模型选择到最终预测,全面展示回归分析的实际应用。
9.2. 知识点¶
- 数据探索性分析(EDA)
- 特征工程与数据预处理
- 多种回归方法对比
- 模型选择与调优
- 交叉验证与模型评估
- 结果可视化与解释
# Import the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import SelectKBest, f_regression
import warnings
warnings.filterwarnings('ignore')
# Configure a CJK-capable font so Chinese plot labels render correctly,
# and keep the minus sign displayable with that font
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
print("库导入成功!")
9.3. 数据加载与探索¶
我们使用加州房价数据集,这是一个经典的回归问题数据集:
# Load the California housing dataset (regression benchmark)
housing = fetch_california_housing()
X = housing.data
y = housing.target
print(f"数据集形状: {X.shape}")
print(f"特征名称: {housing.feature_names}")
print(f"目标变量范围: {y.min():.2f} - {y.max():.2f}")
# Combine features and target into one DataFrame for exploration
df = pd.DataFrame(X, columns=housing.feature_names)
df['房价'] = y
# NOTE(review): "\\n" inside these strings prints a literal backslash-n rather
# than a newline — looks like an escaping artifact of the notebook export;
# confirm against the original notebook before "fixing".
print("\\n数据集基本信息:")
print(df.describe())
# Check for missing values per column
print("\\n缺失值检查:")
print(df.isnull().sum())
# Visualize the distributions of the target and all features (3x3 grid)
plt.figure(figsize=(15, 12))
# Panel 1: distribution of the target (house price)
plt.subplot(3, 3, 1)
plt.hist(y, bins=50, alpha=0.7, color='skyblue', edgecolor='black')
plt.title('房价分布')
plt.xlabel('房价 (十万美元)')
plt.ylabel('频次')
# Panels 2-9: one histogram per feature (the dataset has 8 features)
for i, feature in enumerate(housing.feature_names):
    plt.subplot(3, 3, i+2)
    plt.hist(X[:, i], bins=30, alpha=0.7, color='lightgreen', edgecolor='black')
    plt.title(f'{feature}分布')
    plt.xlabel(feature)
    plt.ylabel('频次')
plt.tight_layout()
plt.show()
# Relationship between each feature and the target, plus a correlation heatmap.
# Fix: the original used a 2x4 grid, so the 8th feature's scatter plot was
# drawn into panel 8 and then immediately overwritten by the heatmap placed in
# the same slot. A 3x3 grid gives all 8 scatter plots their own panel and puts
# the heatmap in panel 9.
plt.figure(figsize=(15, 10))
# Scatter plot of every feature against the house price
for i, feature in enumerate(housing.feature_names):
    plt.subplot(3, 3, i + 1)
    plt.scatter(X[:, i], y, alpha=0.5, s=1)
    plt.xlabel(feature)
    plt.ylabel('房价')
    plt.title(f'{feature} vs 房价')
# Correlation heatmap of all columns (features + target) in the last panel
plt.subplot(3, 3, 9)
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f')
plt.title('特征相关性热力图')
plt.tight_layout()
plt.show()
# Correlation of each feature with the target, strongest magnitude first
correlations = df.corr()['房价'].drop('房价').sort_values(key=abs, ascending=False)
print("\\n特征与房价的相关性(按绝对值排序):")
print(correlations)
9.4. 数据预处理¶
在进行建模之前,我们需要对数据进行预处理:
# Data preprocessing
# 1. Train/test split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"训练集大小: {X_train.shape}")
print(f"测试集大小: {X_test.shape}")
# 2. Standardize features: fit on the training split only, then apply the
#    same transform to the test split to avoid data leakage
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("\\n标准化前后对比(训练集前5个样本):")
print("原始数据:")
print(X_train[:5])
print("\\n标准化后:")
print(X_train_scaled[:5])
# 3. Univariate feature selection: keep the 6 features with the highest
#    F-statistic against the target
selector = SelectKBest(score_func=f_regression, k=6)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)
# Map the selected column indices back to feature names
selected_features = [housing.feature_names[i] for i in selector.get_support(indices=True)]
print(f"\\n选择的特征: {selected_features}")
# 4. Degree-2 polynomial expansion of the selected features (no bias column;
#    the linear models fit their own intercept)
poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly_features.fit_transform(X_train_selected)
X_test_poly = poly_features.transform(X_test_selected)
print(f"\\n多项式特征数量: {X_train_poly.shape[1]}")
# Report the shape of each preprocessed variant
print("\\n预处理后的数据形状:")
print(f"训练集: {X_train_scaled.shape}")
print(f"训练集(特征选择后): {X_train_selected.shape}")
print(f"训练集(多项式特征): {X_train_poly.shape}")
9.5. 多种回归方法对比¶
现在我们将使用不同的回归方法来建模,并比较它们的性能:
# Candidate regression models (fixed regularization strengths; tuned later)
models = {
    '线性回归': LinearRegression(),
    '岭回归': Ridge(alpha=1.0),
    'LASSO回归': Lasso(alpha=0.1),
    '弹性网络': ElasticNet(alpha=0.1, l1_ratio=0.5)
}
# Candidate feature sets, each mapped to its (train matrix, test matrix) pair
feature_sets = {
    '原始特征': (X_train_scaled, X_test_scaled),
    '选择特征': (X_train_selected, X_test_selected),
    '多项式特征': (X_train_poly, X_test_poly)
}
# Regression evaluation metrics
def calculate_metrics(y_true, y_pred):
    """Compute standard regression metrics for a set of predictions.

    Implemented directly with NumPy (numerically identical to sklearn's
    mean_squared_error / mean_absolute_error / r2_score for ordinary 1-D
    inputs). Improvements over the original: accepts any array-like (lists
    included) and explicitly guards the R² division against a zero-variance
    target instead of relying on sklearn's warning path.

    Parameters:
        y_true: array-like of true target values
        y_pred: array-like of predicted values, same length as y_true

    Returns:
        dict with keys 'MSE', 'RMSE', 'MAE', 'R²' (all floats)
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    errors = y_true - y_pred
    mse = float(np.mean(errors ** 2))
    rmse = float(np.sqrt(mse))
    mae = float(np.mean(np.abs(errors)))
    ss_res = float(np.sum(errors ** 2))
    ss_tot = float(np.sum((y_true - y_true.mean()) ** 2))
    if ss_tot > 0.0:
        r2 = 1.0 - ss_res / ss_tot
    else:
        # Constant target: R² is undefined; mirror sklearn's convention of
        # 1.0 for a perfect fit and 0.0 otherwise.
        r2 = 1.0 if ss_res == 0.0 else 0.0
    return {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R²': r2
    }
# Train and evaluate every (feature set, model) combination
results = {}
for feature_name, (X_train_f, X_test_f) in feature_sets.items():
    results[feature_name] = {}
    for model_name, model in models.items():
        # Fit on the training split of this feature set
        model.fit(X_train_f, y_train)
        # Predict on both splits so train/test metrics can be compared
        y_pred_train = model.predict(X_train_f)
        y_pred_test = model.predict(X_test_f)
        # Compute the metric dictionaries
        train_metrics = calculate_metrics(y_train, y_pred_train)
        test_metrics = calculate_metrics(y_test, y_pred_test)
        # 5-fold cross-validated RMSE (sklearn returns negated MSE scores)
        cv_scores = cross_val_score(model, X_train_f, y_train, cv=5, scoring='neg_mean_squared_error')
        cv_rmse = np.sqrt(-cv_scores.mean())
        # NOTE(review): 'model' stores a reference to the shared estimator
        # from `models`, which is refit on every feature set — after the loop
        # all stored references reflect the LAST feature set's fit. Only the
        # cached 'y_pred_test' is used downstream, so this is latent, but a
        # clone per combination would be safer.
        results[feature_name][model_name] = {
            'model': model,
            'train_metrics': train_metrics,
            'test_metrics': test_metrics,
            'cv_rmse': cv_rmse,
            'y_pred_test': y_pred_test
        }
        print(f"{feature_name} + {model_name}:")
        print(f" 测试集 RMSE: {test_metrics['RMSE']:.3f}")
        print(f" 测试集 R²: {test_metrics['R²']:.3f}")
        print(f" 交叉验证 RMSE: {cv_rmse:.3f}")
        print()
# Build a flat comparison table of test metrics for every combination
comparison_data = []
for feature_name in feature_sets.keys():
    for model_name in models.keys():
        result = results[feature_name][model_name]
        comparison_data.append({
            '特征集': feature_name,
            '模型': model_name,
            'RMSE': result['test_metrics']['RMSE'],
            'R²': result['test_metrics']['R²'],
            'CV_RMSE': result['cv_rmse']
        })
comparison_df = pd.DataFrame(comparison_data)
print("模型性能对比表:")
print(comparison_df.round(3))
# Pick the combination with the lowest cross-validated RMSE.
# idxmin() returns an index label; using it with iloc works here because
# comparison_df was just built with a default RangeIndex.
best_idx = comparison_df['CV_RMSE'].idxmin()
best_result = comparison_df.iloc[best_idx]
print(f"\\n最佳模型组合:")
print(f"特征集: {best_result['特征集']}")
print(f"模型: {best_result['模型']}")
print(f"交叉验证 RMSE: {best_result['CV_RMSE']:.3f}")
print(f"测试集 R²: {best_result['R²']:.3f}")
# Visualize the model performance comparison (8-panel figure)
plt.figure(figsize=(20, 12))
# 1. Test RMSE heatmap (feature set x model)
plt.subplot(2, 4, 1)
pivot_rmse = comparison_df.pivot(index='特征集', columns='模型', values='RMSE')
sns.heatmap(pivot_rmse, annot=True, cmap='YlOrRd', fmt='.3f')
plt.title('RMSE对比')
# 2. Test R² heatmap
plt.subplot(2, 4, 2)
pivot_r2 = comparison_df.pivot(index='特征集', columns='模型', values='R²')
sns.heatmap(pivot_r2, annot=True, cmap='YlGnBu', fmt='.3f')
plt.title('R²对比')
# 3. Cross-validated RMSE heatmap
plt.subplot(2, 4, 3)
pivot_cv = comparison_df.pivot(index='特征集', columns='模型', values='CV_RMSE')
sns.heatmap(pivot_cv, annot=True, cmap='RdYlBu_r', fmt='.3f')
plt.title('交叉验证RMSE对比')
# 4. Average CV RMSE per model (lower is better), sorted ascending
plt.subplot(2, 4, 4)
model_avg_rmse = comparison_df.groupby('模型')['CV_RMSE'].mean().sort_values()
plt.bar(range(len(model_avg_rmse)), model_avg_rmse.values, color='skyblue')
plt.xticks(range(len(model_avg_rmse)), model_avg_rmse.index, rotation=45)
plt.title('模型平均性能')
plt.ylabel('平均CV RMSE')
# 5. Average CV RMSE per feature set
plt.subplot(2, 4, 5)
feature_avg_rmse = comparison_df.groupby('特征集')['CV_RMSE'].mean().sort_values()
plt.bar(range(len(feature_avg_rmse)), feature_avg_rmse.values, color='lightgreen')
plt.xticks(range(len(feature_avg_rmse)), feature_avg_rmse.index, rotation=45)
plt.title('特征集平均性能')
plt.ylabel('平均CV RMSE')
# 6. Predicted vs actual for the best combination; the red dashed diagonal
#    marks a perfect prediction
best_feature = best_result['特征集']
best_model = best_result['模型']
best_pred = results[best_feature][best_model]['y_pred_test']
plt.subplot(2, 4, 6)
plt.scatter(y_test, best_pred, alpha=0.5, s=10)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('实际房价')
plt.ylabel('预测房价')
plt.title(f'最佳模型预测结果\\n{best_feature} + {best_model}')
# 7. Residuals vs predicted values (ideally scattered evenly around zero)
plt.subplot(2, 4, 7)
residuals = y_test - best_pred
plt.scatter(best_pred, residuals, alpha=0.5, s=10)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('预测房价')
plt.ylabel('残差')
plt.title('残差分析')
# 8. Histogram of the residuals
plt.subplot(2, 4, 8)
plt.hist(residuals, bins=30, alpha=0.7, color='orange', edgecolor='black')
plt.xlabel('残差')
plt.ylabel('频次')
plt.title('残差分布')
plt.tight_layout()
plt.show()
9.6. 模型调优¶
现在我们对最佳模型进行超参数调优:
# Hyperparameter tuning
# Pick the train/test matrices matching the best feature set found above
if best_feature == '原始特征':
    X_train_opt, X_test_opt = X_train_scaled, X_test_scaled
elif best_feature == '选择特征':
    X_train_opt, X_test_opt = X_train_selected, X_test_selected
else:  # polynomial features
    X_train_opt, X_test_opt = X_train_poly, X_test_poly
# Hyperparameter grids for each regularized model (plain linear regression
# has nothing to tune, so it is excluded here)
param_grids = {
    '岭回归': {'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]},
    'LASSO回归': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0]},
    '弹性网络': {'alpha': [0.001, 0.01, 0.1, 1.0], 'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]}
}
# Run a grid search for each model
tuned_models = {}
for model_name, param_grid in param_grids.items():
    # max_iter raised for the coordinate-descent solvers so they converge
    if model_name == '岭回归':
        base_model = Ridge()
    elif model_name == 'LASSO回归':
        base_model = Lasso(max_iter=10000)
    else:  # elastic net
        base_model = ElasticNet(max_iter=10000)
    # Exhaustive 5-fold CV search, parallelized across all cores
    grid_search = GridSearchCV(
        base_model,
        param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1
    )
    grid_search.fit(X_train_opt, y_train)
    tuned_models[model_name] = grid_search.best_estimator_
    print(f"{model_name} 最优参数: {grid_search.best_params_}")
    # NOTE(review): -best_score_ is a mean MSE, not RMSE — inconsistent with
    # the RMSE figures printed elsewhere in this script.
    print(f"{model_name} 最优CV分数: {-grid_search.best_score_:.3f}")
    print()
# Evaluate each tuned model on the held-out test set
print("调优后的模型性能:")
for model_name, model in tuned_models.items():
    y_pred = model.predict(X_test_opt)
    metrics = calculate_metrics(y_test, y_pred)
    print(f"{model_name}: RMSE = {metrics['RMSE']:.3f}, R² = {metrics['R²']:.3f}")
# Collect the tuned models' test metrics and pick the final best model
final_results = []
for model_name, model in tuned_models.items():
    y_pred = model.predict(X_test_opt)
    metrics = calculate_metrics(y_test, y_pred)
    final_results.append({
        '模型': model_name,
        'RMSE': metrics['RMSE'],
        'R²': metrics['R²']
    })
final_df = pd.DataFrame(final_results)
print("\\n最终模型性能对比:")
print(final_df.round(3))
# Lowest test RMSE wins.
# NOTE(review): selecting the winner on the test set leaks test information
# into model choice; cross-validation scores would be the cleaner criterion.
best_final_idx = final_df['RMSE'].idxmin()
best_final_model = final_df.iloc[best_final_idx]
print(f"\\n最终最佳模型: {best_final_model['模型']}")
print(f"RMSE: {best_final_model['RMSE']:.3f}")
print(f"R²: {best_final_model['R²']:.3f}")
9.7. 模型解释与特征重要性¶
让我们分析最佳模型的特征重要性:
# Feature-importance analysis for the final best model
best_final_model_name = best_final_model['模型']
best_final_model_obj = tuned_models[best_final_model_name]
# Linear models expose coefficients; use their absolute values as importance
if hasattr(best_final_model_obj, 'coef_'):
    feature_importance = np.abs(best_final_model_obj.coef_)
    # Recover human-readable names for the columns of the chosen feature set
    if best_feature == '原始特征':
        feature_names = housing.feature_names
    elif best_feature == '选择特征':
        feature_names = selected_features
    else:  # polynomial features: generic placeholder names
        # NOTE(review): poly_features.get_feature_names_out(selected_features)
        # (sklearn >= 1.0) would give descriptive names here — consider it.
        feature_names = [f'特征_{i}' for i in range(len(feature_importance))]
    # Importance table, most important feature first
    importance_df = pd.DataFrame({
        '特征': feature_names,
        '重要性': feature_importance
    }).sort_values('重要性', ascending=False)
    print(f"最佳模型 ({best_final_model_name}) 特征重要性:")
    print(importance_df.head(10))
    # Visualize the importances (4-panel figure)
    plt.figure(figsize=(12, 8))
    # Horizontal bars for the top-10 features
    plt.subplot(2, 2, 1)
    top_features = importance_df.head(10)
    plt.barh(range(len(top_features)), top_features['重要性'], color='skyblue')
    plt.yticks(range(len(top_features)), top_features['特征'])
    plt.xlabel('特征重要性')
    plt.title('前10个重要特征')
    plt.gca().invert_yaxis()  # put the largest importance at the top
    # Histogram of all importance values
    plt.subplot(2, 2, 2)
    plt.hist(feature_importance, bins=20, alpha=0.7, color='lightgreen', edgecolor='black')
    plt.xlabel('特征重要性')
    plt.ylabel('频次')
    plt.title('特征重要性分布')
    # Cumulative importance as a percentage of the total
    plt.subplot(2, 2, 3)
    cumulative_importance = np.cumsum(importance_df['重要性'].values)
    cumulative_importance = cumulative_importance / cumulative_importance[-1] * 100
    plt.plot(range(1, len(cumulative_importance) + 1), cumulative_importance, 'o-')
    plt.xlabel('特征数量')
    plt.ylabel('累积重要性 (%)')
    plt.title('累积特征重要性')
    plt.grid(True, alpha=0.3)
    # Predicted vs actual for the final best model (red dashed = perfect fit)
    plt.subplot(2, 2, 4)
    y_pred_final = best_final_model_obj.predict(X_test_opt)
    plt.scatter(y_test, y_pred_final, alpha=0.5, s=10)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
    plt.xlabel('实际房价')
    plt.ylabel('预测房价')
    plt.title(f'最终最佳模型预测结果\\n{best_final_model_name}')
    plt.tight_layout()
    plt.show()
else:
    print("该模型不支持特征重要性分析")
9.8. 模型部署与预测¶
最后,我们展示如何使用训练好的模型进行新数据的预测:
# Prediction helper for deploying the trained pipeline
def predict_house_price(model, scaler, selector, poly_features, house_features):
    """Predict the price of a single house with the trained pipeline.

    Improvements over the original: accepts any array-like (a plain list would
    previously crash on .reshape), and applies only the transform chain the
    chosen feature set needs instead of eagerly computing all three per call.

    Parameters:
        model: fitted regression model
        scaler: fitted StandardScaler
        selector: fitted SelectKBest feature selector
        poly_features: fitted PolynomialFeatures transformer
        house_features: array-like of raw feature values for one house,
            in the same column order as the training data

    Returns:
        float: predicted price (same units as the training target)

    Note: the transform chain applied depends on the module-level
    `best_feature` value chosen earlier in the script.
    """
    # Coerce to float ndarray and force a single-row 2-D shape
    house = np.asarray(house_features, dtype=float).reshape(1, -1)
    house_scaled = scaler.transform(house)
    # Apply only the transforms the winning feature set requires
    if best_feature == '原始特征':
        X_pred = house_scaled
    elif best_feature == '选择特征':
        X_pred = selector.transform(house_scaled)
    else:  # polynomial features
        X_pred = poly_features.transform(selector.transform(house_scaled))
    prediction = model.predict(X_pred)
    return prediction[0]
# Example: predict prices for a few hand-written houses
print("房屋价格预测示例:")
print("=" * 50)
# Raw feature rows; column order matches housing.feature_names
example_houses = [
    [8.3252, 41, 6.984127, 1.023810, 322, 2.555556, 37.88, -122.23],  # example 1
    [5.2500, 52, 8.696429, 1.052632, 496, 2.173913, 37.85, -122.25],  # example 2
    [3.8462, 52, 6.730769, 1.173077, 558, 2.673077, 37.84, -122.26],  # example 3
]
# Human-readable description for each example row, in the same order
house_descriptions = [
    "高收入地区,大房子",
    "中等收入地区,中等房子",
    "低收入地区,小房子"
]
# Predict and report each example house
for i, (house, description) in enumerate(zip(example_houses, house_descriptions)):
    house_array = np.array(house)
    predicted_price = predict_house_price(
        best_final_model_obj, scaler, selector, poly_features, house_array
    )
    print(f"房屋 {i+1} ({description}):")
    print(f" 特征: {house}")
    print(f" 预测房价: ${predicted_price:.2f} 十万美元")
    print(f" 预测房价: ${predicted_price*100000:.0f} 美元")
    print()
# Batch prediction demo
print("批量预测示例:")
print("=" * 30)
# Use the first 10 rows of the raw (unscaled) test set; the helper scales them
batch_size = 10
batch_houses = X_test[:batch_size]
batch_predictions = []
for house in batch_houses:
    pred = predict_house_price(
        best_final_model_obj, scaler, selector, poly_features, house
    )
    batch_predictions.append(pred)
batch_predictions = np.array(batch_predictions)
batch_actual = y_test[:batch_size]
# Per-sample actual vs predicted with the absolute error
print("前10个测试样本的预测结果:")
for i in range(batch_size):
    print(f"样本 {i+1}: 实际=${batch_actual[i]:.2f}, 预测=${batch_predictions[i]:.2f}, 误差=${abs(batch_actual[i]-batch_predictions[i]):.2f}")
# Aggregate accuracy over the batch
batch_rmse = np.sqrt(mean_squared_error(batch_actual, batch_predictions))
batch_mae = mean_absolute_error(batch_actual, batch_predictions)
print(f"\\n批量预测性能:")
print(f"RMSE: {batch_rmse:.3f}")
print(f"MAE: {batch_mae:.3f}")
9.9. 总结¶
通过本综合应用练习,我们完成了完整的回归分析流程:
9.9.1. 项目流程回顾¶
- 数据探索:分析了加州房价数据集的分布和特征关系
- 数据预处理:标准化、特征选择、多项式特征生成
- 模型训练:对比了线性回归、岭回归、LASSO回归、弹性网络
- 模型评估:使用交叉验证和多种评价指标
- 模型调优:通过网格搜索优化超参数
- 模型解释:分析特征重要性和模型可解释性
- 模型部署:创建预测函数进行实际应用
9.9.2. 关键发现¶
- 特征工程的重要性:多项式特征显著提升了模型性能
- 正则化的作用:岭回归和LASSO回归有效防止了过拟合
- 模型选择:不同模型在不同特征集上表现不同
- 交叉验证:提供了更可靠的模型评估
9.9.3. 实际应用价值¶
- 房价预测:可用于房地产投资决策
- 风险评估:帮助银行评估贷款风险
- 市场分析:理解影响房价的关键因素
- 政策制定:为城市规划提供数据支持
9.9.4. 技能提升¶
通过本练习,我们掌握了:
- 完整的机器学习项目流程
- 多种回归算法的实际应用
- 特征工程和模型调优技巧
- 模型评估和解释方法
- 实际业务场景中的应用
这个综合练习展示了回归分析在实际问题中的完整应用,为后续的机器学习学习奠定了坚实基础。