9.1. 介绍¶
本实验将综合运用前面学到的各种回归方法,包括线性回归、多项式回归、岭回归、LASSO回归等,通过一个完整的项目来展示回归分析的全流程。我们将使用真实的房价数据集,从数据探索、特征工程、模型选择到最终预测,全面展示回归分析的实际应用。
9.2. 知识点¶
- 数据探索性分析(EDA)
- 特征工程与数据预处理
- 多种回归方法对比
- 模型选择与调优
- 交叉验证与模型评估
- 结果可视化与解释
# Import the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import SelectKBest, f_regression
import warnings
warnings.filterwarnings('ignore')
# Configure a CJK-capable font so Chinese plot labels render correctly,
# and keep the minus sign displayable with that font
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
print("库导入成功!")
9.3. 数据加载与探索¶
我们使用加州房价数据集,这是一个经典的回归问题数据集:
# Load the California housing dataset (regression benchmark)
housing = fetch_california_housing()
X = housing.data
y = housing.target
print(f"数据集形状: {X.shape}")
print(f"特征名称: {housing.feature_names}")
print(f"目标变量范围: {y.min():.2f} - {y.max():.2f}")
# Combine features and target into one DataFrame for exploration
df = pd.DataFrame(X, columns=housing.feature_names)
df['房价'] = y
# NOTE(review): "\\n" inside these strings prints a literal backslash-n rather
# than a newline — looks like an escaping artifact of the notebook export;
# confirm against the original notebook before "fixing".
print("\\n数据集基本信息:")
print(df.describe())
# Check for missing values per column
print("\\n缺失值检查:")
print(df.isnull().sum())
# Visualize the distributions of the target and all features (3x3 grid)
plt.figure(figsize=(15, 12))
# Panel 1: distribution of the target (house price)
plt.subplot(3, 3, 1)
plt.hist(y, bins=50, alpha=0.7, color='skyblue', edgecolor='black')
plt.title('房价分布')
plt.xlabel('房价 (十万美元)')
plt.ylabel('频次')
# Panels 2-9: one histogram per feature (the dataset has 8 features)
for i, feature in enumerate(housing.feature_names):
    plt.subplot(3, 3, i+2)
    plt.hist(X[:, i], bins=30, alpha=0.7, color='lightgreen', edgecolor='black')
    plt.title(f'{feature}分布')
    plt.xlabel(feature)
    plt.ylabel('频次')
plt.tight_layout()
plt.show()
# Relationship between each feature and the target, plus a correlation heatmap.
# Fix: the original used a 2x4 grid, so the 8th feature's scatter plot was
# drawn into panel 8 and then immediately overwritten by the heatmap placed in
# the same slot. A 3x3 grid gives all 8 scatter plots their own panel and puts
# the heatmap in panel 9.
plt.figure(figsize=(15, 10))
# Scatter plot of every feature against the house price
for i, feature in enumerate(housing.feature_names):
    plt.subplot(3, 3, i + 1)
    plt.scatter(X[:, i], y, alpha=0.5, s=1)
    plt.xlabel(feature)
    plt.ylabel('房价')
    plt.title(f'{feature} vs 房价')
# Correlation heatmap of all columns (features + target) in the last panel
plt.subplot(3, 3, 9)
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f')
plt.title('特征相关性热力图')
plt.tight_layout()
plt.show()
# Correlation of each feature with the target, strongest magnitude first
correlations = df.corr()['房价'].drop('房价').sort_values(key=abs, ascending=False)
print("\\n特征与房价的相关性(按绝对值排序):")
print(correlations)
9.4. 数据预处理¶
在进行建模之前,我们需要对数据进行预处理:
# Data preprocessing
# 1. Train/test split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"训练集大小: {X_train.shape}")
print(f"测试集大小: {X_test.shape}")
# 2. Standardize features: fit on the training split only, then apply the
#    same transform to the test split to avoid data leakage
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("\\n标准化前后对比(训练集前5个样本):")
print("原始数据:")
print(X_train[:5])
print("\\n标准化后:")
print(X_train_scaled[:5])
# 3. Univariate feature selection: keep the 6 features with the highest
#    F-statistic against the target
selector = SelectKBest(score_func=f_regression, k=6)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)
# Map the selected column indices back to feature names
selected_features = [housing.feature_names[i] for i in selector.get_support(indices=True)]
print(f"\\n选择的特征: {selected_features}")
# 4. Degree-2 polynomial expansion of the selected features (no bias column;
#    the linear models fit their own intercept)
poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly_features.fit_transform(X_train_selected)
X_test_poly = poly_features.transform(X_test_selected)
print(f"\\n多项式特征数量: {X_train_poly.shape[1]}")
# Report the shape of each preprocessed variant
print("\\n预处理后的数据形状:")
print(f"训练集: {X_train_scaled.shape}")
print(f"训练集(特征选择后): {X_train_selected.shape}")
print(f"训练集(多项式特征): {X_train_poly.shape}")
9.5. 多种回归方法对比¶
现在我们将使用不同的回归方法来建模,并比较它们的性能:
# Candidate regression models (fixed regularization strengths; tuned later)
models = {
    '线性回归': LinearRegression(),
    '岭回归': Ridge(alpha=1.0),
    'LASSO回归': Lasso(alpha=0.1),
    '弹性网络': ElasticNet(alpha=0.1, l1_ratio=0.5)
}
# Candidate feature sets, each mapped to its (train matrix, test matrix) pair
feature_sets = {
    '原始特征': (X_train_scaled, X_test_scaled),
    '选择特征': (X_train_selected, X_test_selected),
    '多项式特征': (X_train_poly, X_test_poly)
}
# Regression evaluation metrics
def calculate_metrics(y_true, y_pred):
    """Compute standard regression metrics for a set of predictions.

    Implemented directly with NumPy (numerically identical to sklearn's
    mean_squared_error / mean_absolute_error / r2_score for ordinary 1-D
    inputs). Improvements over the original: accepts any array-like (lists
    included) and explicitly guards the R² division against a zero-variance
    target instead of relying on sklearn's warning path.

    Parameters:
        y_true: array-like of true target values
        y_pred: array-like of predicted values, same length as y_true

    Returns:
        dict with keys 'MSE', 'RMSE', 'MAE', 'R²' (all floats)
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    errors = y_true - y_pred
    mse = float(np.mean(errors ** 2))
    rmse = float(np.sqrt(mse))
    mae = float(np.mean(np.abs(errors)))
    ss_res = float(np.sum(errors ** 2))
    ss_tot = float(np.sum((y_true - y_true.mean()) ** 2))
    if ss_tot > 0.0:
        r2 = 1.0 - ss_res / ss_tot
    else:
        # Constant target: R² is undefined; mirror sklearn's convention of
        # 1.0 for a perfect fit and 0.0 otherwise.
        r2 = 1.0 if ss_res == 0.0 else 0.0
    return {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R²': r2
    }
# Train and evaluate every (feature set, model) combination
results = {}
for feature_name, (X_train_f, X_test_f) in feature_sets.items():
    results[feature_name] = {}
    for model_name, model in models.items():
        # Fit on the training split of this feature set
        model.fit(X_train_f, y_train)
        # Predict on both splits so train/test metrics can be compared
        y_pred_train = model.predict(X_train_f)
        y_pred_test = model.predict(X_test_f)
        # Compute the metric dictionaries
        train_metrics = calculate_metrics(y_train, y_pred_train)
        test_metrics = calculate_metrics(y_test, y_pred_test)
        # 5-fold cross-validated RMSE (sklearn returns negated MSE scores)
        cv_scores = cross_val_score(model, X_train_f, y_train, cv=5, scoring='neg_mean_squared_error')
        cv_rmse = np.sqrt(-cv_scores.mean())
        # NOTE(review): 'model' stores a reference to the shared estimator
        # from `models`, which is refit on every feature set — after the loop
        # all stored references reflect the LAST feature set's fit. Only the
        # cached 'y_pred_test' is used downstream, so this is latent, but a
        # clone per combination would be safer.
        results[feature_name][model_name] = {
            'model': model,
            'train_metrics': train_metrics,
            'test_metrics': test_metrics,
            'cv_rmse': cv_rmse,
            'y_pred_test': y_pred_test
        }
        print(f"{feature_name} + {model_name}:")
        print(f" 测试集 RMSE: {test_metrics['RMSE']:.3f}")
        print(f" 测试集 R²: {test_metrics['R²']:.3f}")
        print(f" 交叉验证 RMSE: {cv_rmse:.3f}")
        print()
# Build a flat comparison table of test metrics for every combination
comparison_data = []
for feature_name in feature_sets.keys():
    for model_name in models.keys():
        result = results[feature_name][model_name]
        comparison_data.append({
            '特征集': feature_name,
            '模型': model_name,
            'RMSE': result['test_metrics']['RMSE'],
            'R²': result['test_metrics']['R²'],
            'CV_RMSE': result['cv_rmse']
        })
comparison_df = pd.DataFrame(comparison_data)
print("模型性能对比表:")
print(comparison_df.round(3))
# Pick the combination with the lowest cross-validated RMSE.
# idxmin() returns an index label; using it with iloc works here because
# comparison_df was just built with a default RangeIndex.
best_idx = comparison_df['CV_RMSE'].idxmin()
best_result = comparison_df.iloc[best_idx]
print(f"\\n最佳模型组合:")
print(f"特征集: {best_result['特征集']}")
print(f"模型: {best_result['模型']}")
print(f"交叉验证 RMSE: {best_result['CV_RMSE']:.3f}")
print(f"测试集 R²: {best_result['R²']:.3f}")
# Visualize the model performance comparison (8-panel figure)
plt.figure(figsize=(20, 12))
# 1. Test RMSE heatmap (feature set x model)
plt.subplot(2, 4, 1)
pivot_rmse = comparison_df.pivot(index='特征集', columns='模型', values='RMSE')
sns.heatmap(pivot_rmse, annot=True, cmap='YlOrRd', fmt='.3f')
plt.title('RMSE对比')
# 2. Test R² heatmap
plt.subplot(2, 4, 2)
pivot_r2 = comparison_df.pivot(index='特征集', columns='模型', values='R²')
sns.heatmap(pivot_r2, annot=True, cmap='YlGnBu', fmt='.3f')
plt.title('R²对比')
# 3. Cross-validated RMSE heatmap
plt.subplot(2, 4, 3)
pivot_cv = comparison_df.pivot(index='特征集', columns='模型', values='CV_RMSE')
sns.heatmap(pivot_cv, annot=True, cmap='RdYlBu_r', fmt='.3f')
plt.title('交叉验证RMSE对比')
# 4. Average CV RMSE per model (lower is better), sorted ascending
plt.subplot(2, 4, 4)
model_avg_rmse = comparison_df.groupby('模型')['CV_RMSE'].mean().sort_values()
plt.bar(range(len(model_avg_rmse)), model_avg_rmse.values, color='skyblue')
plt.xticks(range(len(model_avg_rmse)), model_avg_rmse.index, rotation=45)
plt.title('模型平均性能')
plt.ylabel('平均CV RMSE')
# 5. Average CV RMSE per feature set
plt.subplot(2, 4, 5)
feature_avg_rmse = comparison_df.groupby('特征集')['CV_RMSE'].mean().sort_values()
plt.bar(range(len(feature_avg_rmse)), feature_avg_rmse.values, color='lightgreen')
plt.xticks(range(len(feature_avg_rmse)), feature_avg_rmse.index, rotation=45)
plt.title('特征集平均性能')
plt.ylabel('平均CV RMSE')
# 6. Predicted vs actual for the best combination; the red dashed diagonal
#    marks a perfect prediction
best_feature = best_result['特征集']
best_model = best_result['模型']
best_pred = results[best_feature][best_model]['y_pred_test']
plt.subplot(2, 4, 6)
plt.scatter(y_test, best_pred, alpha=0.5, s=10)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('实际房价')
plt.ylabel('预测房价')
plt.title(f'最佳模型预测结果\\n{best_feature} + {best_model}')
# 7. Residuals vs predicted values (ideally scattered evenly around zero)
plt.subplot(2, 4, 7)
residuals = y_test - best_pred
plt.scatter(best_pred, residuals, alpha=0.5, s=10)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('预测房价')
plt.ylabel('残差')
plt.title('残差分析')
# 8. Histogram of the residuals
plt.subplot(2, 4, 8)
plt.hist(residuals, bins=30, alpha=0.7, color='orange', edgecolor='black')
plt.xlabel('残差')
plt.ylabel('频次')
plt.title('残差分布')
plt.tight_layout()
plt.show()
9.6. 模型调优¶
现在我们对最佳模型进行超参数调优:
# Hyperparameter tuning
# Pick the train/test matrices matching the best feature set found above
if best_feature == '原始特征':
    X_train_opt, X_test_opt = X_train_scaled, X_test_scaled
elif best_feature == '选择特征':
    X_train_opt, X_test_opt = X_train_selected, X_test_selected
else:  # polynomial features
    X_train_opt, X_test_opt = X_train_poly, X_test_poly
# Hyperparameter grids for each regularized model (plain linear regression
# has nothing to tune, so it is excluded here)
param_grids = {
    '岭回归': {'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]},
    'LASSO回归': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0]},
    '弹性网络': {'alpha': [0.001, 0.01, 0.1, 1.0], 'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]}
}
# Run a grid search for each model
tuned_models = {}
for model_name, param_grid in param_grids.items():
    # max_iter raised for the coordinate-descent solvers so they converge
    if model_name == '岭回归':
        base_model = Ridge()
    elif model_name == 'LASSO回归':
        base_model = Lasso(max_iter=10000)
    else:  # elastic net
        base_model = ElasticNet(max_iter=10000)
    # Exhaustive 5-fold CV search, parallelized across all cores
    grid_search = GridSearchCV(
        base_model,
        param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1
    )
    grid_search.fit(X_train_opt, y_train)
    tuned_models[model_name] = grid_search.best_estimator_
    print(f"{model_name} 最优参数: {grid_search.best_params_}")
    # NOTE(review): -best_score_ is a mean MSE, not RMSE — inconsistent with
    # the RMSE figures printed elsewhere in this script.
    print(f"{model_name} 最优CV分数: {-grid_search.best_score_:.3f}")
    print()
# Evaluate each tuned model on the held-out test set
print("调优后的模型性能:")
for model_name, model in tuned_models.items():
    y_pred = model.predict(X_test_opt)
    metrics = calculate_metrics(y_test, y_pred)
    print(f"{model_name}: RMSE = {metrics['RMSE']:.3f}, R² = {metrics['R²']:.3f}")
# Collect the tuned models' test metrics and pick the final best model
final_results = []
for model_name, model in tuned_models.items():
    y_pred = model.predict(X_test_opt)
    metrics = calculate_metrics(y_test, y_pred)
    final_results.append({
        '模型': model_name,
        'RMSE': metrics['RMSE'],
        'R²': metrics['R²']
    })
final_df = pd.DataFrame(final_results)
print("\\n最终模型性能对比:")
print(final_df.round(3))
# Lowest test RMSE wins.
# NOTE(review): selecting the winner on the test set leaks test information
# into model choice; cross-validation scores would be the cleaner criterion.
best_final_idx = final_df['RMSE'].idxmin()
best_final_model = final_df.iloc[best_final_idx]
print(f"\\n最终最佳模型: {best_final_model['模型']}")
print(f"RMSE: {best_final_model['RMSE']:.3f}")
print(f"R²: {best_final_model['R²']:.3f}")
9.7. 模型解释与特征重要性¶
让我们分析最佳模型的特征重要性:
# Feature-importance analysis for the final best model
best_final_model_name = best_final_model['模型']
best_final_model_obj = tuned_models[best_final_model_name]
# Linear models expose coefficients; use their absolute values as importance
if hasattr(best_final_model_obj, 'coef_'):
    feature_importance = np.abs(best_final_model_obj.coef_)
    # Recover human-readable names for the columns of the chosen feature set
    if best_feature == '原始特征':
        feature_names = housing.feature_names
    elif best_feature == '选择特征':
        feature_names = selected_features
    else:  # polynomial features: generic placeholder names
        # NOTE(review): poly_features.get_feature_names_out(selected_features)
        # (sklearn >= 1.0) would give descriptive names here — consider it.
        feature_names = [f'特征_{i}' for i in range(len(feature_importance))]
    # Importance table, most important feature first
    importance_df = pd.DataFrame({
        '特征': feature_names,
        '重要性': feature_importance
    }).sort_values('重要性', ascending=False)
    print(f"最佳模型 ({best_final_model_name}) 特征重要性:")
    print(importance_df.head(10))
    # Visualize the importances (4-panel figure)
    plt.figure(figsize=(12, 8))
    # Horizontal bars for the top-10 features
    plt.subplot(2, 2, 1)
    top_features = importance_df.head(10)
    plt.barh(range(len(top_features)), top_features['重要性'], color='skyblue')
    plt.yticks(range(len(top_features)), top_features['特征'])
    plt.xlabel('特征重要性')
    plt.title('前10个重要特征')
    plt.gca().invert_yaxis()  # put the largest importance at the top
    # Histogram of all importance values
    plt.subplot(2, 2, 2)
    plt.hist(feature_importance, bins=20, alpha=0.7, color='lightgreen', edgecolor='black')
    plt.xlabel('特征重要性')
    plt.ylabel('频次')
    plt.title('特征重要性分布')
    # Cumulative importance as a percentage of the total
    plt.subplot(2, 2, 3)
    cumulative_importance = np.cumsum(importance_df['重要性'].values)
    cumulative_importance = cumulative_importance / cumulative_importance[-1] * 100
    plt.plot(range(1, len(cumulative_importance) + 1), cumulative_importance, 'o-')
    plt.xlabel('特征数量')
    plt.ylabel('累积重要性 (%)')
    plt.title('累积特征重要性')
    plt.grid(True, alpha=0.3)
    # Predicted vs actual for the final best model (red dashed = perfect fit)
    plt.subplot(2, 2, 4)
    y_pred_final = best_final_model_obj.predict(X_test_opt)
    plt.scatter(y_test, y_pred_final, alpha=0.5, s=10)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
    plt.xlabel('实际房价')
    plt.ylabel('预测房价')
    plt.title(f'最终最佳模型预测结果\\n{best_final_model_name}')
    plt.tight_layout()
    plt.show()
else:
    print("该模型不支持特征重要性分析")
9.8. 模型部署与预测¶
最后,我们展示如何使用训练好的模型进行新数据的预测:
# Prediction helper for deploying the trained pipeline
def predict_house_price(model, scaler, selector, poly_features, house_features):
    """Predict the price of a single house with the trained pipeline.

    Improvements over the original: accepts any array-like (a plain list would
    previously crash on .reshape), and applies only the transform chain the
    chosen feature set needs instead of eagerly computing all three per call.

    Parameters:
        model: fitted regression model
        scaler: fitted StandardScaler
        selector: fitted SelectKBest feature selector
        poly_features: fitted PolynomialFeatures transformer
        house_features: array-like of raw feature values for one house,
            in the same column order as the training data

    Returns:
        float: predicted price (same units as the training target)

    Note: the transform chain applied depends on the module-level
    `best_feature` value chosen earlier in the script.
    """
    # Coerce to float ndarray and force a single-row 2-D shape
    house = np.asarray(house_features, dtype=float).reshape(1, -1)
    house_scaled = scaler.transform(house)
    # Apply only the transforms the winning feature set requires
    if best_feature == '原始特征':
        X_pred = house_scaled
    elif best_feature == '选择特征':
        X_pred = selector.transform(house_scaled)
    else:  # polynomial features
        X_pred = poly_features.transform(selector.transform(house_scaled))
    prediction = model.predict(X_pred)
    return prediction[0]
# Example: predict prices for a few hand-written houses
print("房屋价格预测示例:")
print("=" * 50)
# Raw feature rows; column order matches housing.feature_names
example_houses = [
    [8.3252, 41, 6.984127, 1.023810, 322, 2.555556, 37.88, -122.23],  # example 1
    [5.2500, 52, 8.696429, 1.052632, 496, 2.173913, 37.85, -122.25],  # example 2
    [3.8462, 52, 6.730769, 1.173077, 558, 2.673077, 37.84, -122.26],  # example 3
]
# Human-readable description for each example row, in the same order
house_descriptions = [
    "高收入地区,大房子",
    "中等收入地区,中等房子",
    "低收入地区,小房子"
]
# Predict and report each example house
for i, (house, description) in enumerate(zip(example_houses, house_descriptions)):
    house_array = np.array(house)
    predicted_price = predict_house_price(
        best_final_model_obj, scaler, selector, poly_features, house_array
    )
    print(f"房屋 {i+1} ({description}):")
    print(f" 特征: {house}")
    print(f" 预测房价: ${predicted_price:.2f} 十万美元")
    print(f" 预测房价: ${predicted_price*100000:.0f} 美元")
    print()
# Batch prediction demo
print("批量预测示例:")
print("=" * 30)
# Use the first 10 rows of the raw (unscaled) test set; the helper scales them
batch_size = 10
batch_houses = X_test[:batch_size]
batch_predictions = []
for house in batch_houses:
    pred = predict_house_price(
        best_final_model_obj, scaler, selector, poly_features, house
    )
    batch_predictions.append(pred)
batch_predictions = np.array(batch_predictions)
batch_actual = y_test[:batch_size]
# Per-sample actual vs predicted with the absolute error
print("前10个测试样本的预测结果:")
for i in range(batch_size):
    print(f"样本 {i+1}: 实际=${batch_actual[i]:.2f}, 预测=${batch_predictions[i]:.2f}, 误差=${abs(batch_actual[i]-batch_predictions[i]):.2f}")
# Aggregate accuracy over the batch
batch_rmse = np.sqrt(mean_squared_error(batch_actual, batch_predictions))
batch_mae = mean_absolute_error(batch_actual, batch_predictions)
print(f"\\n批量预测性能:")
print(f"RMSE: {batch_rmse:.3f}")
print(f"MAE: {batch_mae:.3f}")
9.9. 总结¶
通过本综合应用练习,我们完成了完整的回归分析流程:
9.9.1. 项目流程回顾¶
- 数据探索:分析了加州房价数据集的分布和特征关系
- 数据预处理:标准化、特征选择、多项式特征生成
- 模型训练:对比了线性回归、岭回归、LASSO回归、弹性网络
- 模型评估:使用交叉验证和多种评价指标
- 模型调优:通过网格搜索优化超参数
- 模型解释:分析特征重要性和模型可解释性
- 模型部署:创建预测函数进行实际应用
9.9.2. 关键发现¶
- 特征工程的重要性:多项式特征显著提升了模型性能
- 正则化的作用:岭回归和LASSO回归有效防止了过拟合
- 模型选择:不同模型在不同特征集上表现不同
- 交叉验证:提供了更可靠的模型评估
9.9.3. 实际应用价值¶
- 房价预测:可用于房地产投资决策
- 风险评估:帮助银行评估贷款风险
- 市场分析:理解影响房价的关键因素
- 政策制定:为城市规划提供数据支持
9.9.4. 技能提升¶
通过本练习,我们掌握了:
- 完整的机器学习项目流程
- 多种回归算法的实际应用
- 特征工程和模型调优技巧
- 模型评估和解释方法
- 实际业务场景中的应用
这个综合练习展示了回归分析在实际问题中的完整应用,为后续的机器学习学习奠定了坚实基础。