7. 使用矩阵计算岭回归系数 - Statistics and Data Science in Python

7.1. 介绍¶

岭回归的系数可以通过矩阵运算直接计算，而不需要迭代优化。本实验将展示如何使用矩阵运算计算岭回归系数，并比较不同方法的计算效率。

7.2. 知识点¶

岭回归的数学原理
矩阵运算计算岭回归系数
数值稳定性分析
计算效率对比
奇异值分解（SVD）在岭回归中的应用

# 导入必要的库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from sklearn.datasets import make_regression, load_boston
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from scipy.linalg import solve, lstsq
import warnings
warnings.filterwarnings('ignore')

# 设置中文字体
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

print("库导入成功！")

7.3. 岭回归的数学原理¶

7.3.1. 岭回归目标函数¶

岭回归的目标函数为：

J(\\theta) = \\|X\\theta - y\\|^2 + \\alpha\\|\\theta\\|^2

(1)

7.3.2. 闭式解¶

对目标函数求导并令导数为0，得到岭回归的闭式解：

\\theta = (X^TX + \\alpha I)^{-1}X^Ty

(2)

其中：

$X$ 是设计矩阵（特征矩阵）
$y$ 是目标向量
$\\alpha$ 是正则化参数
$I$ 是单位矩阵

7.3.3. 数值稳定性¶

当 $X^TX$ 接近奇异时，直接求逆可能不稳定。可以使用以下方法：

Cholesky分解
QR分解
奇异值分解（SVD）

7.4. 数据准备¶

我们使用波士顿房价数据集来演示矩阵计算方法：

# 加载数据
boston = load_boston()
X = boston.data
y = boston.target

print(f"数据集形状: {X.shape}")
print(f"特征名称: {boston.feature_names}")

# 数据标准化
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 添加偏置项（截距）
X_with_bias = np.column_stack([np.ones(X_scaled.shape[0]), X_scaled])

print(f"\\n添加偏置项后的形状: {X_with_bias.shape}")
print("前5行数据:")
print(X_with_bias[:5])

7.5. 矩阵计算方法实现¶

现在我们将实现不同的矩阵计算方法来计算岭回归系数：

def ridge_regression_direct(X, y, alpha):
    """
    方法1：直接矩阵求逆
    θ = (X^T X + αI)^(-1) X^T y
    """
    XTX = X.T @ X
    n_features = XTX.shape[0]
    I = np.eye(n_features)
    
    # 计算 (X^T X + αI)^(-1)
    XTX_alpha_I = XTX + alpha * I
    XTX_inv = np.linalg.inv(XTX_alpha_I)
    
    # 计算系数
    theta = XTX_inv @ X.T @ y
    return theta

def ridge_regression_cholesky(X, y, alpha):
    """
    方法2：Cholesky分解
    更数值稳定的方法
    """
    XTX = X.T @ X
    n_features = XTX.shape[0]
    I = np.eye(n_features)
    
    # Cholesky分解
    L = np.linalg.cholesky(XTX + alpha * I)
    
    # 求解 L L^T θ = X^T y
    # 先求解 L z = X^T y
    z = np.linalg.solve(L, X.T @ y)
    # 再求解 L^T θ = z
    theta = np.linalg.solve(L.T, z)
    
    return theta

def ridge_regression_svd(X, y, alpha):
    """
    方法3：SVD分解
    最数值稳定的方法
    """
    U, s, Vt = np.linalg.svd(X, full_matrices=False)
    
    # 计算 SVD 形式的岭回归解
    # θ = V (S^2 + αI)^(-1) S U^T y
    s_squared = s**2
    s_regularized = s_squared + alpha
    
    # 计算系数
    theta = Vt.T @ (s / s_regularized) @ (U.T @ y)
    
    return theta

def ridge_regression_qr(X, y, alpha):
    """
    方法4：QR分解
    """
    # 构造增广矩阵 [X; sqrt(α)I]
    n_samples, n_features = X.shape
    sqrt_alpha = np.sqrt(alpha)
    
    # 构造增广矩阵
    X_augmented = np.vstack([X, sqrt_alpha * np.eye(n_features)])
    y_augmented = np.hstack([y, np.zeros(n_features)])
    
    # QR分解
    Q, R = np.linalg.qr(X_augmented)
    
    # 求解 R θ = Q^T y_augmented
    QTy = Q.T @ y_augmented
    theta = np.linalg.solve(R, QTy)
    
    return theta

print("岭回归矩阵计算方法定义完成！")

# 测试不同的矩阵计算方法
alpha = 1.0
methods = {
    '直接求逆': ridge_regression_direct,
    'Cholesky分解': ridge_regression_cholesky,
    'SVD分解': ridge_regression_svd,
    'QR分解': ridge_regression_qr
}

results = {}
for method_name, method_func in methods.items():
    try:
        start_time = time.time()
        theta = method_func(X_with_bias, y, alpha)
        end_time = time.time()
        
        # 计算预测值
        y_pred = X_with_bias @ theta
        mse = mean_squared_error(y, y_pred)
        
        results[method_name] = {
            'theta': theta,
            'mse': mse,
            'time': end_time - start_time
        }
        
        print(f"{method_name}:")
        print(f"  计算时间: {end_time - start_time:.6f} 秒")
        print(f"  MSE: {mse:.6f}")
        print(f"  系数数量: {len(theta)}")
        print()
        
    except Exception as e:
        print(f"{method_name} 计算失败: {e}")
        print()

# 比较不同方法的结果
print("系数对比（前5个系数）:")
for method_name, result in results.items():
    print(f"{method_name}: {result['theta'][:5]}")

# 与sklearn的岭回归对比
from sklearn.linear_model import Ridge

# sklearn岭回归
sklearn_ridge = Ridge(alpha=alpha, fit_intercept=False)
sklearn_ridge.fit(X_with_bias, y)
sklearn_theta = sklearn_ridge.coef_
sklearn_y_pred = sklearn_ridge.predict(X_with_bias)
sklearn_mse = mean_squared_error(y, sklearn_y_pred)

print("Sklearn岭回归结果:")
print(f"  MSE: {sklearn_mse:.6f}")
print(f"  系数: {sklearn_theta[:5]}")

# 计算与sklearn结果的差异
print("\\n与sklearn结果的差异:")
for method_name, result in results.items():
    diff = np.linalg.norm(result['theta'] - sklearn_theta)
    print(f"{method_name}: {diff:.10f}")

# 可视化不同方法的结果
plt.figure(figsize=(15, 10))

# 系数对比
plt.subplot(2, 3, 1)
methods_list = list(results.keys())
theta_values = [results[method]['theta'] for method in methods_list]

for i, (method, theta) in enumerate(zip(methods_list, theta_values)):
    plt.plot(theta, label=method, marker='o', markersize=4)
plt.xlabel('特征索引')
plt.ylabel('系数值')
plt.title('不同方法的系数对比')
plt.legend()
plt.grid(True, alpha=0.3)

# 计算时间对比
plt.subplot(2, 3, 2)
times = [results[method]['time'] for method in methods_list]
plt.bar(methods_list, times, color='skyblue', alpha=0.7)
plt.ylabel('计算时间 (秒)')
plt.title('计算时间对比')
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3)

# MSE对比
plt.subplot(2, 3, 3)
mses = [results[method]['mse'] for method in methods_list]
plt.bar(methods_list, mses, color='lightgreen', alpha=0.7)
plt.ylabel('MSE')
plt.title('MSE对比')
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3)

# 预测vs实际值
plt.subplot(2, 3, 4)
best_method = min(results.keys(), key=lambda x: results[x]['mse'])
best_theta = results[best_method]['theta']
best_y_pred = X_with_bias @ best_theta

plt.scatter(y, best_y_pred, alpha=0.6, color='blue')
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--', lw=2)
plt.xlabel('实际值')
plt.ylabel('预测值')
plt.title(f'最佳方法预测结果\\n({best_method})')
plt.grid(True, alpha=0.3)

# 残差分析
plt.subplot(2, 3, 5)
residuals = y - best_y_pred
plt.scatter(best_y_pred, residuals, alpha=0.6, color='red')
plt.axhline(y=0, color='black', linestyle='--')
plt.xlabel('预测值')
plt.ylabel('残差')
plt.title('残差分析')
plt.grid(True, alpha=0.3)

# 系数重要性
plt.subplot(2, 3, 6)
feature_names = ['截距'] + list(boston.feature_names)
coef_importance = np.abs(best_theta)
sorted_indices = np.argsort(coef_importance)[::-1]

plt.barh(range(len(coef_importance)), coef_importance[sorted_indices])
plt.yticks(range(len(coef_importance)), [feature_names[i] for i in sorted_indices])
plt.xlabel('系数绝对值')
plt.title('特征重要性')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

7.6. 数值稳定性分析¶

让我们分析不同方法在数值稳定性方面的表现：

# 数值稳定性分析
def analyze_condition_number(X):
    """分析矩阵的条件数"""
    XTX = X.T @ X
    condition_number = np.linalg.cond(XTX)
    return condition_number

def analyze_singular_values(X):
    """分析奇异值"""
    U, s, Vt = np.linalg.svd(X, full_matrices=False)
    return s

# 分析原始数据的条件数
condition_number = analyze_condition_number(X_with_bias)
print(f"X^T X的条件数: {condition_number:.2e}")

# 分析奇异值
singular_values = analyze_singular_values(X_with_bias)
print(f"\\n奇异值范围: {singular_values.min():.2e} 到 {singular_values.max():.2e}")
print(f"奇异值比值: {singular_values.max() / singular_values.min():.2e}")

# 可视化奇异值
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.semilogy(singular_values, 'o-')
plt.xlabel('奇异值索引')
plt.ylabel('奇异值')
plt.title('奇异值分布')
plt.grid(True, alpha=0.3)

plt.subplot(1, 2, 2)
plt.semilogy(singular_values / singular_values.max(), 'o-')
plt.xlabel('奇异值索引')
plt.ylabel('归一化奇异值')
plt.title('归一化奇异值分布')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# 测试不同正则化参数的影响
alphas = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
alpha_results = {}

for alpha in alphas:
    # 使用SVD方法（最稳定）
    theta_svd = ridge_regression_svd(X_with_bias, y, alpha)
    y_pred_svd = X_with_bias @ theta_svd
    mse_svd = mean_squared_error(y, y_pred_svd)
    
    alpha_results[alpha] = {
        'theta': theta_svd,
        'mse': mse_svd,
        'coef_norm': np.linalg.norm(theta_svd)
    }

print("\\n不同正则化参数的影响:")
print("α\\tMSE\\t\\t系数范数")
print("-" * 30)
for alpha in alphas:
    result = alpha_results[alpha]
    print(f"{alpha}\\t{result['mse']:.6f}\\t{result['coef_norm']:.6f}")

# 可视化正则化参数的影响
plt.figure(figsize=(15, 5))

# MSE vs 正则化参数
plt.subplot(1, 3, 1)
alphas_list = list(alpha_results.keys())
mses_list = [alpha_results[alpha]['mse'] for alpha in alphas_list]
plt.semilogx(alphas_list, mses_list, 'o-', linewidth=2, markersize=8)
plt.xlabel('正则化参数 α')
plt.ylabel('MSE')
plt.title('MSE vs 正则化参数')
plt.grid(True, alpha=0.3)

# 系数范数 vs 正则化参数
plt.subplot(1, 3, 2)
coef_norms = [alpha_results[alpha]['coef_norm'] for alpha in alphas_list]
plt.semilogx(alphas_list, coef_norms, 'o-', linewidth=2, markersize=8, color='red')
plt.xlabel('正则化参数 α')
plt.ylabel('系数范数')
plt.title('系数范数 vs 正则化参数')
plt.grid(True, alpha=0.3)

# 系数路径
plt.subplot(1, 3, 3)
coef_paths = np.array([alpha_results[alpha]['theta'] for alpha in alphas_list]).T
for i in range(min(5, coef_paths.shape[0])):  # 只显示前5个系数
    plt.semilogx(alphas_list, coef_paths[i], 'o-', label=f'系数 {i}', linewidth=2, markersize=4)
plt.xlabel('正则化参数 α')
plt.ylabel('系数值')
plt.title('系数路径')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

7.7. 总结¶

通过本实验，我们学习了：

7.7.1. 岭回归的矩阵计算方法¶

直接求逆： $(X^TX + \\alpha I)^{-1}X^Ty$ - 简单但数值不稳定
Cholesky分解：更稳定的方法，适合正定矩阵
SVD分解：最数值稳定的方法，适合病态矩阵
QR分解：通过增广矩阵实现，计算效率高

7.7.2. 数值稳定性¶

条件数：衡量矩阵的数值稳定性
奇异值：反映矩阵的秩和数值特性
正则化：通过添加正则化项改善数值稳定性

7.7.3. 方法选择建议¶

小规模数据：直接求逆或Cholesky分解
中等规模数据：Cholesky分解或QR分解
大规模数据：SVD分解或迭代方法
病态矩阵：SVD分解

7.7.4. 实际应用¶

矩阵计算方法的选择取决于数据规模和数值特性
正则化参数的选择需要在偏差和方差之间平衡
数值稳定性分析有助于选择合适的方法

这些矩阵计算方法为理解岭回归的数学原理和实际实现提供了重要基础。