XGBoost最终得分为264.79分
发布时间:2025-06-24 20:15:52 作者:北方职教升学中心 阅读量:609
创建滚动统计特征
# 特征工程尝试一import pandas as pdimport numpy as npfrom sklearn.preprocessing import LabelEncoder# 合并训练和测试数据集以便于特征工程train_df['is_train'] = 1test_df['is_train'] = 0test_df['target'] = np.nanfull_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)# 处理日期特征full_df['day_of_week'] = full_df['dt'] % 7 # 假设1代表周一full_df['is_weekend'] = full_df['day_of_week'].isin([5, 6]).astype(int)# 处理房屋类型le = LabelEncoder()full_df['type'] = le.fit_transform(full_df['type'])# 创建时间序列特征def create_lag_features(df, lags, col): for lag in lags: df[f'{col}_lag_{lag}'] = df.groupby('id')[col].shift(lag) return dflags = [1, 2, 3, 7, 14, 30]full_df = create_lag_features(full_df, lags, 'target')# 创建滚动统计特征def create_rolling_features(df, windows, col): for window in windows: df[f'{col}_roll_mean_{window}'] = df.groupby('id')[col].shift(1).rolling(window).mean() df[f'{col}_roll_max_{window}'] = df.groupby('id')[col].shift(1).rolling(window).max() df[f'{col}_roll_min_{window}'] = df.groupby('id')[col].shift(1).rolling(window).min() df[f'{col}_roll_std_{window}'] = df.groupby('id')[col].shift(1).rolling(window).std() return dfwindows = [3, 7, 14, 30]full_df = create_rolling_features(full_df, windows, 'target')# 填充缺失值full_df.fillna(0, inplace=True)# 拆分回训练和测试集train_df = full_df[full_df['is_train'] == 1].drop(['is_train'], axis=1)test_df = full_df[full_df['is_train'] == 0].drop(['is_train', 'target'], axis=1)# 保存处理后的数据集train_df.to_csv('new_train.csv', index=False)test_df.to_csv('new_test.csv', index=False)print("特征工程完成,处理后的数据集已保存。和 XGBoost 相比,其在大规模数据集上跑起来更加轻盈。完成了特征工程后,我们分别采取三个集成学习模型进行训练:
# XGBoostimport pandas as pdimport numpy as npimport xgboost as xgbfrom sklearn.model_selection import train_test_splitfrom sklearn.metrics import mean_squared_error# 加载处理后的数据train_df = pd.read_csv(r'D:\数学建模竞赛\2024DW夏令营2机器学习\dataset\train.csvv')test_df = pd.read_csv(r'D:\数学建模竞赛\2024DW夏令营2机器学习\dataset\test.csv')# 准备训练数据和标签X = train_df.drop(columns=['target', 'id', 'dt'])y = train_df['target']X_test = test_df.drop(columns=['id', 'dt'])# 拆分训练集和验证集X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)# 创建XGBoost DMatrixtrain_data = xgb.DMatrix(X_train, label=y_train)val_data = xgb.DMatrix(X_val, label=y_val)test_data = xgb.DMatrix(X_test)# 设置参数params = { 'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'eta': 0.05, 'max_depth': 6, 'subsample': 0.9, 'colsample_bytree': 0.9, 'alpha': 0.1, 'lambda': 1.0,}# 设置回调函数callbacks = [ xgb.callback.EarlyStopping(rounds=500, save_best=True), xgb.callback.EvaluationMonitor(period=100)]# 训练模型model = xgb.train(params, train_data, num_boost_round=50000, evals=[(train_data, 'train'), (val_data, 'eval')], callbacks=callbacks)# 进行预测predictions = model.predict(test_data)# 创建预测结果数据框results_df = test_df[['id', 'dt']].copy()results_df['target'] = predictions# 保存预测结果results_df.to_csv('XGB_submit.csv', index=False)print("模型训练和预测完成,预测结果已保存。这也是分数为何居高不下的原因之一。最大值、请原谅我说这条路实在是走的很费劲,因为本人之前基本没有特征工程很成功的经验。我先开始尝试的是调参,调参的两大基本方法便是网格搜索法和贝叶斯优化。")
(1)历史平移特征:通过历史平移获取上个阶段的信息;
(2)差分特征:可以帮助获取相邻阶段的增长差异,描述数据的涨减变化情况。2、90天)下的移动平均值;以及创建差分特征,一个名为create_diff_features
的函数被定义来计算目标变量target
在不同滞后阶数(1、")# LightGBM训练和预测代码import lightgbm as lgbfrom sklearn.model_selection import train_test_splitfrom sklearn.metrics import mean_squared_errorfrom lightgbm import early_stopping, log_evaluation# 加载数据train_df = pd.read_csv(r'D:\数学建模竞赛\2024DW夏令营2机器学习\dataset\train.csv')test_df = pd.read_csv(r'D:\数学建模竞赛\2024DW夏令营2机器学习\dataset\test.csv')# 准备训练数据和标签X = train_df.drop(columns=['target', 'id', 'dt'])y = train_df['target']X_test = test_df.drop(columns=['id', 'dt'])# 拆分训练集和验证集X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)# 创建LightGBM数据集train_data = lgb.Dataset(X_train, label=y_train)val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)# 设置参数params = { 'objective': 'regression', 'metric': 'mse', 'boosting_type': 'gbdt', 'num_leaves': 31, 'learning_rate': 0.05, 'feature_fraction': 0.9}# 设置回调函数callbacks = [ log_evaluation(period=100), # 每100轮记录一次日志 early_stopping(stopping_rounds=500) # 500轮没有提升时提前停止]# 训练模型model = lgb.train(params, train_data, num_boost_round=50000, valid_sets=[train_data, val_data], callbacks=callbacks)# 进行预测predictions = model.predict(X_test, num_iteration=model.best_iteration)# 创建预测结果数据框results_df = test_df[['id', 'dt']].copy()results_df['target'] = predictions# 保存预测结果results_df.to_csv('submit7.csv', index=False)print("模型训练和预测完成,预测结果已保存。到这里,我突然意识到了一个问题,特征工程并不是各种花里胡哨的堆砌,简单的也许能直击要害。嗯,不错,算是努力没有白费,这也算是日拱一卒,功不唐捐嘛。
learning_rate
: 0.2
max_depth
: 12.75(向下取整为12)
min_child_samples
: 20
num_leaves
: 86
然后,我将上述参数组合用在了LightGBM中,确实上了一点分,达到了251.22分。中位数、
解决时间序列问题的方法有很多,本次,我主要采用的是机器学习中的Boosting提升树家族的模型。轻量梯度提升机(LightGBM)以及CatBoost。
上图是我的代码输出日志,详细的反映了每一次迭代贝叶斯优化的过程。7、

本次笔记就到此为止了,也算是记录了我这三天来的所有学习和尝试吧。和窗口统计特征
# 尝试三import pandas as pdimport numpy as npfrom sklearn.preprocessing import LabelEncoder# 合并训练数据和测试数据train_df = pd.read_csv(r'D:\数学建模竞赛\2024DW夏令营2机器学习\dataset\train.csv')test_df = pd.read_csv(r'D:\数学建模竞赛\2024DW夏令营2机器学习\dataset\test.csv')data = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)data = data.sort_values(['id', 'dt'], ascending=False).reset_index(drop=True)# 历史平移特征for i in range(10, 51, 5): data[f'target_shift{i}'] = data.groupby('id')['target'].shift(i)# 历史平移 + 差分特征for i in range(1, 4): for j in range(10, 51, 5): data[f'target_shift{j}_diff{i}'] = data.groupby('id')[f'target_shift{j}'].diff(i)# 增加二阶差分特征for i in range(10, 51, 5): data[f'target_shift{i}_diff2'] = data.groupby('id')[f'target_shift{i}'].diff(2)# 窗口统计特征for win in [15, 30, 50, 70]: data[f'target_win{win}_mean'] = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left').mean().values data[f'target_win{win}_max'] = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left').max().values data[f'target_win{win}_min'] = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left').min().values data[f'target_win{win}_std'] = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left').std().values# 历史平移 + 窗口统计特征for win in [7, 14, 28, 35, 50, 70]: for j in range(10, 51, 5): data[f'target_shift{j}_win{win}_mean'] = data.groupby('id')[f'target_shift{j}'].rolling(window=win, min_periods=3, closed='left').mean().values data[f'target_shift{j}_win{win}_max'] = data.groupby('id')[f'target_shift{j}'].rolling(window=win, min_periods=3, closed='left').max().values data[f'target_shift{j}_win{win}_min'] = data.groupby('id')[f'target_shift{j}'].rolling(window=win, min_periods=3, closed='left').min().values data[f'target_shift{j}_win{win}_std'] = data.groupby('id')[f'target_shift{j}'].rolling(window=win, min_periods=3, closed='left').std().values# 填充缺失值data.fillna(0, inplace=True)# 拆分回训练和测试集train_df = 
data[data['target'].notna()]test_df = data[data['target'].isna()].drop(['target'], axis=1)# 保存处理后的数据集train_df.to_csv('1_train_optimized.csv', index=False)test_df.to_csv('1_test_optimized.csv', index=False)print("优化后的特征工程完成,处理后的数据集已保存。季度等;(2)移动平均:计算更长时间范围内的移动平均;
(3)差分特征:计算电力消耗的差分;
(4)交互特征:构建不同特征之间的交互特征;
(5)聚合特征:对每个房屋类型计算全局的统计量,例如均值、60天、
这就有了尝试三:主要构建了历史平移特征、中位数、
# 特征工程尝试二import pandas as pdimport numpy as npfrom sklearn.preprocessing import LabelEncoderimport lightgbm as lgbfrom sklearn.model_selection import train_test_splitfrom sklearn.metrics import mean_squared_errorfrom lightgbm import early_stopping, log_evaluation# 加载数据train_df = pd.read_csv(r'D:\数学建模竞赛\2024DW夏令营2机器学习\dataset\train.csv')test_df = pd.read_csv(r'D:\数学建模竞赛\2024DW夏令营2机器学习\dataset\test.csv')# 合并训练和测试数据集以便于特征工程train_df['is_train'] = 1test_df['is_train'] = 0test_df['target'] = np.nanfull_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)# 处理日期特征full_df['day_of_week'] = full_df['dt'] % 7 # 假设1代表周一full_df['is_weekend'] = full_df['day_of_week'].isin([5, 6]).astype(int)full_df['month'] = (full_df['dt'] // 30) % 12 # 假设每月30天full_df['quarter'] = (full_df['dt'] // 90) % 4 # 假设每季度90天# 处理房屋类型le = LabelEncoder()full_df['type'] = le.fit_transform(full_df['type'])# 创建移动平均特征def create_moving_average_features(df, windows, col): for window in windows: df[f'{col}_ma_{window}'] = df.groupby('id')[col].transform(lambda x: x.rolling(window).mean()) return dfwindows = [7, 30, 60, 90]full_df = create_moving_average_features(full_df, windows, 'target')# 创建差分特征def create_diff_features(df, lags, col): for lag in lags: df[f'{col}_diff_{lag}'] = df.groupby('id')[col].diff(lag) return dflags = [1, 2, 3, 7, 14, 30]full_df = create_diff_features(full_df, lags, 'target')# 填充缺失值full_df.fillna(0, inplace=True)# 拆分回训练和测试集train_df = full_df[full_df['is_train'] == 1].drop(['is_train'], axis=1)test_df = full_df[full_df['is_train'] == 0].drop(['is_train', 'target'], axis=1)# 保存处理后的数据集train_df.to_csv('processed_train.csv', index=False)test_df.to_csv('processed_test.csv', index=False)print("特征工程完成,处理后的数据集已保存。过程很辛苦,也很有趣,感谢Datawhale这个用心的开源组织!我们第三期见!