1. Reading data
Reading libsvm data with the native xgboost library
```python
import xgboost as xgb

data = xgb.DMatrix(libsvm_file)  # libsvm_file: path to the libsvm-format file
```
Reading libsvm data with sklearn
```python
from sklearn.datasets import load_svmlight_file

X_train, y_train = load_svmlight_file(libsvm_file)  # libsvm_file: path to the libsvm-format file
```
You can also read the data with pandas and then convert it to the standard form, as in the sketch below.
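The original post gives no code for this route, so here is a minimal sketch. The file name `train.csv` and the label column name `label` are assumptions for illustration; adjust them to your data.

```python
import pandas as pd
import xgboost as xgb

# Assumed layout: a CSV file whose 'label' column holds the target.
df = pd.read_csv('train.csv')           # hypothetical file name
y = df['label'].values                  # hypothetical label column
X = df.drop(columns=['label']).values
dtrain = xgb.DMatrix(X, label=y)        # the standard form for xgboost's native API
```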
2. Model training
1. Untuned baseline model
Training with the native xgboost API
```python
import xgboost as xgb
from xgboost import plot_importance  # plots feature importance
from sklearn.metrics import accuracy_score
from matplotlib import pyplot  # needed for pyplot.show() below

dtrain = xgb.DMatrix(f_train, label=l_train)
dtest = xgb.DMatrix(f_test, label=l_test)

param = {'max_depth': 2, 'eta': 1, 'silent': 0, 'objective': 'binary:logistic'}
num_round = 2
bst = xgb.train(param, dtrain, num_round)

train_preds = bst.predict(dtrain)
# round to 0/1 (in effect a sign function with a 0.5 threshold)
train_predictions = [round(value) for value in train_preds]
train_accuracy = accuracy_score(l_train, train_predictions)  # compare accuracy with sklearn
print("Train Accuracy: %.2f%%" % (train_accuracy * 100.0))

plot_importance(bst)  # plot the feature importance scores
pyplot.show()
```
Training with XGBClassifier
```python
# No early stopping, no DMatrix conversion
from xgboost import XGBClassifier
from xgboost import plot_importance  # plots feature importance
# load_svmlight_file reads svmlight/libsvm files directly;
# otherwise use xgboost.DMatrix(filename) for this format
from sklearn.datasets import load_svmlight_file
from sklearn.metrics import accuracy_score
from matplotlib import pyplot

num_round = 100
bst1 = XGBClassifier(max_depth=2,
                     learning_rate=1,
                     n_estimators=num_round,  # with too few weak trees, fewer features show up as important
                     silent=True,
                     objective='binary:logistic')
bst1.fit(f_train, l_train)

train_preds = bst1.predict(f_train)
train_accuracy = accuracy_score(l_train, train_preds)
print("Train Accuracy: %.2f%%" % (train_accuracy * 100.0))

preds = bst1.predict(f_test)
test_accuracy = accuracy_score(l_test, preds)
print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))

plot_importance(bst1)  # plot the feature importance scores
pyplot.show()
```
2. Two cross-validation approaches
Cross-validation with cross_val_score
```python
# Cross-validation via sklearn.model_selection
from xgboost import XGBClassifier
from xgboost import plot_importance  # plots feature importance
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from matplotlib import pyplot

num_round = 100
bst2 = XGBClassifier(max_depth=2, learning_rate=0.1, n_estimators=num_round,
                     silent=True, objective='binary:logistic')
bst2.fit(f_train, l_train)

# 10-fold cross-validation: 9 folds for training, 1 fold for testing
# (shuffle=True is required when random_state is set in recent sklearn)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(bst2, f_train, l_train, cv=kfold)
print(results)
print("CV Accuracy: %.2f%% (%.2f%%)" % (results.mean() * 100, results.std() * 100))

plot_importance(bst2)  # plot the feature importance scores
pyplot.show()
```
Grid search with GridSearchCV
```python
# Use sklearn's grid search to find the best parameter values,
# then adopt them as the default training parameters
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

bst = XGBClassifier(max_depth=2, learning_rate=0.1,
                    silent=True, objective='binary:logistic')
param_test = {'n_estimators': range(1, 51, 1)}
clf = GridSearchCV(estimator=bst, param_grid=param_test,
                   scoring='accuracy', cv=5)  # 5-fold cross-validation
clf.fit(f_train, l_train)

preds = clf.predict(f_test)  # predicts with the best parameters by default
test_accuracy = accuracy_score(l_test, preds)
print("Test Accuracy of GridSearchCV: %.2f%%" % (test_accuracy * 100.0))
clf.cv_results_, clf.best_params_, clf.best_score_  # inspect these in a notebook
```
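The comment above says the best parameters should become the training defaults, but the snippet stops at inspecting them. A minimal follow-up, not from the original post, that feeds the winning `n_estimators` back into a final model:

```python
# Hedged follow-up: retrain with the best n_estimators found by the grid search.
best_n = clf.best_params_['n_estimators']
final_bst = XGBClassifier(max_depth=2, learning_rate=0.1, n_estimators=best_n,
                          objective='binary:logistic')
final_bst.fit(f_train, l_train)
```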
3. Early stopping: early_stopping_rounds (checks whether the loss is still changing)
```python
# Standalone early-stopping example
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

num_round = 100
bst = XGBClassifier(max_depth=2, learning_rate=0.1, n_estimators=num_round,
                    silent=True, objective='binary:logistic')
eval_set = [(f_test, l_test)]
# early_stopping_rounds: stop once the metric has not improved for that many rounds
# eval_set: the data on which the metric is tracked
# verbose: print the metric after every round
bst.fit(f_train, l_train, early_stopping_rounds=10,
        eval_metric="error", eval_set=eval_set, verbose=True)

# make prediction
preds = bst.predict(f_test)
test_accuracy = accuracy_score(l_test, preds)
print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))
```
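After early stopping triggers, the fitted wrapper records where training stopped. A short follow-up, not in the original post, to inspect it:

```python
# best_iteration / best_score are set on the model when early stopping fires
print("best iteration:", bst.best_iteration)
print("best score on eval_set:", bst.best_score)
```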
4. Watching training loss on multiple datasets
```python
# Track several metrics on several datasets at once
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

num_round = 100
bst = XGBClassifier(max_depth=2, learning_rate=0.1, n_estimators=num_round,
                    silent=True, objective='binary:logistic')
eval_set = [(f_train, l_train), (f_test, l_test)]
bst.fit(f_train, l_train, eval_metric=["error", "logloss"],
        eval_set=eval_set, verbose=True)

# make prediction
preds = bst.predict(f_test)
test_accuracy = accuracy_score(l_test, preds)
print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))
```
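`verbose=True` only prints the per-round metrics; to actually look at the curves, the recorded values can be pulled back out. A minimal sketch, not in the original post (`validation_0`/`validation_1` are the names xgboost assigns to the two entries of `eval_set`):

```python
from matplotlib import pyplot

results = bst.evals_result()  # metrics recorded during fit()
epochs = len(results['validation_0']['logloss'])
pyplot.plot(range(epochs), results['validation_0']['logloss'], label='train')
pyplot.plot(range(epochs), results['validation_1']['logloss'], label='test')
pyplot.ylabel('logloss')
pyplot.legend()
pyplot.show()
```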
5. Saving and loading the model
```python
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Save the model
bst.save_model('demo.model')

# Load the model and predict
modelfile = 'demo.model'
bst = xgb.Booster({'nthread': 8}, model_file=modelfile)

f_test1 = xgb.DMatrix(f_test)  # prefer xgboost's own data matrix here
ypred1 = bst.predict(f_test1)
test_predictions = [round(value) for value in ypred1]  # threshold probabilities to 0/1
test_accuracy1 = accuracy_score(l_test, test_predictions)
print("Test Accuracy: %.2f%%" % (test_accuracy1 * 100.0))
```
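Loading back as a raw Booster drops the sklearn-style interface, which is why the predictions must be rounded by hand. An alternative sketch, not from the original post: persist the XGBClassifier itself with joblib so predict() keeps returning class labels directly.

```python
import joblib

joblib.dump(bst1, 'demo.joblib')         # bst1: the XGBClassifier trained earlier
bst1_loaded = joblib.load('demo.joblib')
print(bst1_loaded.predict(f_test)[:5])   # sklearn-style predictions, no DMatrix needed
```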
That's all for this article; hopefully it helps with your study.
Original post: https://blog.csdn.net/hot7732788/article/details/90903152