mirror of https://github.com/newnius/YAO-optimizer.git synced 2025-12-13 08:26:43 +00:00
commit e4a9ceafe5
parent 3ccc32945c
Date: 2020-06-29 23:46:27 +08:00
3 changed files with 164 additions and 38 deletions

rf.py (148 changed lines)

@@ -1,6 +1,7 @@
 # _*_coding:utf-8_*_
 import numpy as np
 import pandas as pd
+import os
 def load_data(trainfile, testfile):
@@ -8,11 +9,12 @@ def load_data(trainfile, testfile):
     testdata = pd.read_csv(testfile)
     feature_data = traindata.iloc[:, 1:-1]
     label_data = traindata.iloc[:, -1]
-    test_feature = testdata.iloc[:, 1:]
-    return feature_data, label_data, test_feature
+    test_feature = testdata.iloc[:, 1:-1]
+    test_label = testdata.iloc[:, -1]
+    return feature_data, label_data, test_feature, test_label
-def random_forest_train(feature_data, label_data, test_feature, submitfile):
+def random_forest_train(feature_data, label_data, test_feature):
     from sklearn.ensemble import RandomForestRegressor
     from sklearn.model_selection import train_test_split
     from sklearn.metrics import mean_squared_error
@@ -23,9 +25,9 @@ def random_forest_train(feature_data, label_data, test_feature, submitfile):
         'max_depth': 13,
         'min_samples_split': 10,
         'min_samples_leaf': 5,  # 10
-        'max_features': 7
+        'max_features': len(X_train.columns)
     }
-    print(X_test)
+    # print(X_test)
     model = RandomForestRegressor(**params)
     model.fit(X_train, y_train)
     # Predict on the test set
@@ -33,23 +35,115 @@ def random_forest_train(feature_data, label_data, test_feature, submitfile):
     # Compute the error
     MSE = mean_squared_error(y_test, y_pred)
     RMSE = np.sqrt(MSE)
-    print(RMSE)
+    # print(abs(y_test - y_pred) / y_test)
+    # print(RMSE)
+    '''
     submit = pd.read_csv(submitfile)
     print(submit)
+    print(model.predict(test_feature))
     submit['CPU'] = model.predict(test_feature)
     submit.to_csv('my_random_forest_prediction1.csv', index=False)
-    print(submit)
-    print(model.predict(test_feature))
+    '''
+    return model.predict(test_feature)
-def random_forest_parameter_tuning1(feature_data, label_data, test_feature):
+def linear_regression_train(feature_data, label_data, test_feature):
+    from sklearn.linear_model import LinearRegression
+    from sklearn.model_selection import train_test_split
+    from sklearn.metrics import mean_squared_error
+    X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.01)
+    params = {}
+    # print(X_test)
+    model = LinearRegression(**params)
+    model.fit(X_train, y_train)
+    # Predict on the test set
+    y_pred = model.predict(X_test)
+    # Compute the error
+    MSE = mean_squared_error(y_test, y_pred)
+    RMSE = np.sqrt(MSE)
+    # print(abs(y_test - y_pred) / y_test)
+    # print(RMSE)
+    return model.predict(test_feature)
+def adaboost_train(feature_data, label_data, test_feature):
+    from sklearn.ensemble import AdaBoostRegressor
+    from sklearn.model_selection import train_test_split
+    from sklearn.metrics import mean_squared_error
+    X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.01)
+    params = {}
+    # print(X_test)
+    model = AdaBoostRegressor(**params)
+    model.fit(X_train, y_train)
+    # Predict on the test set
+    y_pred = model.predict(X_test)
+    # Compute the error
+    MSE = mean_squared_error(y_test, y_pred)
+    RMSE = np.sqrt(MSE)
+    # print(abs(y_test - y_pred) / y_test)
+    # print(RMSE)
+    return model.predict(test_feature)
+def gbdt_train(feature_data, label_data, test_feature):
+    from sklearn.ensemble import GradientBoostingRegressor
+    from sklearn.model_selection import train_test_split
+    from sklearn.metrics import mean_squared_error
+    X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.01)
+    params = {
+        'loss': 'ls',
+        'n_estimators': 70,
+        'max_depth': 13,
+        'min_samples_split': 10,
+        'min_samples_leaf': 5,  # 10
+        'max_features': len(X_train.columns)
+    }
+    # print(X_test)
+    model = GradientBoostingRegressor(**params)
+    model.fit(X_train, y_train)
+    # Predict on the test set
+    y_pred = model.predict(X_test)
+    # Compute the error
+    MSE = mean_squared_error(y_test, y_pred)
+    RMSE = np.sqrt(MSE)
+    # print(abs(y_test - y_pred) / y_test)
+    # print(RMSE)
+    return model.predict(test_feature)
+def decision_tree_train(feature_data, label_data, test_feature):
+    from sklearn.tree import DecisionTreeRegressor
+    from sklearn.model_selection import train_test_split
+    from sklearn.metrics import mean_squared_error
+    X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.01)
+    params = {
+        'max_depth': 13,
+    }
+    # print(X_test)
+    model = DecisionTreeRegressor(**params)
+    model.fit(X_train, y_train)
+    # Predict on the test set
+    y_pred = model.predict(X_test)
+    # Compute the error
+    MSE = mean_squared_error(y_test, y_pred)
+    RMSE = np.sqrt(MSE)
+    # print(abs(y_test - y_pred) / y_test)
+    # print(RMSE)
+    return model.predict(test_feature)
+def random_forest_parameter_tuning1(feature_data, label_data):
     from sklearn.ensemble import RandomForestRegressor
     from sklearn.model_selection import train_test_split
     from sklearn.metrics import mean_squared_error
     from sklearn.model_selection import GridSearchCV
-    X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.23)
+    X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.01)
     param_test1 = {
         'n_estimators': range(10, 71, 10)
     }
@@ -67,13 +161,13 @@ def random_forest_parameter_tuning1(feature_data, label_data, test_feature):
     return model.best_score_, model.best_params_
-def random_forest_parameter_tuning2(feature_data, label_data, test_feature):
+def random_forest_parameter_tuning2(feature_data, label_data):
     from sklearn.ensemble import RandomForestRegressor
     from sklearn.model_selection import train_test_split
     from sklearn.metrics import mean_squared_error
     from sklearn.model_selection import GridSearchCV
-    X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.23)
+    X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.01)
     param_test2 = {
         'max_depth': range(3, 14, 2),
         'min_samples_split': range(50, 201, 20)
@@ -98,7 +192,7 @@ def random_forest_parameter_tuning3(feature_data, label_data, test_feature):
     from sklearn.metrics import mean_squared_error
     from sklearn.model_selection import GridSearchCV
-    X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.23)
+    X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.01)
     param_test3 = {
         'min_samples_split': range(10, 90, 20),
         'min_samples_leaf': range(10, 60, 10),
@@ -123,7 +217,7 @@ def random_forest_parameter_tuning4(feature_data, label_data, test_feature):
     from sklearn.metrics import mean_squared_error
     from sklearn.model_selection import GridSearchCV
-    X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.23)
+    X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.01)
     param_test4 = {
         'max_features': range(3, 9, 2)
     }
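The diff elides the bodies of the tuning functions; only the parameter grids and the visible `return model.best_score_, model.best_params_` line survive. A minimal sketch of how a grid like `param_test1` is presumably fed to GridSearchCV — the helper name `tune_random_forest`, the estimator defaults, the scoring metric, and the fold count below are assumptions, not taken from the commit:

# Sketch only: scoring and cv are assumed, not from the commit.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

def tune_random_forest(feature_data, label_data, param_grid):
    # e.g. param_grid = {'n_estimators': range(10, 71, 10)}
    model = GridSearchCV(estimator=RandomForestRegressor(),  # assumed base estimator
                         param_grid=param_grid,
                         scoring='r2',  # assumed metric
                         cv=5)          # assumed fold count
    model.fit(feature_data, label_data)
    # Mirrors the visible return statement in the tuning functions
    return model.best_score_, model.best_params_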
@@ -142,8 +236,28 @@ def random_forest_parameter_tuning4(feature_data, label_data, test_feature):
 if __name__ == '__main__':
+    algorithm = os.getenv('algorithm', 'rf')
     trainfile = 'data/train.csv'
     testfile = 'data/test.csv'
-    submitfile = 'data/sample_submit.csv'
-    feature_data, label_data, test_feature = load_data(trainfile, testfile)
-    random_forest_train(feature_data, label_data, test_feature, submitfile)
+    feature_data, label_data, test_feature, test_label = load_data(trainfile, testfile)
+    if algorithm == 'lr':
+        y_pred = linear_regression_train(feature_data, label_data, test_feature)
+    elif algorithm == 'ada':
+        y_pred = adaboost_train(feature_data, label_data, test_feature)
+    elif algorithm == 'gbdt':
+        y_pred = gbdt_train(feature_data, label_data, test_feature)
+    elif algorithm == 'tree':
+        y_pred = decision_tree_train(feature_data, label_data, test_feature)
+    else:
+        y_pred = random_forest_train(feature_data, label_data, test_feature)
+    from sklearn.metrics import mean_squared_error
+    MSE = mean_squared_error(test_label, y_pred)
+    RMSE = np.sqrt(MSE)
+    var = np.var(test_label)
+    r2 = 1 - MSE / var
+    # print(abs(test_label - y_pred) / test_label)
+    print(RMSE, r2)
+    for i in range(20):
+        print("{},{},{}".format(test_label[i], y_pred[i], (y_pred[i] - test_label[i]) / test_label[i]))