diff --git a/rf.py b/rf.py
index 42ab42b..a61c63d 100644
--- a/rf.py
+++ b/rf.py
@@ -1,6 +1,7 @@
# _*_coding:utf-8_*_
import numpy as np
import pandas as pd
+import os
def load_data(trainfile, testfile):
@@ -8,11 +9,12 @@ def load_data(trainfile, testfile):
testdata = pd.read_csv(testfile)
feature_data = traindata.iloc[:, 1:-1]
label_data = traindata.iloc[:, -1]
- test_feature = testdata.iloc[:, 1:]
- return feature_data, label_data, test_feature
+ test_feature = testdata.iloc[:, 1:-1]
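+    # test.csv now carries the true label in its last column; it is split off here for offline evaluation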
+ test_label = testdata.iloc[:, -1]
+ return feature_data, label_data, test_feature, test_label
-def random_forest_train(feature_data, label_data, test_feature, submitfile):
+def random_forest_train(feature_data, label_data, test_feature):
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
@@ -23,9 +25,9 @@ def random_forest_train(feature_data, label_data, test_feature, submitfile):
'max_depth': 13,
'min_samples_split': 10,
'min_samples_leaf': 5, # 10
- 'max_features': 7
+ 'max_features': len(X_train.columns)
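+        # consider every feature at each split; X_train is a DataFrame, so len(X_train.columns) is the feature count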
}
- print(X_test)
+ # print(X_test)
model = RandomForestRegressor(**params)
model.fit(X_train, y_train)
    # predict on the held-out test split
@@ -33,23 +35,115 @@ def random_forest_train(feature_data, label_data, test_feature, submitfile):
    # compute the error metrics (MSE and RMSE)
MSE = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(MSE)
- print(RMSE)
-
+ # print(abs(y_test - y_pred) / y_test)
+ # print(RMSE)
+ '''
submit = pd.read_csv(submitfile)
print(submit)
- print(model.predict(test_feature))
submit['CPU'] = model.predict(test_feature)
submit.to_csv('my_random_forest_prediction1.csv', index=False)
print(submit)
+ print(model.predict(test_feature))
+ '''
+ return model.predict(test_feature)
-def random_forest_parameter_tuning1(feature_data, label_data, test_feature):
+def linear_regression_train(feature_data, label_data, test_feature):
+ from sklearn.linear_model import LinearRegression
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import mean_squared_error
+
+ X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.01)
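+    # only 1% is held out here; the final scoring in __main__ uses the separate test file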
+ params = {}
+ # print(X_test)
+ model = LinearRegression(**params)
+ model.fit(X_train, y_train)
+    # predict on the held-out test split
+ y_pred = model.predict(X_test)
+    # compute the error metrics (MSE and RMSE)
+ MSE = mean_squared_error(y_test, y_pred)
+ RMSE = np.sqrt(MSE)
+ # print(abs(y_test - y_pred) / y_test)
+ # print(RMSE)
+ return model.predict(test_feature)
+
+
+def adaboost_train(feature_data, label_data, test_feature):
+ from sklearn.ensemble import AdaBoostRegressor
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import mean_squared_error
+
+ X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.01)
+ params = {}
+ # print(X_test)
+ model = AdaBoostRegressor(**params)
+ model.fit(X_train, y_train)
+    # predict on the held-out test split
+ y_pred = model.predict(X_test)
+    # compute the error metrics (MSE and RMSE)
+ MSE = mean_squared_error(y_test, y_pred)
+ RMSE = np.sqrt(MSE)
+ # print(abs(y_test - y_pred) / y_test)
+ # print(RMSE)
+ return model.predict(test_feature)
+
+
+def gbdt_train(feature_data, label_data, test_feature):
+ from sklearn.ensemble import GradientBoostingRegressor
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import mean_squared_error
+
+ X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.01)
+ params = {
+ 'loss': 'ls',
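+        # 'ls' is the least-squares loss (renamed to 'squared_error' in newer scikit-learn releases)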
+ 'n_estimators': 70,
+ 'max_depth': 13,
+ 'min_samples_split': 10,
+ 'min_samples_leaf': 5, # 10
+ 'max_features': len(X_train.columns)
+ }
+ # print(X_test)
+ model = GradientBoostingRegressor(**params)
+ model.fit(X_train, y_train)
+    # predict on the held-out test split
+ y_pred = model.predict(X_test)
+    # compute the error metrics (MSE and RMSE)
+ MSE = mean_squared_error(y_test, y_pred)
+ RMSE = np.sqrt(MSE)
+ # print(abs(y_test - y_pred) / y_test)
+ # print(RMSE)
+ return model.predict(test_feature)
+
+
+def decision_tree_train(feature_data, label_data, test_feature):
+ from sklearn.tree import DecisionTreeRegressor
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import mean_squared_error
+
+ X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.01)
+ params = {
+ 'max_depth': 13,
+ }
+ # print(X_test)
+ model = DecisionTreeRegressor(**params)
+ model.fit(X_train, y_train)
+    # predict on the held-out test split
+ y_pred = model.predict(X_test)
+    # compute the error metrics (MSE and RMSE)
+ MSE = mean_squared_error(y_test, y_pred)
+ RMSE = np.sqrt(MSE)
+ # print(abs(y_test - y_pred) / y_test)
+ # print(RMSE)
+ return model.predict(test_feature)
+
+
+def random_forest_parameter_tuning1(feature_data, label_data):
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
- X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.23)
+ X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.01)
param_test1 = {
'n_estimators': range(10, 71, 10)
}
@@ -67,13 +161,13 @@ def random_forest_parameter_tuning1(feature_data, label_data, test_feature):
return model.best_score_, model.best_params_
-def random_forest_parameter_tuning2(feature_data, label_data, test_feature):
+def random_forest_parameter_tuning2(feature_data, label_data):
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
- X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.23)
+ X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.01)
param_test2 = {
'max_depth': range(3, 14, 2),
'min_samples_split': range(50, 201, 20)
@@ -98,7 +192,7 @@ def random_forest_parameter_tuning3(feature_data, label_data, test_feature):
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
- X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.23)
+ X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.01)
param_test3 = {
'min_samples_split': range(10, 90, 20),
'min_samples_leaf': range(10, 60, 10),
@@ -123,7 +217,7 @@ def random_forest_parameter_tuning4(feature_data, label_data, test_feature):
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
- X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.23)
+ X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.01)
param_test4 = {
'max_features': range(3, 9, 2)
}
@@ -142,8 +236,28 @@ def random_forest_parameter_tuning4(feature_data, label_data, test_feature):
if __name__ == '__main__':
+ algorithm = os.getenv('algorithm', 'rf')
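+    # choose the model via the 'algorithm' environment variable: lr, ada, gbdt, tree, or rf (default)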
trainfile = 'data/train.csv'
testfile = 'data/test.csv'
- submitfile = 'data/sample_submit.csv'
- feature_data, label_data, test_feature = load_data(trainfile, testfile)
- random_forest_train(feature_data, label_data, test_feature, submitfile)
+ feature_data, label_data, test_feature, test_label = load_data(trainfile, testfile)
+ if algorithm == 'lr':
+ y_pred = linear_regression_train(feature_data, label_data, test_feature)
+ elif algorithm == 'ada':
+ y_pred = adaboost_train(feature_data, label_data, test_feature)
+ elif algorithm == 'gbdt':
+        y_pred = gbdt_train(feature_data, label_data, test_feature)
+ elif algorithm == 'tree':
+ y_pred = decision_tree_train(feature_data, label_data, test_feature)
+ else:
+ y_pred = random_forest_train(feature_data, label_data, test_feature)
+
+ from sklearn.metrics import mean_squared_error
+
+ MSE = mean_squared_error(test_label, y_pred)
+ RMSE = np.sqrt(MSE)
+ var = np.var(test_label)
+ r2 = 1 - MSE / var
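+    # coefficient of determination computed by hand: R^2 = 1 - MSE / Var(y_true)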
+ # print(abs(test_label - y_pred) / test_label)
+ print(RMSE, r2)
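+    # spot-check the first 20 test rows: true value, prediction, and relative error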
+ for i in range(20):
+ print("{},{},{}".format(test_label[i], y_pred[i], (y_pred[i] - test_label[i]) / test_label[i]))
diff --git a/serve.py b/serve.py
index e5862ff..b7960b5 100644
--- a/serve.py
+++ b/serve.py
@@ -62,7 +62,7 @@ def train_models(job):
    # compute the error metrics (MSE and RMSE)
MSE = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(MSE)
- print('RMSE of ' + job + ' is ' + str(RMSE))
+        print('RMSE of {}:{} is {}'.format(job, label, RMSE))
models[job]['lock'].release()
@@ -181,7 +181,8 @@ class MyHandler(BaseHTTPRequestHandler):
csvfile, delimiter=',',
quotechar='|', quoting=csv.QUOTE_MINIMAL
)
- spamwriter.writerow(values)
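+                    # write the submitted row five times instead of once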
+ for i in range(5):
+ spamwriter.writerow(values)
models[job]['lock'].release()
msg = {'code': 0, 'error': ""}