update

2025-12-13 08:26:43 +00:00 · 2020-06-29 23:46:27 +08:00
parent 3ccc32945c
commit e4a9ceafe5
3 changed files with 164 additions and 38 deletions
--- a/rf.py
+++ b/rf.py
@@ -1,6 +1,7 @@
 # _*_coding:utf-8_*_
 import numpy as np
 import pandas as pd
+import os


 def load_data(trainfile, testfile):
@@ -8,11 +9,12 @@ def load_data(trainfile, testfile):
 	testdata = pd.read_csv(testfile)
 	feature_data = traindata.iloc[:, 1:-1]
 	label_data = traindata.iloc[:, -1]
-	test_feature = testdata.iloc[:, 1:]
-	return feature_data, label_data, test_feature
+	test_feature = testdata.iloc[:, 1:-1]
+	test_label = testdata.iloc[:, -1]
+	return feature_data, label_data, test_feature, test_label


-def random_forest_train(feature_data, label_data, test_feature, submitfile):
+def random_forest_train(feature_data, label_data, test_feature):
 	from sklearn.ensemble import RandomForestRegressor
 	from sklearn.model_selection import train_test_split
 	from sklearn.metrics import mean_squared_error
@@ -23,9 +25,9 @@ def random_forest_train(feature_data, label_data, test_feature, submitfile):
 		'max_depth': 13,
 		'min_samples_split': 10,
 		'min_samples_leaf': 5,  # 10
-		'max_features': 7
+		'max_features': len(X_train.columns)
 	}
-	print(X_test)
+	# print(X_test)
 	model = RandomForestRegressor(**params)
 	model.fit(X_train, y_train)
 	# 对测试集进行预测
@@ -33,23 +35,115 @@ def random_forest_train(feature_data, label_data, test_feature, submitfile):
 	# 计算准确率
 	MSE = mean_squared_error(y_test, y_pred)
 	RMSE = np.sqrt(MSE)
-	print(RMSE)
-
+	# print(abs(y_test - y_pred) / y_test)
+	# print(RMSE)
+	'''
 	submit = pd.read_csv(submitfile)
 	print(submit)
-	print(model.predict(test_feature))
 	submit['CPU'] = model.predict(test_feature)
 	submit.to_csv('my_random_forest_prediction1.csv', index=False)
 	print(submit)
+	print(model.predict(test_feature))
+	'''
+	return model.predict(test_feature)


-def random_forest_parameter_tuning1(feature_data, label_data, test_feature):
+def linear_regression_train(feature_data, label_data, test_feature):
+	from sklearn.linear_model import LinearRegression
+	from sklearn.model_selection import train_test_split
+	from sklearn.metrics import mean_squared_error
+
+	X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.01)  # hold out 1% of the rows
+	params = {}
+	# Fit LinearRegression with library defaults (params is empty) on the training split.
+	model = LinearRegression(**params)
+	model.fit(X_train, y_train)
+	# Predict on the held-out split.
+	y_pred = model.predict(X_test)
+	# Holdout error: MSE, then its square root (RMSE).
+	MSE = mean_squared_error(y_test, y_pred)
+	RMSE = np.sqrt(MSE)
+	# NOTE: RMSE is computed but never printed or returned;
+	# the caller only receives the predictions for test_feature.
+	return model.predict(test_feature)
+
+
+def adaboost_train(feature_data, label_data, test_feature):
+	from sklearn.ensemble import AdaBoostRegressor
+	from sklearn.model_selection import train_test_split
+	from sklearn.metrics import mean_squared_error
+
+	X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.01)  # hold out 1% of the rows
+	params = {}
+	# Fit AdaBoostRegressor with library defaults (params is empty) on the training split.
+	model = AdaBoostRegressor(**params)
+	model.fit(X_train, y_train)
+	# Predict on the held-out split.
+	y_pred = model.predict(X_test)
+	# Holdout error: MSE, then its square root (RMSE).
+	MSE = mean_squared_error(y_test, y_pred)
+	RMSE = np.sqrt(MSE)
+	# NOTE: RMSE is computed but never printed or returned;
+	# the caller only receives the predictions for test_feature.
+	return model.predict(test_feature)
+
+
+def gbdt_train(feature_data, label_data, test_feature):
+	from sklearn.ensemble import GradientBoostingRegressor
+	from sklearn.model_selection import train_test_split
+	from sklearn.metrics import mean_squared_error
+
+	X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.01)  # hold out 1% of the rows
+	params = {
+		# 'loss' stays at its default (squared error); the old 'ls' alias is rejected by newer scikit-learn.
+		'n_estimators': 70,
+		'max_depth': 13,
+		'min_samples_split': 10,
+		'min_samples_leaf': 5,  # 10
+		'max_features': len(X_train.columns)  # consider every feature at each split
+	}
+	# Fit the gradient-boosted regressor on the training split.
+	model = GradientBoostingRegressor(**params)
+	model.fit(X_train, y_train)
+	# Predict on the held-out split.
+	y_pred = model.predict(X_test)
+	# Holdout error: MSE, then its square root (RMSE).
+	MSE = mean_squared_error(y_test, y_pred)
+	RMSE = np.sqrt(MSE)
+	# NOTE: RMSE is computed but never printed or returned;
+	# the caller only receives the predictions for test_feature.
+	return model.predict(test_feature)
+
+
+def decision_tree_train(feature_data, label_data, test_feature):
+	from sklearn.tree import DecisionTreeRegressor
+	from sklearn.model_selection import train_test_split
+	from sklearn.metrics import mean_squared_error
+
+	X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.01)  # hold out 1% of the rows
+	params = {
+		'max_depth': 13,
+	}
+	# Fit a single depth-limited regression tree on the training split.
+	model = DecisionTreeRegressor(**params)
+	model.fit(X_train, y_train)
+	# Predict on the held-out split.
+	y_pred = model.predict(X_test)
+	# Holdout error: MSE, then its square root (RMSE).
+	MSE = mean_squared_error(y_test, y_pred)
+	RMSE = np.sqrt(MSE)
+	# NOTE: RMSE is computed but never printed or returned;
+	# the caller only receives the predictions for test_feature.
+	return model.predict(test_feature)
+
+
+def random_forest_parameter_tuning1(feature_data, label_data):
 	from sklearn.ensemble import RandomForestRegressor
 	from sklearn.model_selection import train_test_split
 	from sklearn.metrics import mean_squared_error
 	from sklearn.model_selection import GridSearchCV

-	X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.23)
+	X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.01)
 	param_test1 = {
 		'n_estimators': range(10, 71, 10)
 	}
@@ -67,13 +161,13 @@ def random_forest_parameter_tuning1(feature_data, label_data, test_feature):
 	return model.best_score_, model.best_params_


-def random_forest_parameter_tuning2(feature_data, label_data, test_feature):
+def random_forest_parameter_tuning2(feature_data, label_data):
 	from sklearn.ensemble import RandomForestRegressor
 	from sklearn.model_selection import train_test_split
 	from sklearn.metrics import mean_squared_error
 	from sklearn.model_selection import GridSearchCV

-	X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.23)
+	X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.01)
 	param_test2 = {
 		'max_depth': range(3, 14, 2),
 		'min_samples_split': range(50, 201, 20)
@@ -98,7 +192,7 @@ def random_forest_parameter_tuning3(feature_data, label_data, test_feature):
 	from sklearn.metrics import mean_squared_error
 	from sklearn.model_selection import GridSearchCV

-	X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.23)
+	X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.01)
 	param_test3 = {
 		'min_samples_split': range(10, 90, 20),
 		'min_samples_leaf': range(10, 60, 10),
@@ -123,7 +217,7 @@ def random_forest_parameter_tuning4(feature_data, label_data, test_feature):
 	from sklearn.metrics import mean_squared_error
 	from sklearn.model_selection import GridSearchCV

-	X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.23)
+	X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, test_size=0.01)
 	param_test4 = {
 		'max_features': range(3, 9, 2)
 	}
@@ -142,8 +236,28 @@ def random_forest_parameter_tuning4(feature_data, label_data, test_feature):


 if __name__ == '__main__':
+	algorithm = os.getenv('algorithm', 'rf')  # model selector: lr | ada | gbdt | tree; anything else -> random forest
 	trainfile = 'data/train.csv'
 	testfile = 'data/test.csv'
-	submitfile = 'data/sample_submit.csv'
-	feature_data, label_data, test_feature = load_data(trainfile, testfile)
-	random_forest_train(feature_data, label_data, test_feature, submitfile)
+	feature_data, label_data, test_feature, test_label = load_data(trainfile, testfile)
+	if algorithm == 'lr':
+		y_pred = linear_regression_train(feature_data, label_data, test_feature)
+	elif algorithm == 'ada':
+		y_pred = adaboost_train(feature_data, label_data, test_feature)
+	elif algorithm == 'gbdt':
+		y_pred = gbdt_train(feature_data, label_data, test_feature)  # was adaboost_train, so GBDT never ran
+	elif algorithm == 'tree':
+		y_pred = decision_tree_train(feature_data, label_data, test_feature)
+	else:
+		y_pred = random_forest_train(feature_data, label_data, test_feature)
+
+	from sklearn.metrics import mean_squared_error
+
+	MSE = mean_squared_error(test_label, y_pred)
+	RMSE = np.sqrt(MSE)
+	var = np.var(test_label)
+	r2 = 1 - MSE / var  # coefficient of determination from MSE and label variance
+	# Report RMSE and R^2, then preview up to 20 rows as: label, prediction, relative error.
+	print(RMSE, r2)
+	for i in range(min(20, len(y_pred))):  # do not run past the end of a short test set
+		print("{},{},{}".format(test_label.iloc[i], y_pred[i], (y_pred[i] - test_label.iloc[i]) / test_label.iloc[i]))