from turtle import shape  
	import pandas as pd  
	import  pprint  
	import numpy as np  
	import scipy.stats as stats  
	from  sklearn import random_projection  
	from sklearn.ensemble import RandomForestRegressor  
	from sklearn.model_selection import RandomizedSearchCV  
	from sklearn.model_selection import GridSearchCV  
	import matplotlib.pyplot as plt  
	from sklearn import metrics  
	from sklearn.model_selection import train_test_split  
	from openpyxl import load_workbook  
	from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score  
	from sklearn.metrics import accuracy_score  
	# Load the pre-split training/test features and targets from Excel.
	# The paths are raw strings so the Windows backslashes stay literal: in a
	# plain string, sequences such as "\R" or "\X" are invalid escapes that
	# newer Python versions warn about.
	X_train = pd.read_excel(r"D:\科研\RandomForest\新建文件夹\X_train.xlsx")
	X_test = pd.read_excel(r"D:\科研\RandomForest\新建文件夹\X_test.xlsx")
	Y_train = pd.read_excel(r"D:\科研\RandomForest\新建文件夹\Y_train.xlsx")
	Y_test = pd.read_excel(r"D:\科研\RandomForest\新建文件夹\Y_test.xlsx")

	# Flatten the training target to a 1-D array; the features stay 2-D.
	Y_train = Y_train.values.ravel()

	# Sanity-check the shapes of the four data sets.
	print(X_test.shape)
	print(Y_test.shape)
	print(Y_train.shape)
	print(X_train.shape)

1、调n_estimators参数 :

	
	# Tune n_estimators: record the mean 10-fold cross-validation score for
	# every forest size in range(10, 200).
	ScoreAll = []
	for i in range(10, 200, 1):
	    DT = RandomForestRegressor(n_estimators=i, random_state=66)
	    score = cross_val_score(DT, X_train, Y_train, cv=10).mean()
	    ScoreAll.append([i, score])
	ScoreAll = np.array(ScoreAll)  # column 0: parameter value, column 1: mean CV score

	# Row index of the highest score. Only the score column is searched;
	# comparing the whole array against the maximum would also match a
	# parameter value that happened to equal it.
	max_score = np.argmax(ScoreAll[:, 1])
	print("最优参数以及最高得分:",ScoreAll[max_score])
	plt.figure(figsize=[20, 5])
	plt.plot(ScoreAll[:, 0], ScoreAll[:, 1])
	plt.show()

可以看到，103为得分最高点，暂定n_estimators为103，接着调下边的参数。

2、探索max_depth的最佳参数

	# Tune max_depth (values 10, 13, ..., 28) with n_estimators fixed at 103.
	ScoreAll = []
	for i in range(10, 30, 3):
	    DT = RandomForestRegressor(n_estimators=103, random_state=66, max_depth=i)
	    score = cross_val_score(DT, X_train, Y_train, cv=10).mean()
	    ScoreAll.append([i, score])
	ScoreAll = np.array(ScoreAll)  # column 0: parameter value, column 1: mean CV score

	# Row index of the highest score, searched in the score column only so a
	# parameter value can never be mistaken for the maximum score.
	max_score = np.argmax(ScoreAll[:, 1])
	print("最优参数以及最高得分:",ScoreAll[max_score])
	plt.figure(figsize=[20, 5])
	plt.plot(ScoreAll[:, 0], ScoreAll[:, 1])
	plt.show()

根据曲线，暂定树深为14。

3、min_samples_split的最佳参数

	# Tune min_samples_split (2..9) with n_estimators=103 and max_depth=14 fixed.
	ScoreAll = []
	for i in range(2, 10, 1):
	    DT = RandomForestRegressor(n_estimators=103, random_state=66, max_depth=14, min_samples_split=i)
	    score = cross_val_score(DT, X_train, Y_train, cv=10).mean()
	    ScoreAll.append([i, score])
	ScoreAll = np.array(ScoreAll)  # column 0: parameter value, column 1: mean CV score

	# Report the best row and draw the score curve, as the n_estimators and
	# max_depth steps do; the conclusion drawn in the text relies on this curve.
	max_score = np.argmax(ScoreAll[:, 1])
	print("最优参数以及最高得分:",ScoreAll[max_score])
	plt.figure(figsize=[20, 5])
	plt.plot(ScoreAll[:, 0], ScoreAll[:, 1])
	plt.show()

min_samples_split 从 2 开始增大后，模型得分先快速下降，之后又有所升高，说明出现了过拟合的情况。故最小分裂样本数（min_samples_split）选择 2。

4、min_samples_leaf最佳参数

调参代码：

	# Tune min_samples_leaf (2..9) with n_estimators=103, max_depth=14 and
	# min_samples_split=2 fixed.
	ScoreAll = []
	for i in range(2, 10, 1):
	    DT = RandomForestRegressor(n_estimators=103, random_state=66, max_depth=14, min_samples_leaf=i, min_samples_split=2)
	    score = cross_val_score(DT, X_train, Y_train, cv=10).mean()
	    ScoreAll.append([i, score])
	ScoreAll = np.array(ScoreAll)  # column 0: parameter value, column 1: mean CV score

	# Report the best row and draw the score curve, as the n_estimators and
	# max_depth steps do; the conclusion drawn in the text relies on this curve.
	max_score = np.argmax(ScoreAll[:, 1])
	print("最优参数以及最高得分:",ScoreAll[max_score])
	plt.figure(figsize=[20, 5])
	plt.plot(ScoreAll[:, 0], ScoreAll[:, 1])
	plt.show()

根据曲线显示，最小叶子数选为7

5、max_features调参

	# Grid-search max_features (as a fraction of the features) with the other
	# hyper-parameters fixed.
	# np.arange(0.1, 1) uses the default step of 1 and therefore yields only
	# [0.1], i.e. a single candidate. np.linspace gives the intended sweep
	# 0.1, 0.2, ..., 1.0.
	param_grid = {
	    'max_features': np.linspace(0.1, 1.0, 10)}
	rfc = RandomForestRegressor(random_state=66, n_estimators=103, max_depth=14, min_samples_leaf=7, min_samples_split=2)
	GS = GridSearchCV(rfc, param_grid, cv=10)
	GS.fit(X_train, Y_train)
	print(GS.best_params_)
	print(GS.best_score_)

6、小范围修改

由于 min_samples_leaf 和 min_samples_split 会相互影响，故把它们与 max_features 这三个参数一起进行网格搜索：

	# Search max_features, min_samples_leaf and min_samples_split jointly, with
	# n_estimators and max_depth fixed.
	# max_features uses np.linspace because np.arange(0.1, 1) (default step 1)
	# yields only [0.1], which would leave that parameter effectively unsearched.
	param_grid = {
	    'max_features': np.linspace(0.1, 1.0, 10),
	    'min_samples_leaf': np.arange(5, 15),
	    'min_samples_split': np.arange(2, 10),
	    }
	rfc = RandomForestRegressor(random_state=66, n_estimators=103, max_depth=14)
	GS = GridSearchCV(rfc, param_grid, cv=10)
	GS.fit(X_train, Y_train)
	print(GS.best_params_)
	print(GS.best_score_)

结果：

{'max_features': 0.1, 
'min_samples_leaf': 5,
 'min_samples_split': 2}
0.5697799544764119

参数值:

n_estimators	103
max_depth	14
max_features	0.1
min_samples_leaf	5
min_samples_split	2

7、在得到的最优参数附近进行小范围网格搜索

	import time

	# Fine grid search over all five hyper-parameters, timed end to end.
	start = time.time()
	# max_features uses np.linspace because np.arange(0.1, 1) (default step 1)
	# yields only [0.1]. Every extra candidate multiplies the number of fits,
	# so narrow this range if the search takes too long.
	param_grid = {
	    'n_estimators': np.arange(100, 110),
	    'max_depth': np.arange(12, 16),
	    'min_samples_leaf': np.arange(1, 8),
	    'min_samples_split': np.arange(2, 5),
	    'max_features': np.linspace(0.1, 1.0, 10),
	}

	rfc = RandomForestRegressor(random_state=66)
	GS = GridSearchCV(rfc, param_grid, cv=10)
	GS.fit(X_train, Y_train)
	end = time.time()
	print("循环运行时间:%.2f秒"%(end-start))
	print(GS.best_params_)
	print(GS.best_score_)

结果：

参数	旧值	新值
n_estimators	103	106
max_depth	14	14
max_features	0.1	0.1
min_samples_leaf	5	1
min_samples_split	2	3

8、随机森林回归预测

def test_RandomForestRegressor(X_train,X_test,Y_train,Y_test):
    """Fit the tuned random forest, print its scores and export test predictions.

    Args:
        X_train: Training features.
        X_test: Test features.
        Y_train: Training target.
        Y_test: Test target, used only for the reported testing score.

    Returns:
        None. The training and testing scores (``regr.score``) are printed and
        the predictions for ``X_test`` are written to an Excel workbook.
    """
    # NOTE(review): the final grid-search table in this article lists
    # n_estimators=106, while 103 is used (and printed) here -- confirm which
    # value is intended.
    # random_state matches the value used in the tuning steps so the reported
    # scores are reproducible.
    regr = RandomForestRegressor(
        n_estimators=103,
        max_depth=14,
        min_samples_split=3,
        min_samples_leaf=1,
        max_features=0.1,
        random_state=66,
    )
    regr.fit(X_train, Y_train)
    print("树木个数:103,max_depth=14,split=2,min_samples_split=3,min_samples_leaf=1,max_feature=0.1","Traing Score:%f"%regr.score(X_train,Y_train),"Testing Score:%f"%regr.score(X_test,Y_test))

    # Predict once and wrap the result in a DataFrame for export.
    predictions = regr.predict(X_test)
    data = pd.DataFrame(predictions)

    # The context manager saves and closes the workbook on exit;
    # ExcelWriter.save() no longer exists in pandas 2.x. The raw string keeps
    # the Windows backslashes literal.
    with pd.ExcelWriter(r"D:\科研\RandomForest\新建文件夹\predict06.xlsx") as writer:
        # 'num' is the name of the worksheet that receives the predictions.
        data.to_excel(writer, sheet_name='num', na_rep='nana', index=False)


test_RandomForestRegressor(X_train,X_test,Y_train,Y_test)

感谢：本文调参参考了疯狂学GIS的博文，从中学习颇多。https://blog.csdn.net/zhebushibiaoshifu/category_10827929.html?spm=1001.2014.3001.5515