|
楼主 |
发表于 2017-6-7 20:02:50
|
显示全部楼层
- % random forest随机森林 RF
- clc,clear,close all
- warning off
- feature jit off
- data = xlsread('数据3.xlsx',3,'B3:P137'); % 加载数据
- rng(1945,'twister') % For reproducibility
- % OOB(Out-of-Bag)样本
- x = data(:,2:end); % 各个指标
- y = data(:,1); % 输出,按照年份聚类
- ntree = 300; % CART决策树的数量
- BaggedEnsemble = TreeBagger(ntree,x,y,'Method','C','oobvarimp','on','OOBPred','On','NVarToSample',2,'MinLeaf',5 );
- %% 原始样本分布情况
- % RF来计算样本的相似度矩阵(Proximity Matrix)
- Prox = fillProximities(BaggedEnsemble);
- PM = Prox.Proximity;
- figure(1),
- [mdsPM,eigvalue] = mdsProx(Prox,'Colors','rgbcmshpkx*.o><');
- xlabel('Dim1');ylabel('Dim2');title('相似度矩阵投影图')
- %% Outliers
- % 异常样本也称作离群数据(Outliers),通常定义为远离数据主体的样本。
- OM = Prox.OutlierMeasure;
- figure(2),bar(OM);
- xlabel('样本数');ylabel('Outliers');
- % 剔除异常样本
- abnormal = find(OM>3);
- data(abnormal,:) = []; % 删除异常样本
- x = data(:,2:end); % 更新各个指标
- y = data(:,1); % 更新输出,按照年份聚类
- ntree = 300; % CART决策树的数量
- BaggedEnsemble = TreeBagger(ntree,x,y,'Method','C','oobvarimp','on','OOBPred','On','NVarToSample',2,'MinLeaf',5 );
- Prox=[];Prox = fillProximities(BaggedEnsemble);
- PM=[];PM = Prox.Proximity;
- figure(111),
- [mdsPM,eigvalue] = mdsProx(Prox,'Colors','rgbcmshpkx*.o><');
- xlabel('Dim1');ylabel('Dim2');title('去除异常样本的相似度矩阵投影图')
- xx = x;
- yy = y;
- %% OOB估计误差
- ntree = 300; % CART决策树的数量
- BaggedEnsemble=[];
- BaggedEnsemble = TreeBagger(ntree,x,y,'Method','C','oobvarimp','on','OOBPred','On','NVarToSample',1,'MinLeaf',5 );
- oobErrorBaggedEnsemble=[];oobErrorBaggedEnsemble = oobError(BaggedEnsemble);
- figure(3),plot(oobErrorBaggedEnsemble)
- xlabel(['决策树数量,ntree=',num2str(ntree)]);
- ylabel('OOB分类误差');
- ntree = 500; % CART决策树的数量
- BaggedEnsemble=[];
- BaggedEnsemble = TreeBagger(ntree,x,y,'Method','C','oobvarimp','on','OOBPred','On','NVarToSample',1,'MinLeaf',5 );
- oobErrorBaggedEnsemble=[];oobErrorBaggedEnsemble = oobError(BaggedEnsemble);
- figure(4),plot(oobErrorBaggedEnsemble)
- xlabel(['决策树数量,ntree=',num2str(ntree)]);
- ylabel('OOB分类误差');
- ntree = 1000; % CART决策树的数量
- BaggedEnsemble=[];
- BaggedEnsemble = TreeBagger(ntree,x,y,'Method','C','oobvarimp','on','OOBPred','On','NVarToSample',1,'MinLeaf',5 );
- oobErrorBaggedEnsemble=[];oobErrorBaggedEnsemble = oobError(BaggedEnsemble);
- figure(5),plot(oobErrorBaggedEnsemble)
- xlabel(['决策树数量,ntree=',num2str(ntree)]);
- ylabel('OOB分类误差');
- %% 确定mtry
- for mtry=1:15
- ntree = 300; % CART决策树的数量
- BaggedEnsemble=[];
- BaggedEnsemble = TreeBagger(ntree,x,y,'Method','C','OOBPred','On','NVarToSample',mtry,'MinLeaf',5 );
- oobErrorBaggedEnsemble = oobError(BaggedEnsemble);
- ROC(mtry) = mean(oobErrorBaggedEnsemble);
- end
- figure(6),plot(ROC,'o-');
- title(['best mtry = ',num2str( find(ROC==max(ROC)) )])
- xlabel('mtry'); ylabel('ROC(Repeated Cross-Validation)');
- %% 变量的重要性求解
- ntree = 300; % CART决策树的数量
- BaggedEnsemble=[];
- BaggedEnsemble = TreeBagger(ntree,x,y,'Method','C','oobvarimp','on','OOBPred','On','NVarToSample',1,'MinLeaf',5 );
- Prox=[];Prox = fillProximities(BaggedEnsemble); % RF来计算样本的相似度矩阵(Proximity Matrix)
- MDA = Prox.OOBPermutedVarDeltaError;
- MDG = Prox.OOBPermutedVarDeltaMeanMargin;
- % MDA、MDG画图
- [wu,sortMDA] = sort(MDA,'ascend'); % 升序排列
- [wu,sortMDG] = sort(MDG,'ascend'); % 升序排列
- index = {'应急响应能力X21','耕地灌溉率X2','饮水保障X3','抗旱经费支出X10',...
- '水库蓄水能力X20','作物种植比例X4','农业总产值比例X18','人均粮食产量X16',...
- '旱地比重X5','人均耕地面积X1','农用设施配备比例X19','外出务工比例X12',...
- '土壤墒情X8','年平均降水量X7'}';
- figure(7),
- axes1 = axes('Position',[0.3 0.15 0.6 0.7]);
- axis(axes1);
- for i=1:length(sortMDA)
- plot(i,MDA( sortMDA(i) ),'ro');
- hold on
- text(-6.2,min(MDA)+(i-1)*(max(MDA)-min(MDA))/length(sortMDA),index( sortMDA(i) ));
- end
- set(gca,'ytick',[]) % 同时去掉 x 轴和 y 轴的刻度
- set(gca, 'XGrid', 'on'); % 网格
- set(gca, 'YGrid', 'on'); % 网格
- xlabel('MDA')
- hold off
- figure(8),
- axes1 = axes('Position',[0.3 0.15 0.6 0.7]);
- axis(axes1);
- for i=1:length(sortMDG)
- plot(i,MDG( sortMDG(i) ),'ro');
- hold on
- text(-6.2,min(MDG)+(i-1)*(max(MDG)-min(MDG))/length(sortMDG),index( sortMDG(i) ));
- end
- set(gca,'ytick',[]) % 同时去掉 x 轴和 y 轴的刻度
- set(gca, 'XGrid', 'on'); % 网格
- set(gca, 'YGrid', 'on'); % 网格
- xlabel('MDG')
- hold off
- %% 按照MDG进行删除指标计算ROC
- ntree = 300; % CART决策树的数量
- mtry = 1; % CART决策树生长过程中分枝时随机选择的变量个数
- indexP = ones(length(sortMDG),1);
- for i=1:13 % 删除前8个不重要指标
- x=xx;
- x(:, sortMDG(1:i) ) = [];
- BaggedEnsemble=[];
- BaggedEnsemble = TreeBagger(ntree,x,y,'Method','C','OOBPred','On','NVarToSample',mtry,'MinLeaf',5 );
- oobErrorBaggedEnsemble=[];
- oobErrorBaggedEnsemble = oobError(BaggedEnsemble);
- AUC(i) = mean(oobErrorBaggedEnsemble);
- indexP(i) = i/13;
- end
- figure(9),plot(AUC,'r-.')
- xlabel('删除不重要指标的个数');ylabel('OOB-AUC');
- %% 重复交叉验证过程指标选择概率排序
- figure(10),
- axes2 = axes('Position',[0.3 0.15 0.6 0.7]);
- axis(axes2);
- for i=1:length(sortMDG)
- plot(indexP(i),i,'ro');
- hold on
- text(-0.43,i,index( sortMDG(i) ));
- end
- set(gca,'ytick',[]) % 同时去掉 x 轴和 y 轴的刻度
- set(gca, 'XGrid', 'on'); % 网格
- set(gca, 'YGrid', 'on'); % 网格
- xlabel('指标被选中的概率')
- hold off
复制代码 |
|