您好,欢迎访问三七文档
数据挖掘与R语言----预测海藻数量苏州大学唐煜2SUDA问题描述与目标有害藻类对河流生态环境具有破坏性一年的不同时间收集欧洲多条不同河流的水样测定不同的化学性质(便宜、方便)测定7种有害藻类的存在频率(昂贵、缓慢)目标构建基于化学性质预测藻类的模型了解影响藻类频率的因素3SUDA数据说明•来源:来自于ERUDIT研究网络•两个数据集:训练样本(200个)和测试样本(140个)被用于1999年的COIL国际数据分析竞赛(Coil1999CompetitionData)3个名义变量+8个水样化学参数+7种有害藻类的频率4SUDA变量含义--名义变量水样收集的季节(春、夏、秋、冬)1收集样本的河流大小(大、中、小)2河水速度(高、中、低)35SUDA变量含义--化学参数最大pH值(mxPH)1最小含氧量(mnO2)2平均氯化物含量(Cl)3平均硝酸盐含量(NO3)4平均氨含量(NH4)5平均正磷酸盐含量(oPO4)6平均磷酸盐含量(PO4)7平均叶绿素含量(Chla)86SUDA加载数据生成数据集algaeR添加包从文件中读入install.packages('DMwR')library(DMwR)read.table载入R语言的两种方法7SUDA从文件中读入数据Analysis.txt1Eval.txt2Sols.txt3algae<-read.table('Analysis.txt',+header=F,dec='.',+col.names=c('season','size','speed','mxPH','mnO2','Cl',+'NO3','NH4','oPO4','PO4','Chla','a1','a2','a3','a4','a5','a6','a7'),+na.strings=c('XXXXXXX'))head(algae)algae[1:6,]~ltorgo/DataMiningWithR/8SUDA数据可视化和摘要summary(algae)hist(algae$mxPH,prob=T)qq.plot()bwplot(size~a1,data=algae)stripplot(season~a3|size)boxplot(algae$oPO4)描述性统计条件箱图直方图正态性检验箱线图9SUDA描述性统计名义变量取值的频数数值变量均值中位数四分位数最值缺失值个数10SUDA直方图Histogramofalgae$mxPHalgae$mxPHDensity6789100.00.10.20.30.40.50.60.7hist(algae$mxPH,prob=T)Histogramofalgae$mxPHalgae$mxPHFrequency678910010203040506070hist(algae$mxPH)11SUDA正态性检验library(car)par(mfrow=c(1,2))hist(algae$mxPH,prob=T,xlab='',main='Histogram of maximum pH value',ylim=0:1)lines(density(algae$mxPH,na.rm=T))rug(jitter(algae$mxPH))qq.plot(algae$mxPH,main='Normal QQ plot of maximum pH')par(mfrow=c(1,1))HistogramofmaximumpHvaluDensity6789100.00.20.40.60.81.0-3-2-101236789NormalQQplotofmaximumpnormquantilesalgae$mxPH12SUDA箱线图boxplot(algae$oPO4,ylab="Orthophosphate (oPO4)")rug(jitter(algae$oPO4),side=2)abline(h=mean(algae$oPO4,na.rm=T),lty=2)0100200300400500Orthophosphate(oPO4)13SUDA离群点标示plot(algae$NH4,xlab="")abline(h=mean(algae$NH4,na.rm=T),lty=1)abline(h=mean(algae$NH4,na.rm=T)+sd(algae$NH4,na.rm=T),lty=2)abline(h=median(algae$NH4,na.rm=T),lty=3)identify(algae$NH4)0501001502000500010000150002000025000algae$NH42035153algae[!is.na(algae$NH4)&algae$NH4>19000,]algae[algae$NH4>19000,]14SUD
A条件箱图library(lattice)bwplot(size~a1,data=algae,ylab='River Size',xlab='Algal A1')AlgalA1RiverSizelargemediumsmall02040608015SUDA条件分位箱图library(Hmisc)bwplot(size~a1,data=algae,panel=panel.bpplot,probs=seq(.01,.49,by=.01),datadensity=TRUE,ylab='River Size',xlab='Algal A1')AlgalA1RiverSizelargemediumsmall02040608016SUDA更复杂的条件箱图stripplot(season~a3|size,data=algae[!is.na(algae$size),])a3autumnspringsummerwinter010203040largemediumautumnspringsummerwintersmall17SUDA连续因子离散化minO2<-equal.count(na.omit(algae$mnO2),number=4,overlap=1/5)stripplot(season~a3|minO2,data=algae[!is.na(algae$mnO2),])a3autumnspringsummerwinter010203040minO2minO2autumnspringsummerwinterminO2010203040minO218SUDA数据缺失的处理50%24%16%10%根据案例之间的相似性填补根据变量之间相关关系填补将含有缺失值案例直接剔除使用处理缺失值数据的工具19SUDA直接剔除algae[!complete.cases(algae),]nrow(algae[!complete.cases(algae),])algae<-na.omit(algae)algae<-algae[-c(62,199),]manyNAs(algae,0.2)algae<-algae[-manyNAs(algae),]20SUDA最高频率值填补algae[48,"mxPH"]<-mean(algae$mxPH,na.rm=T)algae[is.na(algae$Chla),"Chla"]<-median(algae$Chla,na.rm=T)algae<-centralImputation(algae)对数值型变量,采用中位数;对名义变量,采用众数21SUDA变量相关关系填补cor(algae[,4:18],use="complete.obs")symnum(cor(algae[,4:18],use="complete.obs"))data(algae)algae<-algae[-manyNAs(algae),]lm(PO4~oPO4,data=algae)algae[28,"PO4"]<-42.897+1.293*algae[28,"oPO4"]fillPO4<-function(oP){if(is.na(oP))return(NA) else return(42.897+1.293*oP)}algae[is.na(algae$PO4),"PO4"]<-sapply(algae[is.na(algae$PO4),"oPO4"],fillPO4)22SUDA案例相似性填补data(algae)algae<-algae[-manyNAs(algae),]algae<-knnImputation(algae,k=10)algae<-knnImputation(algae,k=10,meth="median")采用k个最相似数据的加权平均值,权重 $w(d)=e^{-d}$,距离 $d(\mathbf{x},\mathbf{y})=\sqrt{\sum_{i=1}^{p}\delta_i(x_i,y_i)}$,其中 $\delta_i(x_i,y_i)=\begin{cases}1, & \text{名义变量且 } x_i\neq y_i\\ 0, & \text{名义变量且 } x_i=y_i\\ (x_i-y_i)^2, & \text{数值变量}\end{cases}$23SUDA建立预测模型通过数据建立解释变量与响应变量之间关系的模型模型选择回归树模型多元线性回归24SUDA多元线性回归data(algae)algae<-algae[-manyNAs(algae),]clean.algae<-knnImputation(algae,k=10)lm.a1<-lm(a1~.,data=clean.algae[,1:12])lm2.a1<-update(lm.a1,.~.-season)anova(lm.a1,lm2.a1)final.lm<-step(lm.a1,direction="backward")summary(final.lm)25SUDA回归树library(rpart)data(algae)algae<-algae[-manyNAs(algae),]rt.a1<-rpart(a1~.,dat
a=algae[,1:12])prettyTree(rt.a1)26SUDA模型的评价和选择拟合效果预测性能计算效率可解释性选择标准27SUDA拟合效果和预测性能lm.predictions.a1<-predict(final.lm,clean.algae)rt.predictions.a1<-predict(rt.a1,algae)(mae.a1.lm<-mean(abs(lm.predictions.a1-algae[,"a1"])))(mae.a1.rt<-mean(abs(rt.predictions.a1-algae[,"a1"])))(mse.a1.lm<-mean((lm.predictions.a1-algae[,"a1"])^2))[1]295.5407(mse.a1.rt<-mean((rt.predictions.a1-algae[,"a1"])^2))[1]161.9202(nmse.a1.lm<-mean((lm.predictions.a1-algae[,'a1'])^2)/mean((mean(algae[,'a1'])-algae[,'a1'])^2))(nmse.a1.rt<-mean((rt.predictions.a1-algae[,'a1'])^2)/mean((mean(algae[,'a1'])-algae[,'a1'])^2))regr.eval(algae[,"a1"],rt.predictions.a1,train.y=algae[,"a1"])28SUDA图示效果old.par<-par(mfrow=c(1,2))plot(lm.predictions.a1,algae[,"a1"],main="Linear Model",xlab="Predictions",ylab="True Values")abline(0,1,lty=2)plot(rt.predictions.a1,algae[,"a1"],main="Regression Tree",xlab="Predictions",ylab="True Values")abline(0,1,lty=2)par(old.par)-101030
本文标题:数据挖掘与R软件
链接地址:https://www.777doc.com/doc-4194991.html