Chapter 7 MapReduce Extended Cases

7.1 Inverted Index Case (Multi-Job Chaining)

1) Requirement: there is a large volume of text (documents, web pages) for which a search index must be built. The input consists of three files: a.txt, b.txt, and c.txt.

(1) Expected output of the first pass

atguigu--a.txt    3
atguigu--b.txt    2
atguigu--c.txt    2
pingping--a.txt    1
pingping--b.txt    3
pingping--c.txt    1
ss--a.txt    2
ss--b.txt    1
ss--c.txt    1

(2) Expected output of the second pass

atguigu    c.txt--2    b.txt--2    a.txt--3
pingping    c.txt--1    b.txt--3    a.txt--1
ss    c.txt--1    b.txt--1    a.txt--2
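The original input files are not reproduced in the source material. Purely as an illustration, one hypothetical set of inputs consistent with the counts above (words separated by single spaces, which is what the first mapper below assumes) would be:

a.txt:
atguigu pingping
atguigu ss
atguigu ss

b.txt:
atguigu pingping
atguigu pingping
pingping ss

c.txt:
atguigu ss
atguigu pingping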
2) First pass

(1) First pass: write OneIndexMapper

package com.atguigu.mapreduce.index;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class OneIndexMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    String name;
    Text k = new Text();
    IntWritable v = new IntWritable();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Get the name of the file this split belongs to
        FileSplit split = (FileSplit) context.getInputSplit();
        name = split.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // 1 Read one line
        String line = value.toString();

        // 2 Split on spaces
        String[] fields = line.split(" ");

        for (String word : fields) {
            // 3 Concatenate the word and the file name, e.g. "atguigu--a.txt"
            k.set(word + "--" + name);
            v.set(1);

            // 4 Write out
            context.write(k, v);
        }
    }
}

(2) First pass: write OneIndexReducer

package com.atguigu.mapreduce.index;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class OneIndexReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int count = 0;

        // 1 Sum the occurrences of this word--file pair
        for (IntWritable value : values) {
            count += value.get();
        }

        // 2 Write out
        context.write(key, new IntWritable(count));
    }
}

(3) First pass: write OneIndexDriver

package com.atguigu.mapreduce.index;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class OneIndexDriver {

    public static void main(String[] args) throws Exception {

        args = new String[] { "e:/input/inputoneindex", "e:/output5" };

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(OneIndexDriver.class);

        job.setMapperClass(OneIndexMapper.class);
        job.setReducerClass(OneIndexReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.waitForCompletion(true);
    }
}

(4) First pass output

atguigu--a.txt    3
atguigu--b.txt    2
atguigu--c.txt    2
pingping--a.txt    1
pingping--b.txt    3
pingping--c.txt    1
ss--a.txt    2
ss--b.txt    1
ss--c.txt    1

3) Second pass

(1) Second pass: write TwoIndexMapper

package com.atguigu.mapreduce.index;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class TwoIndexMapper extends Mapper<LongWritable, Text, Text, Text> {

    Text k = new Text();
    Text v = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // 1 Read one line of first-pass output, e.g. "atguigu--a.txt    3"
        String line = value.toString();

        // 2 Split on "--": fields[0] is the word, fields[1] is "a.txt\t3"
        String[] fields = line.split("--");

        k.set(fields[0]);
        v.set(fields[1]);

        // 3 Write out
        context.write(k, v);
    }
}

(2) Second pass: write TwoIndexReducer

package com.atguigu.mapreduce.index;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class TwoIndexReducer extends Reducer<Text, Text, Text, Text> {

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // Input values for one key:
        //   atguigu    a.txt    3
        //   atguigu    b.txt    2
        //   atguigu    c.txt    2
        // Desired output:
        //   atguigu    c.txt--2    b.txt--2    a.txt--3

        StringBuilder sb = new StringBuilder();

        // 1 Concatenate: turn "a.txt\t3" back into "a.txt--3"
        for (Text value : values) {
            sb.append(value.toString().replace("\t", "--") + "\t");
        }

        // 2 Write out
        context.write(key, new Text(sb.toString()));
    }
}

(3) Second pass: write TwoIndexDriver

package com.atguigu.mapreduce.index;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TwoIndexDriver {

    public static void main(String[] args) throws Exception {

        args = new String[] { "e:/input/inputtwoindex", "e:/output6" };

        Configuration config = new Configuration();
        Job job = Job.getInstance(config);

        job.setJarByClass(TwoIndexDriver.class);
        job.setMapperClass(TwoIndexMapper.class);
        job.setReducerClass(TwoIndexReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}

(4) Final result

atguigu    c.txt--2    b.txt--2    a.txt--3
pingping    c.txt--1    b.txt--3    a.txt--1
ss    c.txt--1    b.txt--1    a.txt--2
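The two drivers above are launched by hand, one after the other, with job two pointed at job one's output directory. Since this case is billed as multi-job chaining, here is a minimal sketch, not part of the original material, of a single driver that submits the two jobs in sequence. The class name IndexChainDriver is hypothetical; it reuses the four Mapper/Reducer classes above and the paths hard-coded in the two drivers.

package com.atguigu.mapreduce.index;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical combined driver: chains the two jobs by waiting for the
// first to finish, then feeding its output directory to the second.
public class IndexChainDriver {

    public static void main(String[] args) throws Exception {
        Path input = new Path("e:/input/inputoneindex"); // raw text files
        Path middle = new Path("e:/output5");            // job 1 output = job 2 input
        Path output = new Path("e:/output6");            // final inverted index

        Configuration conf = new Configuration();

        // Job 1: count occurrences of each word--file pair
        Job one = Job.getInstance(conf, "one-index");
        one.setJarByClass(IndexChainDriver.class);
        one.setMapperClass(OneIndexMapper.class);
        one.setReducerClass(OneIndexReducer.class);
        one.setMapOutputKeyClass(Text.class);
        one.setMapOutputValueClass(IntWritable.class);
        one.setOutputKeyClass(Text.class);
        one.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(one, input);
        FileOutputFormat.setOutputPath(one, middle);

        // Only submit job 2 if job 1 succeeded
        if (!one.waitForCompletion(true)) {
            System.exit(1);
        }

        // Job 2: regroup by word to build the inverted index
        Job two = Job.getInstance(conf, "two-index");
        two.setJarByClass(IndexChainDriver.class);
        two.setMapperClass(TwoIndexMapper.class);
        two.setReducerClass(TwoIndexReducer.class);
        two.setMapOutputKeyClass(Text.class);
        two.setMapOutputValueClass(Text.class);
        two.setOutputKeyClass(Text.class);
        two.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(two, middle);
        FileOutputFormat.setOutputPath(two, output);

        System.exit(two.waitForCompletion(true) ? 0 : 1);
    }
}

Note that FileInputFormat skips files whose names start with "_" or "." by default, so job two will not trip over the _SUCCESS marker that job one leaves in its output directory.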