
Learning Hadoop from Scratch -- Chapter 2: The First MapReduce Program

2013-10-29 

package com.brianchen.hadoop;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {
    // Mapper: splits each input line into words and emits a (word, 1) pair per word.
    public static class TokenizerMapper
            extends Mapper<Object, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    // Reducer (also used as the combiner): sums the counts for each word.
    public static class IntSumReducer
            extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Strip generic Hadoop options; what remains should be <in> and <out>.
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }

        Job job = new Job(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}


This code differs slightly from the previous section's.

1) "public static class TokenizerMapper"

This declares TokenizerMapper as a static nested class of WordCount. Nesting it this way keeps TokenizerMapper encapsulated inside WordCount, and because it is static, TokenizerMapper does not reference any instance fields or methods of WordCount.
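The point above can be illustrated with a minimal standalone sketch (not from the original text; the class names Outer and Nested are made up for illustration): a static nested class, like TokenizerMapper inside WordCount, can be instantiated without an instance of the outer class and cannot touch the outer class's instance fields.

```java
// Illustrative only: mirrors the WordCount/TokenizerMapper nesting pattern.
public class Outer {
    private int instanceField = 42;

    public static class Nested {
        // Cannot reference instanceField here: a static nested class
        // has no implicit Outer instance attached to it.
        public String greet() {
            return "hello from Nested";
        }
    }

    public static void main(String[] args) {
        // No "new Outer()" needed first, unlike a non-static inner class.
        Outer.Nested n = new Outer.Nested();
        System.out.println(n.greet());
    }
}
```

This is exactly why Hadoop can create TokenizerMapper instances on its own: the framework never needs (or has) a WordCount object to construct the mapper.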

2) "private final static IntWritable one"

Compared with the definition in the previous section, this adds "private final static". "private" makes the variable private to the class; "final" means it can be assigned only once, at its definition, and never changed afterward; "static" makes it a class-level variable, independent of any object and shared by all instances of the class. The benefit is that one is a single, private, immutable value: the code is more robust and memory is saved, because a new IntWritable is not allocated for every record.
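To see the map and reduce logic end to end without a Hadoop cluster, here is a hypothetical standalone sketch (the class LocalWordCount and its count method are illustrative names, not part of the original program): the map step tokenizes each line exactly as TokenizerMapper does, and the per-key summation plays the role of IntSumReducer.

```java
import java.util.HashMap;
import java.util.Map;
import java.util.StringTokenizer;

// Local simulation of the WordCount job's logic, for understanding only.
public class LocalWordCount {
    public static Map<String, Integer> count(String[] lines) {
        Map<String, Integer> sums = new HashMap<>();
        for (String line : lines) {
            // Same whitespace tokenization as TokenizerMapper.map()
            StringTokenizer itr = new StringTokenizer(line);
            while (itr.hasMoreTokens()) {
                // merge() stands in for IntSumReducer: add 1 to this word's total
                sums.merge(itr.nextToken(), 1, Integer::sum);
            }
        }
        return sums;
    }

    public static void main(String[] args) {
        Map<String, Integer> result =
                count(new String[] { "hello world", "hello hadoop" });
        System.out.println(result.get("hello")); // prints 2
    }
}
```

In the real job, of course, the (word, 1) pairs are shuffled across the cluster and grouped by key before the reducer sums them; this sketch collapses those steps into a single in-memory map.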

4.3 Compile

4.3.1 "cd ~/wordcount_02"

4.3.2 "javac -classpath /home/brian/usr/hadoop/hadoop-1.2.1/hadoop-core-1.2.1.jar:/home/brian/usr/hadoop/hadoop-1.2.1/lib/commons-cli-1.2.jar -d ./classes/ ./src/WordCount.java"

4.4 Package

4.4.1 "jar -cvf wordcount.jar -C ./classes/ ."

4.5 Run

4.5.1 "cd ~/usr/hadoop/hadoop-1.2.1"

4.5.2 "./bin/hadoop fs -rmr output"

4.5.3 "./bin/hadoop jar /home/brian/wordcount_02/wordcount.jar com.brianchen.hadoop.WordCount readme.txt output"

4.6 View the results

4.6.1 "./bin/hadoop fs -cat output/part-r-00000"
