从零开始学习Hadoop--第2章 第一个MapReduce程序
package com.brianchen.hadoop;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public classWordCount {
public staticclass TokenizerMapper
extendsMapper<Object, Text, Text, IntWritable>{
private finalstatic IntWritable one = new IntWritable(1);
private Textword = new Text();
public voidmap(Object key, Text value, Context context)
throwsIOException, InterruptedException {
StringTokenizeritr = new StringTokenizer(value.toString());
while(itr.hasMoreTokens()){
word.set(itr.nextToken());
context.write(word,one);
}
}
}
public staticclass IntSumReducer extends
Reducer<Text,IntWritable,Text,IntWritable>{
privateIntWritable result = new IntWritable();
public voidreduce(Text key, Iterable<IntWritable> values, Contextcontext)
throwsIOException, InterruptedException {
int sum =0;
for(IntWritable val : values) {
sum +=val.get();
}
result.set(sum);
context.write(key,result);
}
}
public staticvoid main(String[] args) throws Exception {
Configurationconf = new Configuration();
String[]otherArgs = new GenericOptionsParser(conf,args).getRemainingArgs();
if(otherArgs.length != 2) {
System.err.println("Usage:wordcount <in> <out>");
System.exit(2);
}
Job job = newJob(conf, "word count");
job.setJarByClass(WordCount.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job,new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job,new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true)? 0 : 1);
}
}
这里的代码,跟前一节有点不太一样。
1)"public static class TokenizerMapper"
这表示TokenizerMapper类是WordCount类的内部静态类,这种方式可以将TokenizerMapper隐藏在WordCount类内部,且TokenizerMapper类不引用WordCount类的任何变量和函数。
2)“private final static IntWritable one”
跟上一节的定义相比,这里多了 "private final static"。"private" 表示这个变量是类的私有变量;"final" 表示这个变量只能在定义的时候被赋值一次,以后不可更改;"static" 表示这是一个静态变量,独立于对象,被该类的所有实例共享。这种做法的好处是:one 这个值是私有的、不可更改的、仅仅只有一个,代码更可靠,更节省内存空间。
4.3.1“cd ~/wordcount_02”
4.3.2 "javac -classpath /home/brian/usr/hadoop/hadoop-1.2.1/hadoop-core-1.2.1.jar:/home/brian/usr/hadoop/hadoop-1.2.1/lib/commons-cli-1.2.jar -d ./classes/ ./src/WordCount.java"
4.4.1“jar -cvf wordcount.jar -C ./classes/ . ”
4.5.1“cd ~/usr/bin/hadoop/hadoop-1.2.1”
4.5.2“./bin/hadoop fs -rmr output”
4.5.3 "./bin/hadoop jar /home/brian/wordcount_02/wordcount.jar com.brianchen.hadoop.WordCount readme.txt output"
4.6.1“./bin/hadoop fs -cat output/part-r-00000”