Pig 分析日志脚本（1）：统计行数和单词个数（wordcount）
-- Count how many times each distinct record value occurs in the 2012 nohup
-- logs and print the values seen more than 200 times, in ascending count order.
-- `cd` is a Grunt shell command: make the HDFS root the working directory.
cd hdfs:///
-- No USING clause, so Pig's default loader applies (PigStorage, tab-delimited);
-- only the first field of each record is captured as `name`.
A = LOAD '/logdata/2012*/*/nohup_*' AS (name:chararray);
B = GROUP A BY name;
-- One output tuple per distinct value: (value, occurrences).
C = FOREACH B GENERATE group, COUNT(A);
-- Filter BEFORE sorting: Pig only guarantees sort order when the ordered
-- relation is retrieved directly, and this also avoids sorting rows that
-- would be thrown away anyway.
D = FILTER C BY $1 > 200;
E = ORDER D BY $1;
dump E;
-- Word count: split every log line from 2013-01-31 into words, count the
-- occurrences of each word and print the words seen more than 200 times,
-- in ascending count order. Output tuples are (count, word).
A = LOAD '/logdata/20130131/*/*' AS (line:chararray);
-- TOKENIZE yields a bag of words per line; FLATTEN turns it into one row per word.
B = FOREACH A GENERATE FLATTEN(TOKENIZE((chararray)$0)) AS word;
C = GROUP B BY word;
D = FOREACH C GENERATE COUNT(B), group;
-- Filter BEFORE sorting: Pig only guarantees sort order when the ordered
-- relation is retrieved directly, and this also avoids sorting rows that
-- would be thrown away anyway.
E = FILTER D BY $0 > 200;
F = ORDER E BY $0;
DUMP F;
参考资料:
http://salsahpc.indiana.edu/ScienceCloud/pig_word_count_tutorial.htm