hive日志分析实战(一)
分析用户玩家流失率
(流失:如果一用户登陆某游戏某区服后接下来一周时间内登陆该区服的天数少于2天,则认为该用户已流失)
日志格式如下:
{"cnt":3,"src":"bbs","time":20130622063117,"qid":"100005648","gkey":"yjjh","skey":"S9"}{"cnt":2,"src":"null","time":20130622005615,"qid":"100015499","gkey":"dgwm","skey":"592120005"}{"cnt":5,"src":"txt","time":20130622044917,"qid":"100021254","gkey":"yjjh","skey":"S1"}{"cnt":1,"src":"null","time":20130622090137,"qid":"100023162","gkey":"wulin","skey":"S20"}{"cnt":1,"src":"null","time":20130622090417,"qid":"100024132","gkey":"wulin","skey":"S20"}{"cnt":1,"src":"null","time":20130622090526,"qid":"100025487","gkey":"wulin","skey":"S20"}{"cnt":1,"src":"loginhistory","time":20130622101001,"qid":"100030555","gkey":"sxd","skey":"S149"}{"cnt":1,"src":"se-yxlist-frxz","time":20130622101158,"qid":"100035304","gkey":"frxz","skey":"S12"}{"cnt":5,"src":"se","time":20130622100838,"qid":"100035995","gkey":"ktpd","skey":"S9"}{"cnt":2,"src":"null","time":20130622101413,"qid":"100035995","gkey":"xjsj","skey":"S22"}
create external table login_interm (cnt string,src string,time string,qid string,gkey string,skey string) row format serde 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe' with SERDEPROPERTIES ('input.regex' = '\\{"cnt":([^,]*),"src":"([^"]*)","time":([^,]*),"qid":"([^"]*)","gkey":"([^"]*)","skey":"([^"]*)"\}','output.format.string' = '%1$s %2$s %3$s %4$s %5$s %6$s') location '/login_logs/';create external table login_info (cnt string,src string,time string,qid string,gkey string,skey string) PARTITIONED BY(pgkey STRING) CLUSTERED BY(skey,qid) SORTED BY(skey,qid,time) INTO 32 BUCKETS stored as SEQUENCEFILE;from login_interm insert overwrite table login_info PARTITION (pgkey) select cnt,src,time,qid,gkey,skey,gkey;
create table login_stat_use_reduce (login_times int,login_days int,qid string,gkey string,skey string);add file '/home/hadoop/reduce.py';from (select cnt,time,qid,gkey,skey from login_info where cnt is not null distribute by gkey,skey,qid sort by gkey,skey,qid,time) map_out insert overwrite table login_stat_use_reduce reduce cnt,time,qid,gkey,skey using '/usr/bin/python2.7 ./reduce.py' as login_times,days,qid,gkey,skey;
#!/usr/bin/python2.7 #coding:utf-8import datetimeimport timeimport sys,loggingdef datetime_toString(dt): """把datetime转成字符串""" return dt.strftime("%Y%m%d")def string_toDatetime(string): """把字符串转成datetime""" return datetime.datetime.strptime(string, "%Y%m%d")def string_toTimestamp(strTime): """把字符串转成时间戳形式""" return time.mktime(string_toDatetime(strTime).timetuple())def timestamp_toString(stamp): """把时间戳转成字符串形式""" return time.strftime("%Y%m%d%H", time.localtime(stamp))def datetime_toTimestamp(dateTime): """把datetime类型转外时间戳形式""" return time.mktime(dateTime.timetuple()) def substract_DateTime(dateStr1,dateStr2): """ 返回两个日期之间的差 """ d1=string_toDatetime(dateStr1) d2=string_toDatetime(dateStr2) return d2-d1def substract_TimeStamp(dateStr1,dateStr2): """ 两个日期的 timestamp 差值 """ ts1= string_toTimestamp(dateStr1) ts2= string_toTimestamp(dateStr2) return ts1-ts2def compare_dateTime(dateStr1,dateStr2): """两个日期的比较, 当然也可以用timestamep方法比较,都可以实现.""" date1 = string_toDatetime(dateStr1) date2 = string_toDatetime(dateStr2) return date1.date()>date2.date()def dateTime_Add(dateStr,days=0): """ 指定日期加上 一个时间段,天,小时,或分钟之后的日期 """ date1= string_toDatetime(dateStr) return date1+datetime.timedelta(days=days)first_line = 1;pre_login_days = 0pre_login_cnt = 0pre_gkey = ""pre_skey = ""pre_qid = ""logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s', datefmt='%a, %d %b %Y %H:%M:%S', filename='./worker.log', filemode='a')for line in sys.stdin: line = line.strip().strip('\t'); logging.error("data:"+line) if line == "": continue cnt,time,qid,gkey,skey = line.split('\t') if not cnt or not time or not qid or not gkey or not skey: continue login_date = time[0:8] if first_line: lost = 0; pre_gkey = gkey pre_skey = skey pre_qid = qid pre_login_days = 1 pre_login_date = login_date pre_login_cnt = eval(cnt) first_line = 0 continue if gkey != pre_gkey or skey != pre_skey or qid != pre_qid: if pre_login_days < 3: print "%d\t%d\t%s\t%s\t%s"%(pre_login_cnt,pre_login_days,pre_qid,pre_gkey,pre_skey) lost = 0; pre_gkey = gkey pre_skey = skey pre_qid = qid pre_login_days = 1 pre_login_date = login_date pre_login_cnt = eval(cnt) else: if lost or pre_login_days >= 3: continue if compare_dateTime(login_date,datetime_toString(dateTime_Add(pre_login_date,7))): lost = 1 continue pre_login_days += 1 pre_login_cnt += eval(cnt)if pre_login_days < 3: print "%d\t%d\t%s\t%s\t%s"%(pre_login_cnt,pre_login_days,pre_qid,pre_gkey,pre_skey)