hive 操作专利数据札记

2013-12-10
hive 操作专利数据笔记create table cite(citing int,cited int)row format deliminted fields terminated
hive 操作专利数据笔记
create table cite(citing int,cited int)row format deliminted fields terminated by ','stored as textfile;load data inpath '/tmp/file1.txt'  overwrite into table cite;select * from cite limit 10;create table cite_count(cited int ,count int);insert overwrite table cite_count select cited,count(citing)  from cite group by cited;select * from cite_count limit 10;alter table cited_count rename to cite_count;create table cite_num(cited_count int,num int) clustered by (cited_count) sorted by (cited_count desc) into 10 buckets;insert overwrite table cite_num select count,count(cited) from cite_count group by count;# 引用次数最多的10个专利select * from cite_count sort by count desc limit 10#创建被引用专利唯一数据表create table cited_unique(cited int)#写值insert overwrite table cited_unique select distinct cited from cite;#查询有多少个专利被引用select max(cited) from cited_unique #创建表create table apat_one(patent int ,GYEAR String,GDATE String,APPYEAR String,COUNTRY String,POSTATE String,ASSIGNEE String,ASSCODE String,CLAIMS string,NCLASS String,CAT String,SUBCAT string,CMADE string,CRECEIVE string,RATIOCIT string,GENERAL string,ORIGINAL string,FWDAPLAG string,BCKGTLAG string,SELFCTUB string,SELFCTLB string,SECDUPBD string,SECDLWBD string)CLUSTERED BY(patent) SORTED BY(patent) INTO 32 BUCKETSrow format delimited fields terminated by ','STORED AS textfile;#加载数据load data inpath '/patent/production/input/apat63_99.txt' overwrite  into  table apat;select  * from apat limit 100;问题：load data 时，怎么样做到自动分区和分桶？#统计每个国家的专利数create table country_apat_countas select  country ,count(1) numfrom apat group by country #国家的信息中有'，使用函数去掉create table country_apat_countas select  regexp_replace(country,'"','' ) ,count(1) numfrom apat group by regexp_replace(country,'"','' )alter table country_apat_count replace columns (country string,num bigint)# 修改表alter table country_apat_count rename to country_apat_num;#查询前10条记录select * from country_apat_count limit 10;set hive.exec.dynamic.partition = true;create table apat_one(patent int,GYEAR string, GDATE String,APPYEAR String,POSTATE String,ASSIGNEE String,ASSCODE String,CLAIMS string,NCLASS String,CAT String,SUBCAT string,CMADE string,CRECEIVE string,RATIOCIT string,GENERAL string,ORIGINAL string,FWDAPLAG string,BCKGTLAG string,SELFCTUB string,SELFCTLB string,SECDUPBD string,SECDLWBD string)partitioned by (COUNTRY String)CLUSTERED BY(patent) SORTED BY(patent) INTO 32 BUCKETSSTORED AS textfileinsert overwrite table apat_one partition (COUNTRY)select patent,GDATE,GYEAR,APPYEAR,POSTATE,ASSIGNEE,ASSCODE,CLAIMS,NCLASS,CAT,SUBCAT,CMADE,CRECEIVE,RATIOCIT,GENERAL,ORIGINAL,FWDAPLAG,BCKGTLAG,SELFCTUB,SELFCTLB,SECDUPBD,SECDLWBD,regexp_replace(COUNTRY,'"','') from apat #partent,GYEAR,GDATE,APPYEAR,COUNTRY,POSTATE,ASSIGNEE,ASSCODE,CLAIMS,NCLASS,CAT,SUBCAT,CMADE,CRECEIVE,RATIOCIT,GENERAL,ORIGINAL,FWDAPLAG,BCKGTLAG,SELFCTUB,SELFCTLB,SECDUPBD,SECDLWBD#example#INSERT OVERWRITE TABLE T PARTITION (ds, hr) #SELECT key, value, ds, hr FROM srcpart WHERE ds is not null and hr>10;# trim 函数的使用select trim(' abc  ') from country_apat_num limit 1;#regexp_replace 函数的使用，替换“ 成空select regexp_replace(country,'"','') from country_apat_num limit 5;
热点排行
编程

hive 操作专利数据札记