# 常用统计分析 SQL 在 AWK 中的实现

user 表，字段：

1 zhangsan hubei
3 lisi tianjin
4 wangmazi guangzhou
2 wangwu beijing

consumer 表，字段：
id cost date

1 15 20121213
2 20 20121213
3 100 20121213
4 99 20121213
1 25 20121114
2 108 20121114
3 100 20121114
4 66 20121114
1 15 20121213
1 115 20121114

OS 版本：
uname -a
CYGWIN_NT-6.1 june-PC 1.7.9(0.237/5/3) 2011-03-29 10:10 i686 Cygwin
awk 版本：
awk --version
GNU Awk 3.1.8

1、查询整张表记录，where 条件过滤，关键词：where
select * from user; awk 1 user;
select * from consumer where cost > 100;
awk '\$2>100' consumer

2、对某个字段去重，或者按记录去重，关键词：distinct
select distinct(date) from consumer;
awk '!a[\$3]++{print \$3}' consumer
select distinct(*) from consumer;
awk '!a[\$0]++' consumer

3、记录按序输出，关键词：order by
select id from user order by id;
awk '{a[\$1]}END{asorti(a);for(i=1;i<=length(a);i++){print a[i]}}' user

4、取前多少条记录，关键词：limit
select * from consumer limit 2;
awk 'NR<=2' consumer
awk 'NR>2{exit}1' consumer # performance is better

5、分组求和统计，关键词：group by、having、sum、count
select id, count(1), sum(cost) from consumer group by id having count(1) > 2;
awk '{a[\$1]=a[\$1]==""?\$2:a[\$1]","\$2}END{for(i in a){c=split(a[i],b,",");if(c>2){sum=0;for(j in b){sum+=b[j]};print i"\t"c"\t"sum}}}' consumer

6、模糊查询，关键词：like（like属于通配，也可正则 REGEXP）
select name from user where name like 'wang%';
awk '\$2 ~/^wang/{print \$2}' user
awk '/.*bei\$/{print \$3}' user
awk '\$3 ~/bei/{print \$3}' user

7、多表 join 关联查询，关键词：join
select a.* , b.* from user a inner join consumer b  on a.id = b.id and b.id = 2;
awk 'ARGIND==1{a[\$1]=\$0;next}{if((\$1 in a)&&\$1==2){print a[\$1]"\t"\$2"\t"\$3}}' user consumer

8、多表水平联接，关键词：union all
select a.* from user a union all select b.* from user b;
awk 1 user user
select a.* from user a union select b.* from user b;
awk '!a[\$0]++' user user

9、随机抽样统计，关键词：order by rand()
SELECT * FROM consumer ORDER BY RAND() LIMIT 2;
awk 'BEGIN{srand();while(i<2){k=int(rand()*10)+1;if(!(k in a)){a[k];i++}}}(NR in a)' consumer

• 0

开心

• 0

板砖

• 0

感动

• 0

有用

• 0

疑问

• 0

难过

• 0

无聊

• 0

震惊