在離線平臺中,90%以上的離線分析任務都是使用Hive實現,其中必然會使用很多窗口分析函數,如果SparkSQL支持窗口分析函數,
那么對于后面Hive向SparkSQL中的遷移的工作量會大大降低,使用方式如下:
1、初始化數據
創建表
- create table window_test2 (url string, rate int) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
- url1,12
- url2,11
- url1,23
- url2,25
- url1,58
- url3,11
- url2,25
- url3,58
- url2,11
加載數據:
- load data local inpath '/opt/bin/short_opt/windows2.data' overwrite into table window_test2 ;
2、窗口函數測試
查詢所有數據
- select * from window_test2;
| url | rate |
+-------+-------+
| url1 | 12 |
| url2 | 11 |
| url1 | 23 |
| url2 | 25 |
| url1 | 58 |
| url3 | 11 |
| url2 | 25 |
| url3 | 58 |
| url2 | 11 |
+-------+-------+
分組排序:
- select url,rate,row_number() over(partition by url order by rate desc) as r from window_test2;
| url | rate | r |
+-------+-------+----+
| url1 | 58 | 1 |
| url1 | 23 | 2 |
| url1 | 12 | 3 |
| url2 | 25 | 1 |
| url2 | 25 | 2 |
| url2 | 11 | 3 |
| url2 | 11 | 4 |
| url3 | 58 | 1 |
| url3 | 11 | 2 |
+-------+-------+----+
分組統計sum
- select url,rate,sum(rate) over(partition by url ) as r from window_test2;
| url | rate | r |
+-------+-------+-----+
| url1 | 12 | 93 |
| url1 | 23 | 93 |
| url1 | 58 | 93 |
| url2 | 11 | 72 |
| url2 | 25 | 72 |
| url2 | 25 | 72 |
| url2 | 11 | 72 |
| url3 | 11 | 69 |
| url3 | 58 | 69 |
+-------+-------+-----+
分組統計avg
- select url,rate,avg(rate) over(partition by url ) as r from window_test2;
| url | rate | r |
+-------+-------+-------+
| url1 | 12 | 31.0 |
| url1 | 23 | 31.0 |
| url1 | 58 | 31.0 |
| url2 | 25 | 18.0 |
| url2 | 11 | 18.0 |
| url2 | 11 | 18.0 |
| url2 | 25 | 18.0 |
| url3 | 11 | 34.5 |
| url3 | 58 | 34.5 |
+-------+-------+-------+
分組統計count
- select url,rate,count(rate) over(partition by url ) as r from window_test2;
| url | rate | r |
+-------+-------+----+
| url1 | 12 | 3 |
| url1 | 23 | 3 |
| url1 | 58 | 3 |
| url2 | 11 | 4 |
| url2 | 25 | 4 |
| url2 | 25 | 4 |
| url2 | 11 | 4 |
| url3 | 11 | 2 |
| url3 | 58 | 2 |
+-------+-------+----+
分組lag
- select url,rate,lag(rate) over(partition by url ) as r from window_test2;
| url | rate | r |
+-------+-------+-------+
| url1 | 12 | NULL |
| url1 | 23 | 12 |
| url1 | 58 | 23 |
| url2 | 25 | NULL |
| url2 | 11 | 25 |
| url2 | 11 | 11 |
| url2 | 25 | 11 |
| url3 | 11 | NULL |
| url3 | 58 | 11 |
+-------+-------+-------+