1 groupby (large table grouping-local aggregation + global aggregation)
Example 1:
select label,sum(cnt) as all from ( select rd,label,sum(1) as cnt from ( select id,label,round(rand(),2) as rd,value from tmp1 ) as tmp group by rd,label ) as tmp group by label;
Example 2:
select split(new_source,'\\_')[0] as source ,sum(cnt) as cnt from (select concat(source,'_', rand()*100) as new_source ,count(1) as cnt from test_table where day ='2022-01-01' group by concat(source,'_', rand()*100) )tt group by split(new_source,'\\_')[0]
2 join (large and medium table Join - add salt + small table expansion)
Example 1:
select label,sum(value) as all from ( select rd,label,sum(value) as cnt from ( select as rd, as label,* as value from ( select id,round(rand(),1) as rd,label,value from tmp1 ) as tmp1 join ( select id,rd,label,value from tmp2 lateral view explode(split('0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9',',')) mytable as rd ) as tmp2 on = and = ) as tmp1 group by rd,label ) as tmp1 group by label;
Example 2:
select source ,source_name ,sum(cnt) as cnt from (select ,new_source ,nvl(source_name,'unknown') as source_name ,count(imei) as cnt from (select imei ,source ,concat(cast(rand()*10 as int ),'_',source ) as new_source from test_table_1 where day ='2022-01-01' ) t1 inner join ( select source_name ,concat(preflix,'_',source) as new_source from test_table_1 where day ='2022-01-01' lateral view explode(split('0,1,2,3,4,5,6,7,8,9,10',','))b as preflix ) t2 on t1.new_source =t2.new_source group by ,new_source ,nvl(source_name,'unknown') ) tta group by source ,source_name
3 Double big table Join - Sampling tilt key + BroadJoin
##Before optimization:create table test.tmp_table_test_all as select imei ,lable_id ,nvl(label_name,'unknown') from tmp_table_1 t1 left join (select lable_id ,label_name from tmp_table_2 where day ='2024-01-01') t2 on t1.lable_id =t2.lable_id where ='2024-01-01' ; ## After optimization:create table test.tmp_table_test_all_new as with tmp_table_test_1 as (select lable_id ,count(1) as cnt from tmp_table_1 t1 tablesample(5 percent) --Sampling5%Data of,reducetable scanThe quantity of group by lable_id order by cnt desc limit 100 ) select imei ,lable_id ,nvl(label_name,'unknown') as label_name from tmp_table_1 t1 left join tmp_table_test_1 t2 on t1.lable_id =t2.lable_id left join (select lable_id ,label_name from tmp_table_2 where day ='2024-01-01') t3 on t1.lable_id =t3.lable_id where ='2024-01-01' and t2.lable_id is null union all select imei ,lable_id ,nvl(label_name,'unknown') as label_name from tmp_table_1 t1 inner join (select lable_id from tmp_table_test_1 t1 left join tmp_table_2 t2 on t1.lable_id =t2.lable_id where ='2024-01-01') t3 on t1.lable_id =t3.lable_id where ='2024-01-01' ;
4 Summary
This is the end of this article about how to deal with data skew in Hive. For more related content on processing data skew in Hive, please search for my previous articles or continue browsing the related articles below. I hope everyone will support me in the future!