SoFunction
Updated on 2025-03-03

Methods to handle data skew in Hive

1 groupby (large table grouping-local aggregation + global aggregation)

Example 1:

select label,sum(cnt) as all from 
(
    select rd,label,sum(1) as cnt from 
    (
        select id,label,round(rand(),2) as rd,value from tmp1
    ) as tmp
    group by rd,label
) as tmp
group by label;

Example 2:

select 
	split(new_source,'\\_')[0] as source 
	,sum(cnt) as cnt 
from  
(select  
	concat(source,'_', rand()*100) as  new_source
	,count(1) as cnt 
from  test_table 
where day ='2022-01-01'
group by 
	concat(source,'_', rand()*100)
)tt 
group by 
	split(new_source,'\\_')[0]

2 join (large and medium table Join - add salt + small table expansion)

Example 1:

select label,sum(value) as all from 
(
    select rd,label,sum(value) as cnt from
    (
        select  as rd, as label,* as value 
        from 
        (
            select id,round(rand(),1) as rd,label,value from tmp1
        ) as tmp1
        join
        (
            select id,rd,label,value from tmp2
            lateral view explode(split('0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9',',')) mytable as rd
        ) as tmp2
        on  =  and  = 
    ) as tmp1
    group by rd,label
) as tmp1
group by label;

Example 2:

select 
	source
	,source_name
	,sum(cnt) as cnt 
from  
(select 
	 
	,new_source
	,nvl(source_name,'unknown') as source_name 
	,count(imei) as cnt 
from  
(select  
	imei
	,source 
	,concat(cast(rand()*10 as int ),'_',source ) as new_source
from  test_table_1
where day ='2022-01-01'
) t1 
inner join 
(
select 
	source_name 
	,concat(preflix,'_',source) as new_source
from  test_table_1
where day ='2022-01-01'
lateral view explode(split('0,1,2,3,4,5,6,7,8,9,10',','))b as preflix 
) t2 
on t1.new_source =t2.new_source
group by 
 
,new_source
,nvl(source_name,'unknown')
) tta  
group by 
	source
	,source_name

3 Double big table Join - Sampling tilt key + BroadJoin

##Before optimization:create table test.tmp_table_test_all as 
select  
imei 
,lable_id 
,nvl(label_name,'unknown')
from tmp_table_1  t1  
left join 
(select  
lable_id
,label_name
from  tmp_table_2 
where day ='2024-01-01') t2 
on t1.lable_id =t2.lable_id
where  ='2024-01-01'
;
 
## After optimization:create table test.tmp_table_test_all_new  as 
 
 
with tmp_table_test_1 as 
(select  
lable_id 
,count(1) as cnt 
from tmp_table_1  t1 
tablesample(5 percent) --Sampling5%Data of,reducetable scanThe quantity of
group by lable_id
order by cnt desc 
limit 100
) 
 
 
select  
	imei 
	,lable_id 
	,nvl(label_name,'unknown') as  label_name
from tmp_table_1  t1 
left join  tmp_table_test_1  t2
on t1.lable_id =t2.lable_id
left join 
(select  
	lable_id
	,label_name
from  tmp_table_2 
where day ='2024-01-01') t3
on t1.lable_id =t3.lable_id
where  ='2024-01-01' and  t2.lable_id is null 
 
union all  
 
select  
	imei 
	,lable_id 
	,nvl(label_name,'unknown') as  label_name 
from tmp_table_1  t1 
inner  join 
(select  
	lable_id
from  tmp_table_test_1  t1 
left   join   tmp_table_2  t2 
on t1.lable_id =t2.lable_id
where  ='2024-01-01') t3
on t1.lable_id =t3.lable_id
where  ='2024-01-01' 
;

4 Summary

This is the end of this article about how to deal with data skew in Hive. For more related content on processing data skew in Hive, please search for my previous articles or continue browsing the related articles below. I hope everyone will support me in the future!