用户 ID | 用户名 | 兴趣标签 |
1 | 张三 | {古典,爵士,R&B,乡村} |
2 | 黎四 | {民歌,纯音乐} |
3 | 王五 | {HipHop,爵士,R&B,嘻哈,雷鬼} |
... | ... | ... |
1000000000 | 文本2 | {摇滚,} |
用户 ID | 用户名 |
1 | 张三 |
2 | 黎四 |
N | ... |
标签 ID | 用户名 |
1 | 古典 |
2 | 民歌 |
N | ... |
标签 ID | 用户名 |
1 | [ 1,3,7,123,423 ] |
2 | [ 5,31] |
N | ... |
create or replace function random_string(length integer) returns text as$$declarechars text[] := '{0,1,2,3,4,5,6,7,8,9,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z}';result text := '';i integer := 0;length2 integer := (select trunc(random() * length + 1));beginif length2 < 0 thenraise exception 'Given length cannot be less than 0';end if;for i in 1..length2 loopresult := result || chars[1+random()*(array_length(chars, 1)-1)];end loop;return result;end;$$ language plpgsql;
create or replace function random_int_array(int, int)returns int[] language sql as$$select array_agg(round(random()* $1)::int)from generate_series(1, $2)$$;
create or replace function random_string_array(int, int)returns TEXT[] language sql as$$select array_agg(random_string($1)) from generate_series(1, $2);$$;
create table account(uin bigint primary KEY,name varchar,tag TEXT []);
insert into account select generate_series(1,10000000), random_string(20),random_string_array(5,10);create index tag_inx on account USING GIN(tag);
explain analyze select uin,name from account where tag @>ARRAY['GN','o'];QUERY PLAN---------------------------------------------------------------------------------------------------------------------Bitmap Heap Scan on account (cost=52.81..466.86 rows=105 width=19) (actual time=4.263..4.502 rows=184 loops=1)Recheck Cond: (tag @> '{GN,o}'::text[])Heap Blocks: exact=184-> Bitmap Index Scan on tag_inx (cost=0.00..52.78 rows=105 width=0) (actual time=4.240..4.240 rows=184 loops=1)Index Cond: (tag @> '{GN,o}'::text[])Planning Time: 0.108 msExecution Time: 4.528 ms
explain analyze select count(uin) from account where tag && ARRAY['lvXe','Zt'];------------------------------------------------------------------------------------------------------------------------------Aggregate (cost=21816.39..21816.40 rows=1 width=8) (actual time=8.236..8.238 rows=1 loops=1)-> Bitmap Heap Scan on account (cost=109.08..21800.56 rows=6332 width=8) (actual time=1.655..7.901 rows=5390 loops=1)Recheck Cond: (tag && '{lvXe,Zt}'::text[])Heap Blocks: exact=5327-> Bitmap Index Scan on tag_inx (cost=0.00..107.49 rows=6332 width=0) (actual time=0.962..0.962 rows=5390 loops=1)Index Cond: (tag && '{lvXe,Zt}'::text[])Planning Time: 0.110 msExecution Time: 8.270 ms
create table tag_dict (tagid int primary key,taginfo text);
insert into tag_dict select generate_series(1,100000), md5(random()::text);
create table account1(uin bigint primary KEY,name varchar,tag INT []);
insert into account1 select generate_series(1,10000000), random_string(20),random_int_array(100000,10);
test=> explain analyze select uin,name from account1 where tag @> ARRAY[100,5711];QUERY PLAN-----------------------------------------------------------------------------------------------------------------------------Gather (cost=1000.00..191007.68 rows=250 width=19) (actual time=982.585..1000.806 rows=0 loops=1)Workers Planned: 2Workers Launched: 2-> Parallel Seq Scan on account1 (cost=0.00..189982.68 rows=104 width=19) (actual time=962.640..962.640 rows=0 loops=3)Filter: (tag @> '{100,5711}'::integer[])Rows Removed by Filter: 3333333Planning Time: 0.205 msJIT:Functions: 12Options: Inlining false, Optimization false, Expressions true, Deforming trueTiming: Generation 2.280 ms, Inlining 0.000 ms, Optimization 1.176 ms, Emission 14.189 ms, Total 17.645 msExecution Time: 1001.574 ms(12 rows)
create index tag_inx_2 on account1 USING GIN(tag);
test=> explain analyze select uin,name from account1 where tag @> ARRAY[100,5711];QUERY PLAN---------------------------------------------------------------------------------------------------------------------Bitmap Heap Scan on account1 (cost=49.94..1021.13 rows=250 width=19) (actual time=0.126..0.127 rows=0 loops=1)Recheck Cond: (tag @> '{100,5711}'::integer[])-> Bitmap Index Scan on tag_inx_2 (cost=0.00..49.87 rows=250 width=0) (actual time=0.124..0.124 rows=0 loops=1)Index Cond: (tag @> '{100,5711}'::integer[])Planning Time: 0.410 msExecution Time: 0.171 ms(6 rows)
test=> explain analyze select uin,name from account1 where tag @> ARRAY[61568,97350];QUERY PLAN---------------------------------------------------------------------------------------------------------------------Bitmap Heap Scan on account1 (cost=49.94..1021.13 rows=250 width=19) (actual time=0.130..0.131 rows=1 loops=1)Recheck Cond: (tag @> '{61568,97350}'::integer[])Heap Blocks: exact=1-> Bitmap Index Scan on tag_inx_2 (cost=0.00..49.87 rows=250 width=0) (actual time=0.125..0.125 rows=1 loops=1)Index Cond: (tag @> '{61568,97350}'::integer[])Planning Time: 0.071 msExecution Time: 0.151 ms(7 rows)
test=> explain analyze select count(uin) from account1 where tag && ARRAY[61568,97350];QUERY PLAN---------------------------------------------------------------------------------------------------------------------------------------Gather (cost=1961.06..173801.15 rows=99750 width=19) (actual time=5.020..28.885 rows=2066 loops=1)Workers Planned: 2Workers Launched: 2-> Parallel Bitmap Heap Scan on account1 (cost=961.06..162826.15 rows=41562 width=19) (actual time=1.623..3.305 rows=689 loops=3)Recheck Cond: (tag && '{61568,97350}'::integer[])Heap Blocks: exact=2053-> Bitmap Index Scan on tag_inx_2 (cost=0.00..936.12 rows=99750 width=0) (actual time=0.685..0.685 rows=2066 loops=1)Index Cond: (tag && '{61568,97350}'::integer[])Planning Time: 0.082 msJIT:Functions: 12Options: Inlining false, Optimization false, Expressions true, Deforming trueTiming: Generation 2.078 ms, Inlining 0.000 ms, Optimization 0.270 ms, Emission 3.489 ms, Total 5.836 msExecution Time: 29.725 ms(14 rows)
create extension roaringbitmap;
create table tag_uin_list(tagid int primary key,uin_offset int,uinbits roaringbitmap);
insert into tag_uin_listselect tagid, uin_offset, rb_build_agg(uin::int) as uinbits from(selectunnest(tag) as tagid,(uin / (2^31)::int8) as uin_offset,mod(uin, (2^31)::int8) as uinfrom account1) tgroup by tagid, uin_offset;
explain analyze select sum(ub) from(select uin_offset,rb_or_cardinality_agg(uinbits) as ubfrom tag_uin_listwhere tagid in (1,3,10,200)group by uin_offset) t;QUERY PLAN---------------------------------------------------------------------------------------------------------------------------------------------Aggregate (cost=32.47..32.48 rows=1 width=32) (actual time=0.964..0.966 rows=1 loops=1)-> GroupAggregate (cost=32.42..32.46 rows=1 width=12) (actual time=0.955..0.956 rows=1 loops=1)Group Key: tag_uin_list.uin_offset-> Sort (cost=32.42..32.43 rows=4 width=22) (actual time=0.107..0.109 rows=4 loops=1)Sort Key: tag_uin_list.uin_offsetSort Method: quicksort Memory: 25kB-> Bitmap Heap Scan on tag_uin_list (cost=17.20..32.38 rows=4 width=22) (actual time=0.044..0.067 rows=4 loops=1)Recheck Cond: (tagid = ANY ('{1,3,10,200}'::integer[]))Heap Blocks: exact=4-> Bitmap Index Scan on tag_uin_list_pkey (cost=0.00..17.20 rows=4 width=0) (actual time=0.031..0.031 rows=4 loops=1)Index Cond: (tagid = ANY ('{1,3,10,200}'::integer[]))Planning Time: 0.289 msExecution Time: 1.083 ms(13 rows)
explain analyze select uin_offset,rb_or_agg(uinbits) as ubfrom tag_uin_listwhere tagid in (1,3,10,200)group by uin_offset;QUERY PLAN---------------------------------------------------------------------------------------------------------------------------------------GroupAggregate (cost=32.42..32.46 rows=1 width=36) (actual time=0.246..0.246 rows=1 loops=1)Group Key: uin_offset-> Sort (cost=32.42..32.43 rows=4 width=22) (actual time=0.043..0.045 rows=4 loops=1)Sort Key: uin_offsetSort Method: quicksort Memory: 25kB-> Bitmap Heap Scan on tag_uin_list (cost=17.20..32.38 rows=4 width=22) (actual time=0.029..0.036 rows=4 loops=1)Recheck Cond: (tagid = ANY ('{1,3,10,200}'::integer[]))Heap Blocks: exact=4-> Bitmap Index Scan on tag_uin_list_pkey (cost=0.00..17.20 rows=4 width=0) (actual time=0.021..0.021 rows=4 loops=1)Index Cond: (tagid = ANY ('{1,3,10,200}'::integer[]))Planning Time: 0.119 msExecution Time: 0.310 ms(12 rows)
test=> select relname, pg_size_pretty(pg_relation_size(relid)) from pg_stat_user_tables where schemaname='public' order by pg_relation_size(relid) desc;relname | pg_size_pretty--------------+----------------account | 1545 MBaccount1 | 1077 MBt_user | 651 MBtag_dict | 6672 kBtag_uin_list | 5888 kB(5 rows)test=> select indexrelname, pg_size_pretty(pg_relation_size(relid)) from pg_stat_user_indexes where schemaname='public' order by pg_relation_size(relid) desc;indexrelname | pg_size_pretty-------------------+----------------tag_inx | 1545 MBaccount_pkey | 1545 MBtag_inx_2 | 1077 MBaccount1_pkey | 1077 MBt_user_pkey | 651 MBtag_dict_pkey | 6672 kBtag_uin_list_pkey | 5888 kB(7 rows)
查询项 | 方案1 | 方案2 | roaringbitmap 方案 |
查询包含指定标签的用户列表 | 4.528ms | 0.151ms | 0.310ms |
查询具备共同标签的用户个数 | 8.27ms | 29.725ms | 1.083ms |
数据容量统计 | 4635MB | 3244.344MB | 1237.12MB |
本页内容是否解决了您的问题?