Тестирование PostgreSQL 8.1 на больших таблицах: денормализация

Как один из вариантов оптимизации скорости выборки может быть использована денормализация таблиц. Продолжим работать с тем же тестовым распределением, но теперь в таблице фактов заменим числовые ключи на строковые значения, соответствующие значениям из справочников. Для генерации тестового распределения использована приведенная ниже функция, создающая строки из символа "0", повторенного случайное число раз от нуля до заданного аргументом значения.

Для вычисления времени выполнения запроса на этот раз использую команду explain analyze, хотя имхо она дает заниженное значение времени. Количество возвращаемых строк также можно увидеть в выводе названной команды, но из соображений удобства привожу и результат запроса на получение числа строк.

Параметры сервера:

Intel(R) Core(TM)2 Quad CPU Q6700 @ 2.66GHz
MemTotal: 8310900 kB


Параметры сервера PostgreSQL:

work_mem = 100000 # min 64, size in KB
maintenance_work_mem = 2000000 # min 1024, size in KB
shared_buffers = 200000 # min 16 or max_connections*2, 8KB each


PostgreSQL version

select version();
version
----------------------------------------------------------------------------------------------------------------
PostgreSQL 8.1.15 on i486-pc-linux-gnu, compiled by GCC cc (GCC) 4.1.2 20061115 (prerelease) (Debian 4.1.1-21)
(1 row)


Вспомогательные функции:

CREATE OR REPLACE FUNCTION public.populate_varstring(length int)
RETURNS text AS
$BODY$

return [string repeat 0 [expr {round($1*rand())}]]

$BODY$
LANGUAGE 'pltcl' VOLATILE;


CREATE OR REPLACE FUNCTION public.populate_string(length int)
RETURNS text AS
$BODY$

return [string repeat 0 $1]

$BODY$
LANGUAGE 'pltcl' VOLATILE;


Нормализованная таблица без справочников (10M записей)

create table tmp_facts7(a1 int,a2 int,a3 int,a4 int);
insert into tmp_facts7(a1,a2,a3,a4) select (18*random())::int,(18*random())::int,
(18*random())::int,(18*random())::int from generate_series(1,10000000);

create index tmp_facts7_a1_idx on tmp_facts7(a1);
create index tmp_facts7_a2_idx on tmp_facts7(a2);
create index tmp_facts7_a3_idx on tmp_facts7(a3);
create index tmp_facts7_a4_idx on tmp_facts7(a4);

select count(*) from tmp_facts7 where a1=2 and a2=2 and a3=2 and a4=2;
count
-------
102
(1 row)

explain analyze select * from tmp_facts7 where a1=2 and a2=2 and a3=2 and a4=2;
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------------------------
Bitmap Heap Scan on tmp_facts7 (cost=858.50..862.52 rows=1 width=16) (actual time=312.659..315.412 rows=102 loops=1)
Recheck Cond: ((a4 = 2) AND (a3 = 2) AND (a2 = 2))
Filter: (a1 = 2)
-> BitmapAnd (cost=858.50..858.50 rows=1 width=0) (actual time=311.972..311.972 rows=0 loops=1)
-> Bitmap Index Scan on tmp_facts7_a4_idx (cost=0.00..286.00 rows=50000 width=0) (actual time=94.804..94.804 rows=555224 loops=1)
Index Cond: (a4 = 2)
-> Bitmap Index Scan on tmp_facts7_a3_idx (cost=0.00..286.00 rows=50000 width=0) (actual time=97.709..97.709 rows=555616 loops=1)
Index Cond: (a3 = 2)
-> Bitmap Index Scan on tmp_facts7_a2_idx (cost=0.00..286.00 rows=50000 width=0) (actual time=94.862..94.862 rows=555418 loops=1)
Index Cond: (a2 = 2)
Total runtime: 315.702 ms

SELECT relname, relpages, reltuples FROM pg_class
WHERE NOT relname LIKE 'pg_%' ORDER BY relpages DESC;

relname | relpages | reltuples
----------------------------------------+----------+-------------
tmp_facts7 | 58824 | 1e+07
tmp_facts7_a1_idx | 21899 | 1e+07
tmp_facts7_a2_idx | 21899 | 1e+07
tmp_facts7_a3_idx | 21899 | 1e+07
tmp_facts7_a4_idx | 21899 | 1e+07


Нормализованная таблица со справочниками (10M записей + 10,100,1k,10k записей в справочниках)

drop table tmp_facts7_a1;
create table tmp_facts7_a1(id int, name text);
insert into tmp_facts7_a1
select generate_series,populate_string(generate_series) from generate_series(1,10);
create index tmp_facts7_a1_name_idx on tmp_facts7_a1(name);


drop table tmp_facts7_a2;
create table tmp_facts7_a2(id int, name text);
insert into tmp_facts7_a2
select generate_series,populate_string(generate_series) from generate_series(1,100);
create index tmp_facts7_a2_name_idx on tmp_facts7_a2(name);


drop table tmp_facts7_a3;
create table tmp_facts7_a3(id int, name text);
insert into tmp_facts7_a3
select generate_series,populate_string(generate_series) from generate_series(1,1000);
create index tmp_facts7_a3_name_idx on tmp_facts7_a3(name);


drop table tmp_facts7_a4;
create table tmp_facts7_a4(id int, name text);
insert into tmp_facts7_a4
select generate_series,populate_string(generate_series) from generate_series(1,10000);
create index tmp_facts7_a4_name_idx on tmp_facts7_a4(name);

select count(*)
from tmp_facts7
join tmp_facts7_a1 on tmp_facts7.a1=tmp_facts7_a1.id
join tmp_facts7_a2 on tmp_facts7.a2=tmp_facts7_a2.id
join tmp_facts7_a3 on tmp_facts7.a3=tmp_facts7_a3.id
join tmp_facts7_a4 on tmp_facts7.a4=tmp_facts7_a4.id
where tmp_facts7_a1.name='00' and tmp_facts7_a2.name='00' and tmp_facts7_a3.name='00' and tmp_facts7_a4.name='00';
90

explain analyze
select count(*)
from tmp_facts7
join tmp_facts7_a1 on tmp_facts7.a1=tmp_facts7_a1.id
join tmp_facts7_a2 on tmp_facts7.a2=tmp_facts7_a2.id
join tmp_facts7_a3 on tmp_facts7.a3=tmp_facts7_a3.id
join tmp_facts7_a4 on tmp_facts7.a4=tmp_facts7_a4.id
where tmp_facts7_a1.name='00' and tmp_facts7_a2.name='00' and tmp_facts7_a3.name='00' and tmp_facts7_a4.name='00';
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------------------------------------
Aggregate (cost=210529.30..210529.31 rows=1 width=0) (actual time=12160.879..12160.880 rows=1 loops=1)
-> Hash Join (cost=164.09..210529.22 rows=33 width=0) (actual time=53.455..12160.696 rows=90 loops=1)
Hash Cond: ("outer".a4 = "inner".id)
-> Hash Join (cost=21.98..210384.80 rows=132 width=4) (actual time=2.790..12159.510 rows=1747 loops=1)
Hash Cond: ("outer".a1 = "inner".id)
-> Hash Join (cost=20.85..210369.85 rows=2500 width=8) (actual time=1.326..12144.334 rows=30961 loops=1)
Hash Cond: ("outer".a3 = "inner".id)
-> Hash Join (cost=3.25..209827.25 rows=100000 width=12) (actual time=0.083..11806.323 rows=556026 loops=1)
Hash Cond: ("outer".a2 = "inner".id)
-> Seq Scan on tmp_facts7 (cost=0.00..158824.00 rows=10000000 width=16) (actual time=0.002..6252.510 rows=10000000 loops=1)
-> Hash (cost=3.25..3.25 rows=1 width=4) (actual time=0.037..0.037 rows=1 loops=1)
-> Seq Scan on tmp_facts7_a2 (cost=0.00..3.25 rows=1 width=4) (actual time=0.002..0.034 rows=1 loops=1)
Filter: (name = '00'::text)
-> Hash (cost=17.59..17.59 rows=5 width=4) (actual time=0.024..0.024 rows=1 loops=1)
-> Bitmap Heap Scan on tmp_facts7_a3 (cost=2.02..17.59 rows=5 width=4) (actual time=0.022..0.022 rows=1 loops=1)
Recheck Cond: (name = '00'::text)
-> Bitmap Index Scan on tmp_facts7_a3_name_idx (cost=0.00..2.02 rows=5 width=0) (actual time=0.019..0.019 rows=1 loops=1)
Index Cond: (name = '00'::text)
-> Hash (cost=1.12..1.12 rows=1 width=4) (actual time=0.013..0.013 rows=1 loops=1)
-> Seq Scan on tmp_facts7_a1 (cost=0.00..1.12 rows=1 width=4) (actual time=0.007..0.010 rows=1 loops=1)
Filter: (name = '00'::text)
-> Hash (cost=141.98..141.98 rows=50 width=4) (actual time=0.044..0.044 rows=1 loops=1)
-> Bitmap Heap Scan on tmp_facts7_a4 (cost=2.17..141.98 rows=50 width=4) (actual time=0.042..0.042 rows=1 loops=1)
Recheck Cond: (name = '00'::text)
-> Bitmap Index Scan on tmp_facts7_a4_name_idx (cost=0.00..2.17 rows=50 width=0) (actual time=0.038..0.038 rows=1 loops=1)
Index Cond: (name = '00'::text)
Total runtime: 12160.956 ms

select count(*)
from tmp_facts7
where
tmp_facts7.a1 IN (select id from tmp_facts7_a1 where name='00') and
tmp_facts7.a2 IN (select id from tmp_facts7_a2 where name='00') and
tmp_facts7.a3 IN (select id from tmp_facts7_a3 where name='00') and
tmp_facts7.a4 IN (select id from tmp_facts7_a4 where name='00');
90

QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Aggregate (cost=208849.46..208849.47 rows=1 width=0) (actual time=11530.444..11530.444 rows=1 loops=1)
-> Hash IN Join (cost=10.41..208849.45 rows=1 width=0) (actual time=54.422..11530.272 rows=90 loops=1)
Hash Cond: ("outer".a1 = "inner".id)
-> Hash IN Join (cost=9.28..208848.31 rows=1 width=4) (actual time=4.918..11529.142 rows=1711 loops=1)
Hash Cond: ("outer".a2 = "inner".id)
-> Hash IN Join (cost=6.03..208845.04 rows=1 width=8) (actual time=0.388..11513.718 rows=30963 loops=1)
Hash Cond: ("outer".a3 = "inner".id)
-> Hash IN Join (cost=3.02..208837.02 rows=1000 width=12) (actual time=0.083..11252.212 rows=556092 loops=1)
Hash Cond: ("outer".a4 = "inner".id)
-> Seq Scan on tmp_facts7 (cost=0.00..158824.00 rows=10000000 width=16) (actual time=0.009..6100.572 rows=10000000 loops=1)
-> Hash (cost=3.01..3.01 rows=1 width=4) (actual time=0.067..0.067 rows=1 loops=1)
-> Index Scan using tmp_facts7_a4_name_idx on tmp_facts7_a4 (cost=0.00..3.01 rows=1 width=4) (actual time=0.065..0.066 rows=1 loops=1)
Index Cond: (name = '00'::text)
-> Hash (cost=3.01..3.01 rows=1 width=4) (actual time=0.051..0.051 rows=1 loops=1)
-> Index Scan using tmp_facts7_a3_name_idx on tmp_facts7_a3 (cost=0.00..3.01 rows=1 width=4) (actual time=0.048..0.049 rows=1 loops=1)
Index Cond: (name = '00'::text)
-> Hash (cost=3.25..3.25 rows=1 width=4) (actual time=0.036..0.036 rows=1 loops=1)
-> Seq Scan on tmp_facts7_a2 (cost=0.00..3.25 rows=1 width=4) (actual time=0.003..0.034 rows=1 loops=1)
Filter: (name = '00'::text)
-> Hash (cost=1.12..1.12 rows=1 width=4) (actual time=0.013..0.013 rows=1 loops=1)
-> Seq Scan on tmp_facts7_a1 (cost=0.00..1.12 rows=1 width=4) (actual time=0.007..0.011 rows=1 loops=1)
Filter: (name = '00'::text)
Total runtime: 11530.519 ms


select count(*)
from tmp_facts7,tmp_facts7_a1,tmp_facts7_a2,tmp_facts7_a3,tmp_facts7_a4
where tmp_facts7.a1=tmp_facts7_a1.id and tmp_facts7.a2=tmp_facts7_a2.id and tmp_facts7.a3=tmp_facts7_a3.id and tmp_facts7.a4=tmp_facts7_a4.id
and tmp_facts7_a1.name='00' and tmp_facts7_a2.name='00' and tmp_facts7_a3.name='00' and tmp_facts7_a4.name='00';
90


explain analyze
select count(*)
from tmp_facts7,tmp_facts7_a1,tmp_facts7_a2,tmp_facts7_a3,tmp_facts7_a4
where tmp_facts7.a1=tmp_facts7_a1.id and tmp_facts7.a2=tmp_facts7_a2.id and tmp_facts7.a3=tmp_facts7_a3.id and tmp_facts7.a4=tmp_facts7_a4.id
and tmp_facts7_a1.name='00' and tmp_facts7_a2.name='00' and tmp_facts7_a3.name='00' and tmp_facts7_a4.name='00';
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Aggregate (cost=208849.46..208849.47 rows=1 width=0) (actual time=11471.422..11471.423 rows=1 loops=1)
-> Hash Join (cost=10.41..208849.45 rows=1 width=0) (actual time=55.361..11471.247 rows=90 loops=1)
Hash Cond: ("outer".a1 = "inner".id)
-> Hash Join (cost=9.28..208848.31 rows=1 width=4) (actual time=5.006..11470.182 rows=1711 loops=1)
Hash Cond: ("outer".a2 = "inner".id)
-> Hash Join (cost=6.03..208845.04 rows=1 width=8) (actual time=0.419..11454.973 rows=30963 loops=1)
Hash Cond: ("outer".a3 = "inner".id)
-> Hash Join (cost=3.02..208837.02 rows=1000 width=12) (actual time=0.100..11194.330 rows=556092 loops=1)
Hash Cond: ("outer".a4 = "inner".id)
-> Seq Scan on tmp_facts7 (cost=0.00..158824.00 rows=10000000 width=16) (actual time=0.011..6054.843 rows=10000000 loops=1)
-> Hash (cost=3.01..3.01 rows=1 width=4) (actual time=0.083..0.083 rows=1 loops=1)
-> Index Scan using tmp_facts7_a4_name_idx on tmp_facts7_a4 (cost=0.00..3.01 rows=1 width=4) (actual time=0.080..0.081 rows=1 loops=1)
Index Cond: (name = '00'::text)
-> Hash (cost=3.01..3.01 rows=1 width=4) (actual time=0.059..0.059 rows=1 loops=1)
-> Index Scan using tmp_facts7_a3_name_idx on tmp_facts7_a3 (cost=0.00..3.01 rows=1 width=4) (actual time=0.056..0.057 rows=1 loops=1)
Index Cond: (name = '00'::text)
-> Hash (cost=3.25..3.25 rows=1 width=4) (actual time=0.049..0.049 rows=1 loops=1)
-> Seq Scan on tmp_facts7_a2 (cost=0.00..3.25 rows=1 width=4) (actual time=0.010..0.048 rows=1 loops=1)
Filter: (name = '00'::text)
-> Hash (cost=1.12..1.12 rows=1 width=4) (actual time=0.025..0.025 rows=1 loops=1)
-> Seq Scan on tmp_facts7_a1 (cost=0.00..1.12 rows=1 width=4) (actual time=0.019..0.022 rows=1 loops=1)
Filter: (name = '00'::text)
Total runtime: 11471.499 ms


Денормализованная таблица (10M записей)

create table tmp_facts7(a1 text,a2 text,a3 text,a4 text);
insert into tmp_facts7(a1,a2,a3,a4)
select populate_varstring(18),populate_varstring(18),populate_varstring(18),populate_varstring(18) from generate_series(1,10000000);

create index tmp_facts7_a1_idx on tmp_facts7(a1);
create index tmp_facts7_a2_idx on tmp_facts7(a2);
create index tmp_facts7_a3_idx on tmp_facts7(a3);
create index tmp_facts7_a4_idx on tmp_facts7(a4);

select count(*) from tmp_facts7 where a1='0000000000' and a2='0000000000' and a3='0000000000' and a4='0000000000';
count
-------
94
(1 row)

explain analyze select count(*) from tmp_facts7 where a1='0000000000' and a2='0000000000' and a3='0000000000' and a4='0000000000';
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------------------------------
Aggregate (cost=16271.06..16271.07 rows=1 width=0) (actual time=567.677..567.678 rows=1 loops=1)
-> Bitmap Heap Scan on tmp_facts7 (cost=15897.42..16270.82 rows=95 width=0) (actual time=567.458..567.643 rows=94 loops=1)
Recheck Cond: ((a1 = '0000000000'::text) AND (a3 = '0000000000'::text) AND (a2 = '0000000000'::text) AND (a4 = '0000000000'::text))
-> BitmapAnd (cost=15897.42..15897.42 rows=95 width=0) (actual time=567.270..567.270 rows=0 loops=1)
-> Bitmap Index Scan on tmp_facts7_a1_idx (cost=0.00..3933.47 rows=550133 width=0) (actual time=128.749..128.749 rows=555230 loops=1)
Index Cond: (a1 = '0000000000'::text)
-> Bitmap Index Scan on tmp_facts7_a3_idx (cost=0.00..3943.60 rows=551600 width=0) (actual time=130.825..130.825 rows=556203 loops=1)
Index Cond: (a3 = '0000000000'::text)
-> Bitmap Index Scan on tmp_facts7_a2_idx (cost=0.00..3974.53 rows=555867 width=0) (actual time=126.721..126.721 rows=555106 loops=1)
Index Cond: (a2 = '0000000000'::text)
-> Bitmap Index Scan on tmp_facts7_a4_idx (cost=0.00..4045.07 rows=565733 width=0) (actual time=132.243..132.243 rows=555891 loops=1)
Index Cond: (a4 = '0000000000'::text)
Total runtime: 568.144 ms

SELECT relname, relpages, reltuples FROM pg_class
WHERE NOT relname LIKE 'pg_%' ORDER BY relpages DESC;
relname | relpages | reltuples
------------------------------------+----------+-----------
tmp_facts7 | 111003 | 1e+07
tmp_facts7_a1_idx | 36469 | 1e+07
tmp_facts7_a2_idx | 36468 | 1e+07
tmp_facts7_a3_idx | 36467 | 1e+07
tmp_facts7_a4_idx | 36467 | 1e+07


Выводы

При денормализации таблицы размер самой таблицы и индексов примерно удвоился, и во столько же раз упала скорость выборки. Заметим, что при тесте все данные помещались в буфере СУБД, при работе с диском можно ожидать худших результатов.

Что касается выборки из нормализованной таблицы со справочниками, то результаты просто отвратительные. Если присмотреться, то из каждого справочника выборается единственное значение (равное числу 2), и запрос должен выполняться с той же скоростью, что и при выборке из таблицы фактов с указанными значениями идентификаторов справочников.

Comments

Popular posts from this blog

Открытый софт для научных расчетов

Кольцевые структуры в геофизике

Модем Huawei E1550 в debian