Painfully Slow Postgres Query Using Where On Many Adjacent Rows
Solution 1:
Step1:use a window function to obtain adjacent records, avoiding the painful self-join (12 tables is very close to the limit where geqo takes over):
copy(
WITH stuff AS (
SELECT c1.id , c1.source, c1.word
, LEAD ( c1.word, 1) OVER (www) AS c2w
, LEAD (c1.word, 2) OVER (www) AS c3w
, LEAD ( c1.word, 3) OVER (www) AS c4w
, LEAD (c1.lemma, 3) OVER (www) AS c4l
, LEAD (c1.pos, 3) OVER (www) AS c4p
, LEAD (c1.pos, 4) OVER (www) AS c5p
, LEAD (c1.word, 4) OVER (www) AS c5w
, LEAD (c1.word, 5) OVER (www) AS c6w
, LEAD (c1.lemma, 5) OVER (www) AS c6l
, LEAD (c1.word, 6) OVER (www) AS c7w
, LEAD (c1.pos, 6) OVER (www) AS c7p
, LEAD (c1.word, 7) OVER (www) AS c8w
, LEAD (c1.word, 8) OVER (www) AS c9w
, LEAD (c1.lemma, 8) OVER (www) AS c9l
, LEAD (c1.pos, 8) OVER (www) AS c9p
, LEAD (c1.word, 9) OVER (www) AS c10w
, LEAD (c1.word, 10) OVER (www) AS c11w
FROM orderedflatcorpus AS c1
WINDOW www AS (ORDERBY id)
)
SELECT id , source, word
, c2w
, c3w
, c4w
, c4l
, c4p
, c5w
, c6w
, c7w
, c8w
, c9w
, c9l
, c9p
, c10w
, c11w
FROM stuff
WHERE1=1AND c4p LIKE'v%'AND c5p ='appge'AND c6l ='way'AND c7p LIKE'i%'AND c8w ='the'AND c9p LIKE'n%'ORDERBY id
)
-- TO '/home/postgres/Results/OUTPUT.csv' DELIMITER E'\t' csv header;TO'/tmp/OUTPUT2.csv' DELIMITER E'\t' csv header;
Step 2: [data model] The {word,lemma, pos} columns appear to be a low-cardinality group, you could squeeze them out into a separate token/lemma/pos-table:
-- An index to speedup the unique extraction and final update-- (the index will be dropped automatically-- once the columns are dropped)CREATE INDEX ON tmp.orderedflatcorpus (word, lemma, pos );
ANALYZE tmp.orderedflatcorpus;
-- table containing the "squeezed out" domainCREATETABLE tmp.words ASSELECTDISTINCT word, lemma, pos
FROM tmp.orderedflatcorpus
;
ALTERTABLE tmp.words
ADDCOLUMN id SERIAL NOTNULLPRIMARY KEY;
ALTERTABLE tmp.words
ADDUNIQUE (word , lemma, pos );
-- The original table needs an FK "link" to the new tableALTERTABLE tmp.orderedflatcorpus
ADDcolumn words_id INTEGER-- NOT NULLREFERENCES tmp.words(id)
;
-- FK constraints are helped a lot by a supportive index.CREATE INDEX orderedflatcorpus_words_id_fk ON tmp.orderedflatcorpus (words_id)
;
ANALYZE tmp.orderedflatcorpus;
ANALYZE tmp.words;
-- Initialize the FK column in the original table.-- we need NOT DISTINCT FROM here, since the joined-- columns could contain NULLs , which MUST compare equal.-- ------------------------------------------------------UPDATE tmp.orderedflatcorpus dst
SET words_id = src.id
FROM tmp.words src
WHERE src.word ISNOTDISTINCTFROM dst.word
AND dst.lemma ISNOTDISTINCTFROM src.lemma
AND dst.pos ISNOTDISTINCTFROM src.pos
;
ALTERTABLE tmp.orderedflatcorpus
DROPcolumn word
, DROPcolumn lemma
, DROPcolumn pos
;
And the new query, with a JOIN to the words-table:
copy(
WITH stuff AS (
SELECT c1.id , c1.source, w.word
, LEAD ( w.word, 1) OVER (www) AS c2w
, LEAD (w.word, 2) OVER (www) AS c3w
, LEAD ( w.word, 3) OVER (www) AS c4w
, LEAD (w.lemma, 3) OVER (www) AS c4l
, LEAD (w.pos, 3) OVER (www) AS c4p
, LEAD (w.pos, 4) OVER (www) AS c5p
, LEAD (w.word, 4) OVER (www) AS c5w
, LEAD (w.word, 5) OVER (www) AS c6w
, LEAD (w.lemma, 5) OVER (www) AS c6l
, LEAD (w.word, 6) OVER (www) AS c7w
, LEAD (w.pos, 6) OVER (www) AS c7p
, LEAD (w.word, 7) OVER (www) AS c8w
, LEAD (w.word, 8) OVER (www) AS c9w
, LEAD (w.lemma, 8) OVER (www) AS c9l
, LEAD (w.pos, 8) OVER (www) AS c9p
, LEAD (w.word, 9) OVER (www) AS c10w
, LEAD (w.word, 10) OVER (www) AS c11w
FROM orderedflatcorpus AS c1
JOIN words w ON w.id=c1.words_id
WINDOW www AS (ORDERBY c1.id)
)
SELECT id , source, word
, c2w , c3w
, c4w , c4l , c4p
, c5w
, c6w
, c7w
, c8w
, c9w , c9l , c9p
, c10w
, c11w
FROM stuff
WHERE1=1AND c4p LIKE'v%'AND c5p ='appge'AND c6l ='way'AND c7p LIKE'i%'AND c8w ='the'AND c9p LIKE'n%'ORDERBY id
)
-- TO '/home/postgres/Results/OUTPUT.csv' DELIMITER E'\t' csv header;TO'/tmp/OUTPUT3.csv' DELIMITER E'\t' csv header;
Note: I get two lines in the output, because I relaxed the conditions a bit too much ...
Update :the first query, avoiding the CTE
copy(
SELECT id , source, word
, c2w
, c3w
, c4w
, c4l
, c4p
, c5w
, c6w
, c7w
, c8w
, c9w
, c9l
, c9p
, c10w
, c11w
FROM (
SELECT c1.id , c1.source, c1.word
, LEAD ( c1.word, 1) OVER (www) AS c2w
, LEAD (c1.word, 2) OVER (www) AS c3w
, LEAD ( c1.word, 3) OVER (www) AS c4w
, LEAD (c1.lemma, 3) OVER (www) AS c4l
, LEAD (c1.pos, 3) OVER (www) AS c4p
, LEAD (c1.pos, 4) OVER (www) AS c5p
, LEAD (c1.word, 4) OVER (www) AS c5w
, LEAD (c1.word, 5) OVER (www) AS c6w
, LEAD (c1.lemma, 5) OVER (www) AS c6l
, LEAD (c1.word, 6) OVER (www) AS c7w
, LEAD (c1.pos, 6) OVER (www) AS c7p
, LEAD (c1.word, 7) OVER (www) AS c8w
, LEAD (c1.word, 8) OVER (www) AS c9w
, LEAD (c1.lemma, 8) OVER (www) AS c9l
, LEAD (c1.pos, 8) OVER (www) AS c9p
, LEAD (c1.word, 9) OVER (www) AS c10w
, LEAD (c1.word, 10) OVER (www) AS c11w
FROM orderedflatcorpus AS c1
WINDOW www AS (ORDERBY id)
) stuff
WHERE1=1AND c4p LIKE'v%'AND c5p ='appge'AND c6l ='way'AND c7p LIKE'i%'AND c8w ='the'AND c9p LIKE'n%'ORDERBY id
)
-- TO '/home/postgres/Results/OUTPUT.csv' DELIMITER E'\t' csv header;TO'/tmp/OUTPUT2a.csv' DELIMITER E'\t' csv header;
[a similar transformation could be performed on the second query]
UPDATE2 The subquery version for the two table variant.
-- copy(
-- EXPLAIN ANALYZE
SELECT c1i, c1s, c1w
, c2w , c3w
, c4w , c4l , c4p
, c5w
, c6w
, c7w
, c8w
, c9w , c9l , c9p
, c10w
, c11w
FROM (
SELECT c1.id AS c1i
, c1.source AS c1s
, w1.word AS c1w
, LEAD (w1.word, 1) OVER www AS c2w
, LEAD (w1.word, 2) OVER www AS c3w
, LEAD (w1.word, 3) OVER www AS c4w
, LEAD (w1.lemma, 3) OVER www AS c4l
, LEAD (w1.pos, 3) OVER www AS c4p
, LEAD (w1.pos, 4) OVER www AS c5p
, LEAD (w1.word, 4) OVER www AS c5w
, LEAD (w1.word, 5) OVER www AS c6w
, LEAD (w1.lemma, 5) OVER www AS c6l
, LEAD (w1.word, 6) OVER www AS c7w
, LEAD (w1.pos, 6) OVER www AS c7p
, LEAD (w1.word, 7) OVER www AS c8w
, LEAD (w1.word, 8) OVER www AS c9w
, LEAD (w1.lemma, 8) OVER www AS c9l
, LEAD (w1.pos, 8) OVER www AS c9p
, LEAD (w1.word, 9) OVER www AS c10w
, LEAD (w1.word, 10) OVER www AS c11w
FROM orderedflatcorpus c1
JOIN words w1 ON w1.id=c1.words_id
WHERE1=1
/* These *could* to prune out unmatched items, but I could notget it to work ...
AND EXISTS (SELECT *FROM orderedflatcorpus c4 JOIN words w4 ON w4.id=c4.words_id
WHERE c4.id = 3+c1.id -- AND w4.pos LIKE'v%'
) -- OMG
AND EXISTS (SELECT *FROM orderedflatcorpus c5 JOIN words w5 ON w5.id=c5.words_id
WHERE c5.id = 4+c1.id -- AND w5.pos = 'appge'
) -- OMG
AND EXISTS (SELECT *FROM orderedflatcorpus c7 JOIN words w7 ON w7.id=c7.words_id
WHERE c7.id = 6+c1.id -- AND w7.pos LIKE'i%'
) -- OMG
AND EXISTS (SELECT *FROM orderedflatcorpus c9 JOIN words w9 ON w9.id=c9.words_id
WHERE c9.id = 8+c1.id -- AND w9.pos LIKE'n%'
) -- OMG
AND EXISTS (SELECT *FROM orderedflatcorpus c8 JOIN words w8 ON w8.id=c8.words_id
WHERE c8.id = 7+c1.id -- AND w8.word = 'the'
) -- OMG
*/
WINDOW www AS (ORDERBY c1.id ROWS BETWEEN CURRENT ROW AND10 FOLLOWING)
) stuff
WHERE1=1AND c4p LIKE'v%'AND c5p = 'appge'AND c6l = 'way'AND c7p LIKE'i%'AND c8w = 'the'AND c9p LIKE'n%'ORDERBY c1i
;
-- )
-- TO'/home/postgres/Results/OUTPUT.csv' DELIMITER E'\t' csv header;
-- TO'/tmp/OUTPUT3b.csv' DELIMITER E'\t' csv header;
Solution 2:
I recommend using modern join syntax, which may well fix the problem:
SELECT
c1.source, c1.word, c2.word, c3.word, c4.word, c4.lemma, c4.pos, c5.word, c6.word, c7.word, c8.word, c9.word, c9.lemma, c9.pos, c10.word, c11.word
FROM orderedflatcorpus AS c1
JOIN orderedflatcorpus AS c2 ON c1.id + 1 = c2.id
JOIN orderedflatcorpus AS c3 ON c1.id + 2 = c3.id
JOIN orderedflatcorpus AS c4 ON c1.id + 3 = c4.id
JOIN orderedflatcorpus AS c5 ON c1.id + 4 = c5.id
JOIN orderedflatcorpus AS c6 ON c1.id + 5 = c6.id
JOIN orderedflatcorpus AS c7 ON c1.id + 6 = c7.id
JOIN orderedflatcorpus AS c8 ON c1.id + 7 = c8.id
JOIN orderedflatcorpus AS c9 ON c1.id + 8 = c9.id
JOIN orderedflatcorpus AS c10 ON c1.id + 9 = c10.id
JOIN orderedflatcorpus AS c11 ON c1.id + 10 = c11.id
WHERE c4.pos LIKE'v%'AND c5.pos = 'appge'AND c6.lemma = 'way'AND c7.pos LIKE'i%'AND c8.word = 'the'AND c9.pos LIKE'n%'
Notes:
- redundant
LIKE
s removed ORDER BY
removed, because it's very expensive. CSV (like table rows) don't need ordering to be valid. If you absolutely need ordering, use command line tools to order it after the execution of the query.
Solution 3:
Let's try to reformat your query just a bit and see what we can see. The first thing to do is to change it over to use ANSI-style joins so we can clearly see what the relationships are:
SELECT c1.source, c1.word, c2.word, c3.word, c4.word,
c4.lemma, c4.pos, c5.word, c6.word, c7.word,
c8.word, c9.word, c9.lemma, c9.pos, c10.word, c11.word
FROM orderedflatcorpus c1
INNER JOIN orderedflatcorpus c2
ON c2.ID = c1.ID + 1AND
c2.WORD LIKE'%'
INNER JOIN orderedflatcorpus c3
ON c3.ID = c1.ID + 2AND
c3.WORD LIKE'%'
INNER JOIN orderedflatcorpus c4
ON c4.ID = c1.ID + 3AND
c4.pos LIKE'v%'
INNER JOIN orderedflatcorpus c5
ON c5.ID = c1.ID + 4AND
c5.pos = 'appge'
INNER JOIN orderedflatcorpus c6
ON c6.ID = c1.ID + 5AND
c6.lemma = 'way'
INNER JOIN orderedflatcorpus c7
ON c7.ID = c1.ID + 6AND
c7.pos LIKE'i%'
INNER JOIN orderedflatcorpus c8
ON c8.ID = c1.ID + 7AND
c8.word = 'the'
INNER JOIN orderedflatcorpus c9
ON c9.ID = c1.ID + 8AND
c9.pos LIKE'n%'
INNER JOIN orderedflatcorpus c10
ON c10.ID = c1.ID + 9AND
c10.WORD LIKE'%'
INNER JOIN orderedflatcorpus c11
ON c11.ID = c1.ID + 10AND
c11.WORD LIKE'%'WHERE c1.WORD LIKE'%'ORDERBY c1.id
OK, first off - all those LIKE's are killing this query. Let's eliminate them where we can. I'm going to assume here that WORD can't be NULL in ORDEREDFLATCORPUS, and thus all the IS LIKE '%'
conditions can be eliminated:
SELECT c1.source, c1.word, c2.word, c3.word, c4.word,
c4.lemma, c4.pos, c5.word, c6.word, c7.word,
c8.word, c9.word, c9.lemma, c9.pos, c10.word, c11.word
FROM orderedflatcorpus c1
INNER JOIN orderedflatcorpus c2
ON c2.ID = c1.ID + 1
INNER JOIN orderedflatcorpus c3
ON c3.ID = c1.ID + 2
INNER JOIN orderedflatcorpus c4
ON c4.ID = c1.ID + 3AND
c4.pos LIKE'v%'
INNER JOIN orderedflatcorpus c5
ON c5.ID = c1.ID + 4AND
c5.pos = 'appge'
INNER JOIN orderedflatcorpus c6
ON c6.ID = c1.ID + 5AND
c6.lemma = 'way'
INNER JOIN orderedflatcorpus c7
ON c7.ID = c1.ID + 6AND
c7.pos LIKE'i%'
INNER JOIN orderedflatcorpus c8
ON c8.ID = c1.ID + 7AND
c8.word = 'the'
INNER JOIN orderedflatcorpus c9
ON c9.ID = c1.ID + 8AND
c9.pos LIKE'n%'
INNER JOIN orderedflatcorpus c10
ON c10.ID = c1.ID + 9
INNER JOIN orderedflatcorpus c11
ON c11.ID = c1.ID + 10ORDERBY c1.id
If WORD can be NULL, then you might need to use:
SELECT c1.source, c1.word, c2.word, c3.word, c4.word,
c4.lemma, c4.pos, c5.word, c6.word, c7.word,
c8.word, c9.word, c9.lemma, c9.pos, c10.word, c11.word
FROM orderedflatcorpus c1
INNER JOIN orderedflatcorpus c2
ON c2.ID = c1.ID + 1AND
c2.WORD ISNOT NULL
INNER JOIN orderedflatcorpus c3
ON c3.ID = c1.ID + 2AND
c3.WORD ISNOT NULL
INNER JOIN orderedflatcorpus c4
ON c4.ID = c1.ID + 3AND
c4.pos LIKE'v%'
INNER JOIN orderedflatcorpus c5
ON c5.ID = c1.ID + 4AND
c5.pos = 'appge'
INNER JOIN orderedflatcorpus c6
ON c6.ID = c1.ID + 5AND
c6.lemma = 'way'
INNER JOIN orderedflatcorpus c7
ON c7.ID = c1.ID + 6AND
c7.pos LIKE'i%'
INNER JOIN orderedflatcorpus c8
ON c8.ID = c1.ID + 7AND
c8.word = 'the'
INNER JOIN orderedflatcorpus c9
ON c9.ID = c1.ID + 8AND
c9.pos LIKE'n%'
INNER JOIN orderedflatcorpus c10
ON c10.ID = c1.ID + 9AND
c10.WORD ISNOT NULL
INNER JOIN orderedflatcorpus c11
ON c11.ID = c1.ID + 10AND
c11.WORD ISNOT NULL
WHERE c1.WORD ISNOT NULL
ORDERBY c1.id
Next - this query needs to do as much filtering as it can as early as it possibly can. The database query optimizer may be able to figure this out, but let's give it some help by putting the equijoins first in the join list, and then adjusting the ID calculations to reflect the information we're getting first:
SELECT c1.source, c1.word, c2.word, c3.word, c4.word,
c4.lemma, c4.pos, c5.word, c6.word, c7.word,
c8.word, c9.word, c9.lemma, c9.pos, c10.word, c11.word
FROM DUAL
INNER JOIN orderedflatcorpus c5
ON c5.pos = 'appge'
INNER JOIN orderedflatcorpus c6
ON c6.ID = c5.ID + 1AND
c6.lemma = 'way'
INNER JOIN orderedflatcorpus c8
ON c8.ID = c5.ID + 3AND
c8.word = 'the'
INNER JOIN orderedflatcorpus c1
ON c1.ID = c5.ID - 4AND
INNER JOIN orderedflatcorpus c2
ON c2.ID = c5.ID - 3
INNER JOIN orderedflatcorpus c3
ON c3.ID = c5.ID - 2
INNER JOIN orderedflatcorpus c4
ON c4.ID = c5.ID - 1AND
c4.pos LIKE'v%'
INNER JOIN orderedflatcorpus c7
ON c7.ID = c5.ID + 2AND
c7.pos LIKE'i%'
INNER JOIN orderedflatcorpus c9
ON c9.ID = c5.ID + 4AND
c9.pos LIKE'n%'
INNER JOIN orderedflatcorpus c10
ON c10.ID = c5.ID + 5
INNER JOIN orderedflatcorpus c11
ON c11.ID = c5.ID + 6ORDERBY c1.id
Next we need to consider what indexes would be most useful. I think the following indexes would be worth having:
(ID)
(ID, WORD)
(ID, LEMMA)
(ID, POS)
Put those indexes on, run this query, and see if it helps. Also, check the ID calculations - I think I got them right but... :-)
Best of luck.
Post a Comment for "Painfully Slow Postgres Query Using Where On Many Adjacent Rows"