diff --git a/contrib/Makefile b/contrib/Makefile index 2f0a88d3f77..dd04c20acd2 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -34,6 +34,7 @@ SUBDIRS = \ pg_freespacemap \ pg_logicalinspect \ pg_overexplain \ + pg_plan_advice \ pg_prewarm \ pg_stat_statements \ pg_surgery \ diff --git a/contrib/btree_gist/btree_utils_var.c b/contrib/btree_gist/btree_utils_var.c index 6847e4e54d5..f6ba1c0c825 100644 --- a/contrib/btree_gist/btree_utils_var.c +++ b/contrib/btree_gist/btree_utils_var.c @@ -115,36 +115,47 @@ gbt_var_leaf2node(GBT_VARKEY *leaf, const gbtree_vinfo *tinfo, FmgrInfo *flinfo) /* * returns the common prefix length of a node key + * + * If the underlying type is character data, the prefix length may point in + * the middle of a multibyte character. */ static int32 gbt_var_node_cp_len(const GBT_VARKEY *node, const gbtree_vinfo *tinfo) { GBT_VARKEY_R r = gbt_var_key_readable(node); int32 i = 0; - int32 l = 0; + int32 l_left_to_match = 0; + int32 l_total = 0; int32 t1len = VARSIZE(r.lower) - VARHDRSZ; int32 t2len = VARSIZE(r.upper) - VARHDRSZ; int32 ml = Min(t1len, t2len); char *p1 = VARDATA(r.lower); char *p2 = VARDATA(r.upper); + const char *end1 = p1 + t1len; + const char *end2 = p2 + t2len; if (ml == 0) return 0; while (i < ml) { - if (tinfo->eml > 1 && l == 0) + if (tinfo->eml > 1 && l_left_to_match == 0) { - if ((l = pg_mblen(p1)) != pg_mblen(p2)) + l_total = pg_mblen_range(p1, end1); + if (l_total != pg_mblen_range(p2, end2)) { return i; } + l_left_to_match = l_total; } if (*p1 != *p2) { if (tinfo->eml > 1) { - return (i - l + 1); + int32 l_matched_subset = l_total - l_left_to_match; + + /* end common prefix at final byte of last matching char */ + return i - l_matched_subset; } else { @@ -154,7 +165,7 @@ gbt_var_node_cp_len(const GBT_VARKEY *node, const gbtree_vinfo *tinfo) p1++; p2++; - l--; + l_left_to_match--; i++; } return ml; /* lower == upper */ diff --git a/contrib/dblink/dblink.c b/contrib/dblink/dblink.c index 8cb3166495c..2498d80c8e7 
100644 --- a/contrib/dblink/dblink.c +++ b/contrib/dblink/dblink.c @@ -2069,6 +2069,7 @@ get_text_array_contents(ArrayType *array, int *numitems) int16 typlen; bool typbyval; char typalign; + uint8 typalignby; char **values; char *ptr; bits8 *bitmap; @@ -2081,6 +2082,7 @@ get_text_array_contents(ArrayType *array, int *numitems) get_typlenbyvalalign(ARR_ELEMTYPE(array), &typlen, &typbyval, &typalign); + typalignby = typalign_to_alignby(typalign); values = palloc_array(char *, nitems); @@ -2098,7 +2100,7 @@ get_text_array_contents(ArrayType *array, int *numitems) { values[i] = TextDatumGetCString(PointerGetDatum(ptr)); ptr = att_addlength_pointer(ptr, typlen, ptr); - ptr = (char *) att_align_nominal(ptr, typalign); + ptr = (char *) att_nominal_alignby(ptr, typalignby); } /* advance bitmap pointer if any */ diff --git a/contrib/dict_xsyn/dict_xsyn.c b/contrib/dict_xsyn/dict_xsyn.c index 5c4917ce1fc..9e3784e0f47 100644 --- a/contrib/dict_xsyn/dict_xsyn.c +++ b/contrib/dict_xsyn/dict_xsyn.c @@ -54,14 +54,14 @@ find_word(char *in, char **end) *end = NULL; while (*in && isspace((unsigned char) *in)) - in += pg_mblen(in); + in += pg_mblen_cstr(in); if (!*in || *in == '#') return NULL; start = in; while (*in && !isspace((unsigned char) *in)) - in += pg_mblen(in); + in += pg_mblen_cstr(in); *end = in; diff --git a/contrib/hstore/hstore_io.c b/contrib/hstore/hstore_io.c index 34e3918811c..9cdfcb5daa0 100644 --- a/contrib/hstore/hstore_io.c +++ b/contrib/hstore/hstore_io.c @@ -67,7 +67,7 @@ prssyntaxerror(HSParser *state) errsave(state->escontext, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error in hstore, near \"%.*s\" at position %d", - pg_mblen(state->ptr), state->ptr, + pg_mblen_cstr(state->ptr), state->ptr, (int) (state->ptr - state->begin)))); /* In soft error situation, return false as convenience for caller */ return false; diff --git a/contrib/hstore_plperl/hstore_plperl.c b/contrib/hstore_plperl/hstore_plperl.c index 31393b4fa50..69001191cc0 100644 --- 
a/contrib/hstore_plperl/hstore_plperl.c +++ b/contrib/hstore_plperl/hstore_plperl.c @@ -21,6 +21,13 @@ static hstoreCheckKeyLen_t hstoreCheckKeyLen_p; typedef size_t (*hstoreCheckValLen_t) (size_t len); static hstoreCheckValLen_t hstoreCheckValLen_p; +/* Static asserts verify that typedefs above match original declarations */ +StaticAssertVariableIsOfType(&hstoreUpgrade, hstoreUpgrade_t); +StaticAssertVariableIsOfType(&hstoreUniquePairs, hstoreUniquePairs_t); +StaticAssertVariableIsOfType(&hstorePairs, hstorePairs_t); +StaticAssertVariableIsOfType(&hstoreCheckKeyLen, hstoreCheckKeyLen_t); +StaticAssertVariableIsOfType(&hstoreCheckValLen, hstoreCheckValLen_t); + /* * Module initialize function: fetch function pointers for cross-module calls. @@ -28,24 +35,18 @@ static hstoreCheckValLen_t hstoreCheckValLen_p; void _PG_init(void) { - /* Asserts verify that typedefs above match original declarations */ - AssertVariableIsOfType(&hstoreUpgrade, hstoreUpgrade_t); hstoreUpgrade_p = (hstoreUpgrade_t) load_external_function("$libdir/hstore", "hstoreUpgrade", true, NULL); - AssertVariableIsOfType(&hstoreUniquePairs, hstoreUniquePairs_t); hstoreUniquePairs_p = (hstoreUniquePairs_t) load_external_function("$libdir/hstore", "hstoreUniquePairs", true, NULL); - AssertVariableIsOfType(&hstorePairs, hstorePairs_t); hstorePairs_p = (hstorePairs_t) load_external_function("$libdir/hstore", "hstorePairs", true, NULL); - AssertVariableIsOfType(&hstoreCheckKeyLen, hstoreCheckKeyLen_t); hstoreCheckKeyLen_p = (hstoreCheckKeyLen_t) load_external_function("$libdir/hstore", "hstoreCheckKeyLen", true, NULL); - AssertVariableIsOfType(&hstoreCheckValLen, hstoreCheckValLen_t); hstoreCheckValLen_p = (hstoreCheckValLen_t) load_external_function("$libdir/hstore", "hstoreCheckValLen", true, NULL); diff --git a/contrib/hstore_plpython/hstore_plpython.c b/contrib/hstore_plpython/hstore_plpython.c index e2bfc6da38e..d2be030e07c 100644 --- a/contrib/hstore_plpython/hstore_plpython.c +++ 
b/contrib/hstore_plpython/hstore_plpython.c @@ -28,6 +28,15 @@ static hstoreCheckKeyLen_t hstoreCheckKeyLen_p; typedef size_t (*hstoreCheckValLen_t) (size_t len); static hstoreCheckValLen_t hstoreCheckValLen_p; +/* Static asserts verify that typedefs above match original declarations */ +StaticAssertVariableIsOfType(&PLyObject_AsString, PLyObject_AsString_t); +StaticAssertVariableIsOfType(&PLyUnicode_FromStringAndSize, PLyUnicode_FromStringAndSize_t); +StaticAssertVariableIsOfType(&hstoreUpgrade, hstoreUpgrade_t); +StaticAssertVariableIsOfType(&hstoreUniquePairs, hstoreUniquePairs_t); +StaticAssertVariableIsOfType(&hstorePairs, hstorePairs_t); +StaticAssertVariableIsOfType(&hstoreCheckKeyLen, hstoreCheckKeyLen_t); +StaticAssertVariableIsOfType(&hstoreCheckValLen, hstoreCheckValLen_t); + /* * Module initialize function: fetch function pointers for cross-module calls. @@ -35,32 +44,24 @@ static hstoreCheckValLen_t hstoreCheckValLen_p; void _PG_init(void) { - /* Asserts verify that typedefs above match original declarations */ - AssertVariableIsOfType(&PLyObject_AsString, PLyObject_AsString_t); PLyObject_AsString_p = (PLyObject_AsString_t) load_external_function("$libdir/" PLPYTHON_LIBNAME, "PLyObject_AsString", true, NULL); - AssertVariableIsOfType(&PLyUnicode_FromStringAndSize, PLyUnicode_FromStringAndSize_t); PLyUnicode_FromStringAndSize_p = (PLyUnicode_FromStringAndSize_t) load_external_function("$libdir/" PLPYTHON_LIBNAME, "PLyUnicode_FromStringAndSize", true, NULL); - AssertVariableIsOfType(&hstoreUpgrade, hstoreUpgrade_t); hstoreUpgrade_p = (hstoreUpgrade_t) load_external_function("$libdir/hstore", "hstoreUpgrade", true, NULL); - AssertVariableIsOfType(&hstoreUniquePairs, hstoreUniquePairs_t); hstoreUniquePairs_p = (hstoreUniquePairs_t) load_external_function("$libdir/hstore", "hstoreUniquePairs", true, NULL); - AssertVariableIsOfType(&hstorePairs, hstorePairs_t); hstorePairs_p = (hstorePairs_t) load_external_function("$libdir/hstore", "hstorePairs", true, 
NULL); - AssertVariableIsOfType(&hstoreCheckKeyLen, hstoreCheckKeyLen_t); hstoreCheckKeyLen_p = (hstoreCheckKeyLen_t) load_external_function("$libdir/hstore", "hstoreCheckKeyLen", true, NULL); - AssertVariableIsOfType(&hstoreCheckValLen, hstoreCheckValLen_t); hstoreCheckValLen_p = (hstoreCheckValLen_t) load_external_function("$libdir/hstore", "hstoreCheckValLen", true, NULL); diff --git a/contrib/intarray/_int_selfuncs.c b/contrib/intarray/_int_selfuncs.c index 4a7053028c6..7fce743632f 100644 --- a/contrib/intarray/_int_selfuncs.c +++ b/contrib/intarray/_int_selfuncs.c @@ -19,6 +19,7 @@ #include "catalog/pg_operator.h" #include "catalog/pg_statistic.h" #include "catalog/pg_type.h" +#include "commands/extension.h" #include "miscadmin.h" #include "utils/fmgrprotos.h" #include "utils/lsyscache.h" @@ -170,7 +171,18 @@ _int_matchsel(PG_FUNCTION_ARGS) PG_RETURN_FLOAT8(0.0); } - /* The caller made sure the const is a query, so get it now */ + /* + * Verify that the Const is a query_int, else return a default estimate. + * (This could only fail if someone attached this estimator to the wrong + * operator.) 
+ */ + if (((Const *) other)->consttype != + get_function_sibling_type(fcinfo->flinfo->fn_oid, "query_int")) + { + ReleaseVariableStats(vardata); + PG_RETURN_FLOAT8(DEFAULT_EQ_SEL); + } + query = DatumGetQueryTypeP(((Const *) other)->constvalue); /* Empty query matches nothing */ diff --git a/contrib/jsonb_plpython/jsonb_plpython.c b/contrib/jsonb_plpython/jsonb_plpython.c index 7e8e1d6674f..c2c4ce37c08 100644 --- a/contrib/jsonb_plpython/jsonb_plpython.c +++ b/contrib/jsonb_plpython/jsonb_plpython.c @@ -33,22 +33,24 @@ typedef PyObject *(*PLyUnicode_FromStringAndSize_t) (const char *s, Py_ssize_t size); static PLyUnicode_FromStringAndSize_t PLyUnicode_FromStringAndSize_p; +/* Static asserts verify that typedefs above match original declarations */ +StaticAssertVariableIsOfType(&PLyObject_AsString, PLyObject_AsString_t); +StaticAssertVariableIsOfType(&PLyUnicode_FromStringAndSize, PLyUnicode_FromStringAndSize_t); +StaticAssertVariableIsOfType(&PLy_elog_impl, PLy_elog_impl_t); + + /* * Module initialize function: fetch function pointers for cross-module calls. 
*/ void _PG_init(void) { - /* Asserts verify that typedefs above match original declarations */ - AssertVariableIsOfType(&PLyObject_AsString, PLyObject_AsString_t); PLyObject_AsString_p = (PLyObject_AsString_t) load_external_function("$libdir/" PLPYTHON_LIBNAME, "PLyObject_AsString", true, NULL); - AssertVariableIsOfType(&PLyUnicode_FromStringAndSize, PLyUnicode_FromStringAndSize_t); PLyUnicode_FromStringAndSize_p = (PLyUnicode_FromStringAndSize_t) load_external_function("$libdir/" PLPYTHON_LIBNAME, "PLyUnicode_FromStringAndSize", true, NULL); - AssertVariableIsOfType(&PLy_elog_impl, PLy_elog_impl_t); PLy_elog_impl_p = (PLy_elog_impl_t) load_external_function("$libdir/" PLPYTHON_LIBNAME, "PLy_elog_impl", true, NULL); diff --git a/contrib/ltree/crc32.c b/contrib/ltree/crc32.c index 3918d4a0ec2..d21bed31fdd 100644 --- a/contrib/ltree/crc32.c +++ b/contrib/ltree/crc32.c @@ -23,6 +23,7 @@ ltree_crc32_sz(const char *buf, int size) { pg_crc32 crc; const char *p = buf; + const char *end = buf + size; static pg_locale_t locale = NULL; if (!locale) @@ -32,7 +33,7 @@ ltree_crc32_sz(const char *buf, int size) while (size > 0) { char foldstr[UNICODE_CASEMAP_BUFSZ]; - int srclen = pg_mblen(p); + int srclen = pg_mblen_range(p, end); size_t foldlen; /* fold one codepoint at a time */ diff --git a/contrib/ltree/lquery_op.c b/contrib/ltree/lquery_op.c index a28ddbf40de..0adcdd8ff2a 100644 --- a/contrib/ltree/lquery_op.c +++ b/contrib/ltree/lquery_op.c @@ -27,14 +27,14 @@ getlexeme(char *start, char *end, int *len) char *ptr; while (start < end && t_iseq(start, '_')) - start += pg_mblen(start); + start += pg_mblen_range(start, end); ptr = start; if (ptr >= end) return NULL; while (ptr < end && !t_iseq(ptr, '_')) - ptr += pg_mblen(ptr); + ptr += pg_mblen_range(ptr, end); *len = ptr - start; return start; diff --git a/contrib/ltree/ltree.h b/contrib/ltree/ltree.h index 78478dec173..b0ded40eba9 100644 --- a/contrib/ltree/ltree.h +++ b/contrib/ltree/ltree.h @@ -127,7 +127,7 @@ typedef 
struct #define LQUERY_HASNOT 0x01 /* valid label chars are alphanumerics, underscores and hyphens */ -#define ISLABEL(x) ( t_isalnum(x) || t_iseq(x, '_') || t_iseq(x, '-') ) +#define ISLABEL(x) ( t_isalnum_cstr(x) || t_iseq(x, '_') || t_iseq(x, '-') ) /* full text query */ diff --git a/contrib/ltree/ltree_io.c b/contrib/ltree/ltree_io.c index 59c4462df80..54c4ca3c5c3 100644 --- a/contrib/ltree/ltree_io.c +++ b/contrib/ltree/ltree_io.c @@ -54,7 +54,7 @@ parse_ltree(const char *buf, struct Node *escontext) ptr = buf; while (*ptr) { - charlen = pg_mblen(ptr); + charlen = pg_mblen_cstr(ptr); if (t_iseq(ptr, '.')) num++; ptr += charlen; @@ -69,7 +69,7 @@ parse_ltree(const char *buf, struct Node *escontext) ptr = buf; while (*ptr) { - charlen = pg_mblen(ptr); + charlen = pg_mblen_cstr(ptr); switch (state) { @@ -291,7 +291,7 @@ parse_lquery(const char *buf, struct Node *escontext) ptr = buf; while (*ptr) { - charlen = pg_mblen(ptr); + charlen = pg_mblen_cstr(ptr); if (t_iseq(ptr, '.')) num++; @@ -311,7 +311,7 @@ parse_lquery(const char *buf, struct Node *escontext) ptr = buf; while (*ptr) { - charlen = pg_mblen(ptr); + charlen = pg_mblen_cstr(ptr); switch (state) { diff --git a/contrib/ltree/ltxtquery_io.c b/contrib/ltree/ltxtquery_io.c index 91a2222eaa9..d15f3235393 100644 --- a/contrib/ltree/ltxtquery_io.c +++ b/contrib/ltree/ltxtquery_io.c @@ -64,7 +64,7 @@ gettoken_query(QPRS_STATE *state, int32 *val, int32 *lenval, char **strval, uint for (;;) { - charlen = pg_mblen(state->buf); + charlen = pg_mblen_cstr(state->buf); switch (state->state) { diff --git a/contrib/ltree_plpython/ltree_plpython.c b/contrib/ltree_plpython/ltree_plpython.c index 0493aeb2423..d4e7b613fa1 100644 --- a/contrib/ltree_plpython/ltree_plpython.c +++ b/contrib/ltree_plpython/ltree_plpython.c @@ -13,6 +13,9 @@ PG_MODULE_MAGIC_EXT( typedef PyObject *(*PLyUnicode_FromStringAndSize_t) (const char *s, Py_ssize_t size); static PLyUnicode_FromStringAndSize_t PLyUnicode_FromStringAndSize_p; +/* Static 
asserts verify that typedefs above match original declarations */ +StaticAssertVariableIsOfType(&PLyUnicode_FromStringAndSize, PLyUnicode_FromStringAndSize_t); + /* * Module initialize function: fetch function pointers for cross-module calls. @@ -20,8 +23,6 @@ static PLyUnicode_FromStringAndSize_t PLyUnicode_FromStringAndSize_p; void _PG_init(void) { - /* Asserts verify that typedefs above match original declarations */ - AssertVariableIsOfType(&PLyUnicode_FromStringAndSize, PLyUnicode_FromStringAndSize_t); PLyUnicode_FromStringAndSize_p = (PLyUnicode_FromStringAndSize_t) load_external_function("$libdir/" PLPYTHON_LIBNAME, "PLyUnicode_FromStringAndSize", true, NULL); diff --git a/contrib/meson.build b/contrib/meson.build index def13257cbe..5a752eac347 100644 --- a/contrib/meson.build +++ b/contrib/meson.build @@ -48,6 +48,7 @@ subdir('pgcrypto') subdir('pg_freespacemap') subdir('pg_logicalinspect') subdir('pg_overexplain') +subdir('pg_plan_advice') subdir('pg_prewarm') subdir('pgrowlocks') subdir('pg_stat_statements') diff --git a/contrib/oid2name/oid2name.c b/contrib/oid2name/oid2name.c index 51802907138..63e6ce2dae8 100644 --- a/contrib/oid2name/oid2name.c +++ b/contrib/oid2name/oid2name.c @@ -469,7 +469,7 @@ void sql_exec_dumpalltables(PGconn *conn, struct options *opts) { char todo[1024]; - char *addfields = ",c.oid AS \"Oid\", nspname AS \"Schema\", spcname as \"Tablespace\" "; + char *addfields = ",c.oid AS \"Oid\", nspname AS \"Schema\", spcname as \"Tablespace\", pg_relation_filepath(c.oid) as \"Path\" "; snprintf(todo, sizeof(todo), "SELECT pg_catalog.pg_relation_filenode(c.oid) as \"Filenode\", relname as \"Table Name\" %s " @@ -507,7 +507,7 @@ sql_exec_searchtables(PGconn *conn, struct options *opts) *comma_filenumbers, *comma_tables; bool written = false; - char *addfields = ",c.oid AS \"Oid\", nspname AS \"Schema\", spcname as \"Tablespace\" "; + char *addfields = ",c.oid AS \"Oid\", nspname AS \"Schema\", spcname as \"Tablespace\", 
pg_relation_filepath(c.oid) as \"Path\" "; /* get tables qualifiers, whether names, filenumbers, or OIDs */ comma_oids = get_comma_elts(opts->oids); diff --git a/contrib/pageinspect/heapfuncs.c b/contrib/pageinspect/heapfuncs.c index 8277fa256c3..2f0dfff175a 100644 --- a/contrib/pageinspect/heapfuncs.c +++ b/contrib/pageinspect/heapfuncs.c @@ -101,7 +101,7 @@ text_to_bits(char *str, int len) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("invalid character \"%.*s\" in t_bits string", - pg_mblen(str + off), str + off))); + pg_mblen_cstr(str + off), str + off))); if (off % 8 == 7) bits[off / 8] = byte; diff --git a/contrib/pg_overexplain/expected/pg_overexplain.out b/contrib/pg_overexplain/expected/pg_overexplain.out index 55d34666d87..f376d2e7996 100644 --- a/contrib/pg_overexplain/expected/pg_overexplain.out +++ b/contrib/pg_overexplain/expected/pg_overexplain.out @@ -104,6 +104,7 @@ $$); Parallel Safe: true Plan Node ID: 2 Append RTIs: 1 + Child Append RTIs: none -> Seq Scan on brassica vegetables_1 Disabled Nodes: 0 Parallel Safe: true @@ -142,7 +143,7 @@ $$); Relation Kind: relation Relation Lock Mode: AccessShareLock Unprunable RTIs: 1 3 4 -(53 rows) +(54 rows) -- Test a different output format. SELECT explain_filter($$ @@ -197,6 +198,7 @@ $$); none + none + 1 + + none + 0 + + + @@ -452,6 +454,8 @@ SELECT * FROM vegetables WHERE genus = 'daucus'; Seq Scan on daucus vegetables Filter: (genus = 'daucus'::text) Scan RTI: 2 + Elided Node Type: Append + Elided Node RTIs: 1 RTI 1 (relation, inherited, in-from-clause): Eref: vegetables (id, name, genus) Relation: vegetables @@ -465,7 +469,7 @@ SELECT * FROM vegetables WHERE genus = 'daucus'; Relation Kind: relation Relation Lock Mode: AccessShareLock Unprunable RTIs: 1 2 -(16 rows) +(18 rows) -- Also test a case that involves a write. 
EXPLAIN (RANGE_TABLE, COSTS OFF) @@ -489,3 +493,122 @@ INSERT INTO vegetables (name, genus) VALUES ('broccoflower', 'brassica'); Result RTIs: 1 (15 rows) +-- should show "Subplan: sub" +EXPLAIN (RANGE_TABLE, COSTS OFF) +SELECT * FROM vegetables v, + (SELECT * FROM vegetables WHERE genus = 'daucus' OFFSET 0) sub; + QUERY PLAN +---------------------------------------------- + Nested Loop + -> Seq Scan on daucus vegetables + Filter: (genus = 'daucus'::text) + Scan RTI: 6 + Elided Node Type: Append + Elided Node RTIs: 5 + Elided Node Type: SubqueryScan + Elided Node RTIs: 2 + -> Append + Append RTIs: 1 + Child Append RTIs: none + -> Seq Scan on brassica v_1 + Scan RTI: 3 + -> Seq Scan on daucus v_2 + Scan RTI: 4 + RTI 1 (relation, inherited, in-from-clause): + Alias: v () + Eref: v (id, name, genus) + Relation: vegetables + Relation Kind: partitioned_table + Relation Lock Mode: AccessShareLock + Permission Info Index: 1 + RTI 2 (subquery, in-from-clause): + Alias: sub () + Eref: sub (id, name, genus) + RTI 3 (relation, in-from-clause): + Alias: v (id, name, genus) + Eref: v (id, name, genus) + Relation: brassica + Relation Kind: relation + Relation Lock Mode: AccessShareLock + RTI 4 (relation, in-from-clause): + Alias: v (id, name, genus) + Eref: v (id, name, genus) + Relation: daucus + Relation Kind: relation + Relation Lock Mode: AccessShareLock + RTI 5 (relation, inherited, in-from-clause): + Subplan: sub + Eref: vegetables (id, name, genus) + Relation: vegetables + Relation Kind: partitioned_table + Relation Lock Mode: AccessShareLock + Permission Info Index: 2 + RTI 6 (relation, in-from-clause): + Subplan: sub + Alias: vegetables (id, name, genus) + Eref: vegetables (id, name, genus) + Relation: daucus + Relation Kind: relation + Relation Lock Mode: AccessShareLock + Unprunable RTIs: 1 3 4 5 6 +(52 rows) + +-- should show "Subplan: unnamed_subquery" +EXPLAIN (RANGE_TABLE, COSTS OFF) +SELECT * FROM vegetables v, + (SELECT * FROM vegetables WHERE genus = 'daucus' 
OFFSET 0); + QUERY PLAN +---------------------------------------------- + Nested Loop + -> Seq Scan on daucus vegetables + Filter: (genus = 'daucus'::text) + Scan RTI: 6 + Elided Node Type: Append + Elided Node RTIs: 5 + Elided Node Type: SubqueryScan + Elided Node RTIs: 2 + -> Append + Append RTIs: 1 + Child Append RTIs: none + -> Seq Scan on brassica v_1 + Scan RTI: 3 + -> Seq Scan on daucus v_2 + Scan RTI: 4 + RTI 1 (relation, inherited, in-from-clause): + Alias: v () + Eref: v (id, name, genus) + Relation: vegetables + Relation Kind: partitioned_table + Relation Lock Mode: AccessShareLock + Permission Info Index: 1 + RTI 2 (subquery, in-from-clause): + Eref: unnamed_subquery (id, name, genus) + RTI 3 (relation, in-from-clause): + Alias: v (id, name, genus) + Eref: v (id, name, genus) + Relation: brassica + Relation Kind: relation + Relation Lock Mode: AccessShareLock + RTI 4 (relation, in-from-clause): + Alias: v (id, name, genus) + Eref: v (id, name, genus) + Relation: daucus + Relation Kind: relation + Relation Lock Mode: AccessShareLock + RTI 5 (relation, inherited, in-from-clause): + Subplan: unnamed_subquery + Eref: vegetables (id, name, genus) + Relation: vegetables + Relation Kind: partitioned_table + Relation Lock Mode: AccessShareLock + Permission Info Index: 2 + RTI 6 (relation, in-from-clause): + Subplan: unnamed_subquery + Alias: vegetables (id, name, genus) + Eref: vegetables (id, name, genus) + Relation: daucus + Relation Kind: relation + Relation Lock Mode: AccessShareLock + Unprunable RTIs: 1 3 4 5 6 +(51 rows) + diff --git a/contrib/pg_overexplain/pg_overexplain.c b/contrib/pg_overexplain/pg_overexplain.c index 316ffd1c87f..36e6aac0e2c 100644 --- a/contrib/pg_overexplain/pg_overexplain.c +++ b/contrib/pg_overexplain/pg_overexplain.c @@ -54,6 +54,8 @@ static void overexplain_alias(const char *qlabel, Alias *alias, ExplainState *es); static void overexplain_bitmapset(const char *qlabel, Bitmapset *bms, ExplainState *es); +static void 
overexplain_bitmapset_list(const char *qlabel, List *bms_list, + ExplainState *es); static void overexplain_intlist(const char *qlabel, List *list, ExplainState *es); @@ -191,6 +193,8 @@ overexplain_per_node_hook(PlanState *planstate, List *ancestors, */ if (options->range_table) { + bool opened_elided_nodes = false; + switch (nodeTag(plan)) { case T_SeqScan: @@ -230,11 +234,17 @@ overexplain_per_node_hook(PlanState *planstate, List *ancestors, overexplain_bitmapset("Append RTIs", ((Append *) plan)->apprelids, es); + overexplain_bitmapset_list("Child Append RTIs", + ((Append *) plan)->child_append_relid_sets, + es); break; case T_MergeAppend: overexplain_bitmapset("Append RTIs", ((MergeAppend *) plan)->apprelids, es); + overexplain_bitmapset_list("Child Append RTIs", + ((MergeAppend *) plan)->child_append_relid_sets, + es); break; case T_Result: @@ -251,6 +261,43 @@ overexplain_per_node_hook(PlanState *planstate, List *ancestors, default: break; } + + foreach_node(ElidedNode, n, es->pstmt->elidedNodes) + { + char *elidednodetag; + + if (n->plan_node_id != plan->plan_node_id) + continue; + + if (!opened_elided_nodes) + { + ExplainOpenGroup("Elided Nodes", "Elided Nodes", false, es); + opened_elided_nodes = true; + } + + switch (n->elided_type) + { + case T_Append: + elidednodetag = "Append"; + break; + case T_MergeAppend: + elidednodetag = "MergeAppend"; + break; + case T_SubqueryScan: + elidednodetag = "SubqueryScan"; + break; + default: + elidednodetag = psprintf("%d", n->elided_type); + break; + } + + ExplainOpenGroup("Elided Node", NULL, true, es); + ExplainPropertyText("Elided Node Type", elidednodetag, es); + overexplain_bitmapset("Elided Node RTIs", n->relids, es); + ExplainCloseGroup("Elided Node", NULL, true, es); + } + if (opened_elided_nodes) + ExplainCloseGroup("Elided Nodes", "Elided Nodes", false, es); } } @@ -395,6 +442,8 @@ static void overexplain_range_table(PlannedStmt *plannedstmt, ExplainState *es) { Index rti; + ListCell *lc_subrtinfo = 
list_head(plannedstmt->subrtinfos); + SubPlanRTInfo *rtinfo = NULL; /* Open group, one entry per RangeTblEntry */ ExplainOpenGroup("Range Table", "Range Table", false, es); @@ -405,6 +454,18 @@ overexplain_range_table(PlannedStmt *plannedstmt, ExplainState *es) RangeTblEntry *rte = rt_fetch(rti, plannedstmt->rtable); char *kind = NULL; char *relkind; + SubPlanRTInfo *next_rtinfo; + + /* Advance to next SubRTInfo, if it's time. */ + if (lc_subrtinfo != NULL) + { + next_rtinfo = lfirst(lc_subrtinfo); + if (rti > next_rtinfo->rtoffset) + { + rtinfo = next_rtinfo; + lc_subrtinfo = lnext(plannedstmt->subrtinfos, lc_subrtinfo); + } + } /* NULL entries are possible; skip them */ if (rte == NULL) @@ -469,6 +530,28 @@ overexplain_range_table(PlannedStmt *plannedstmt, ExplainState *es) ExplainPropertyBool("In From Clause", rte->inFromCl, es); } + /* + * Indicate which subplan is the origin of which RTE. Note dummy + * subplans. Here again, we crunch more onto one line in text format. + */ + if (rtinfo != NULL) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + { + if (!rtinfo->dummy) + ExplainPropertyText("Subplan", rtinfo->plan_name, es); + else + ExplainPropertyText("Subplan", + psprintf("%s (dummy)", + rtinfo->plan_name), es); + } + else + { + ExplainPropertyText("Subplan", rtinfo->plan_name, es); + ExplainPropertyBool("Subplan Is Dummy", rtinfo->dummy, es); + } + } + /* rte->alias is optional; rte->eref is requested */ if (rte->alias != NULL) overexplain_alias("Alias", rte->alias, es); @@ -740,6 +823,54 @@ overexplain_bitmapset(const char *qlabel, Bitmapset *bms, ExplainState *es) pfree(buf.data); } +/* + * Emit a text property describing the contents of a list of bitmapsets. + * If a bitmapset contains exactly 1 member, we just print an integer; + * otherwise, we surround the list of members by parentheses. + * + * If there are no bitmapsets in the list, we print the word "none". 
+ */ +static void +overexplain_bitmapset_list(const char *qlabel, List *bms_list, + ExplainState *es) +{ + StringInfoData buf; + + initStringInfo(&buf); + + foreach_node(Bitmapset, bms, bms_list) + { + if (bms_membership(bms) == BMS_SINGLETON) + appendStringInfo(&buf, " %d", bms_singleton_member(bms)); + else + { + int x = -1; + bool first = true; + + appendStringInfoString(&buf, " ("); + while ((x = bms_next_member(bms, x)) >= 0) + { + if (first) + first = false; + else + appendStringInfoChar(&buf, ' '); + appendStringInfo(&buf, "%d", x); + } + appendStringInfoChar(&buf, ')'); + } + } + + if (buf.len == 0) + { + ExplainPropertyText(qlabel, "none", es); + return; + } + + Assert(buf.data[0] == ' '); + ExplainPropertyText(qlabel, buf.data + 1, es); + pfree(buf.data); +} + /* * Emit a text property describing the contents of a list of integers, OIDs, * or XIDs -- either a space-separated list of integer members, or the word diff --git a/contrib/pg_overexplain/sql/pg_overexplain.sql b/contrib/pg_overexplain/sql/pg_overexplain.sql index 42e275ac2f9..34a957cbed3 100644 --- a/contrib/pg_overexplain/sql/pg_overexplain.sql +++ b/contrib/pg_overexplain/sql/pg_overexplain.sql @@ -110,3 +110,13 @@ SELECT * FROM vegetables WHERE genus = 'daucus'; -- Also test a case that involves a write. 
EXPLAIN (RANGE_TABLE, COSTS OFF) INSERT INTO vegetables (name, genus) VALUES ('broccoflower', 'brassica'); + +-- should show "Subplan: sub" +EXPLAIN (RANGE_TABLE, COSTS OFF) +SELECT * FROM vegetables v, + (SELECT * FROM vegetables WHERE genus = 'daucus' OFFSET 0) sub; + +-- should show "Subplan: unnamed_subquery" +EXPLAIN (RANGE_TABLE, COSTS OFF) +SELECT * FROM vegetables v, + (SELECT * FROM vegetables WHERE genus = 'daucus' OFFSET 0); diff --git a/contrib/pg_plan_advice/.gitignore b/contrib/pg_plan_advice/.gitignore new file mode 100644 index 00000000000..19a14253019 --- /dev/null +++ b/contrib/pg_plan_advice/.gitignore @@ -0,0 +1,3 @@ +/pgpa_parser.h +/pgpa_parser.c +/pgpa_scanner.c diff --git a/contrib/pg_plan_advice/Makefile b/contrib/pg_plan_advice/Makefile new file mode 100644 index 00000000000..1d4c559aed8 --- /dev/null +++ b/contrib/pg_plan_advice/Makefile @@ -0,0 +1,50 @@ +# contrib/pg_plan_advice/Makefile + +MODULE_big = pg_plan_advice +OBJS = \ + $(WIN32RES) \ + pg_plan_advice.o \ + pgpa_ast.o \ + pgpa_collector.o \ + pgpa_identifier.o \ + pgpa_join.o \ + pgpa_output.o \ + pgpa_parser.o \ + pgpa_planner.o \ + pgpa_scan.o \ + pgpa_scanner.o \ + pgpa_trove.o \ + pgpa_walker.o + +EXTENSION = pg_plan_advice +DATA = pg_plan_advice--1.0.sql +PGFILEDESC = "pg_plan_advice - help the planner get the right plan" + +REGRESS = gather join_order join_strategy partitionwise scan +TAP_TESTS = 1 + +EXTRA_CLEAN = pgpa_parser.h pgpa_parser.c pgpa_scanner.c + +# required for 001_regress.pl +REGRESS_SHLIB=$(abs_top_builddir)/src/test/regress/regress$(DLSUFFIX) +export REGRESS_SHLIB + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/pg_plan_advice +top_builddir = ../.. 
+include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif + +# See notes in src/backend/parser/Makefile about the following two rules +pgpa_parser.h: pgpa_parser.c + touch $@ + +pgpa_parser.c: BISONFLAGS += -d + +# Force these dependencies to be known even without dependency info built: +pgpa_parser.o pgpa_scanner.o: pgpa_parser.h diff --git a/contrib/pg_plan_advice/README b/contrib/pg_plan_advice/README new file mode 100644 index 00000000000..0b888fd82f2 --- /dev/null +++ b/contrib/pg_plan_advice/README @@ -0,0 +1,260 @@ +contrib/pg_plan_advice/README + +Plan Advice +=========== + +This module implements a mini-language for "plan advice" that allows for +control of certain key planner decisions. Goals include (1) enforcing plan +stability (my previous plan was good and I would like to keep getting a +similar one) and (2) allowing users to experiment with plans other than +the one preferred by the optimizer. Non-goals include (1) controlling +every possible planner decision and (2) forcing consideration of plans +that the optimizer rejects for reasons other than cost. (There is some +room for bikeshedding about what exactly this non-goal means: what if +we skip path generation entirely for a certain case on the theory that +we know it cannot win on cost? Does that count as a cost-based rejection +even though no cost was ever computed?) + +Generally, plan advice is a series of whitespace-separated advice items, +each of which applies an advice tag to a list of advice targets. For +example, "SEQ_SCAN(foo) HASH_JOIN(bar@ss)" contains two items of advice, +the first of which applies the SEQ_SCAN tag to "foo" and the second of +which applies the HASH_JOIN tag to "bar@ss". In this simple example, each +target identifies a single relation; see "Relation Identifiers", below. 
+Advice tags can also be applied to groups of relations; for example, +"HASH_JOIN(baz (bletch quux))" applies the HASH_JOIN tag to the single +relation identifier "baz" as well as to the 2-item list containing +"bletch" and "quux". + +Critically, this module knows both how to generate plan advice from an +already-existing plan, and also how to enforce it during future planning +cycles. Everything it does is intended to be "round-trip safe": if you +generate advice from a plan and then feed that back into a future planning +cycle, each piece of advice should be guaranteed to apply to exactly the +same part of the query from which it was generated without ambiguity or +guesswork, and it should successfully enforce the same planning decision that +led to it being generated in the first place. Note that there is no +intention that these guarantees hold in the presence of intervening DDL; +e.g. if you change the properties of a function so that a subquery is no +longer inlined, or if you drop an index named in the plan advice, the advice +isn't going to work any more. That's expected. + +This module aims to force the planner to follow any provided advice without +regard to whether it appears to be good advice or bad advice. If the +user provides bad advice, whether derived from a previously-generated plan +or manually written, they may get a bad plan. We regard this as user error, +not a defect in this module. It seems likely that applying advice +judiciously and only when truly required to avoid problems will be a more +successful strategy than applying it with a broad brush, but users are free +to experiment with whatever strategies they think best. + +Relation Identifiers +==================== + +Uniquely identifying the part of a query to which a certain piece of +advice applies is harder than it sounds. Our basic approach is to use +relation aliases as a starting point, and then disambiguate. 
There are +three ways that the same relation alias can occur multiple times: + +1. It can appear in more than one subquery. + +2. It can appear more than once in the same subquery, + e.g. (foo JOIN bar) x JOIN foo. + +3. The table can be partitioned. + +Any combination of these things can occur simultaneously. Therefore, our +general syntax for a relation identifier is: + +alias_name#occurrence_number/partition_schema.partition_name@plan_name + +All components except for the alias_name are optional and included only +when required. When a component is omitted, the associated punctuation +must also be omitted. Occurrence numbers are counted ignoring children of +partitioned tables. When the generated occurrence number is 1, we omit +the occurrence number. The partition schema and partition name are included +only for children of partitioned tables. In generated advice, the +partition_schema is always included whenever there is a partition_name, +but user-written advice may mention the name and omit the schema. The +plan_name is omitted for the top-level PlannerInfo. + +Scan Advice +=========== + +For many types of scan, no advice is generated or possible; for instance, +a subquery is always scanned using a subquery scan. While that scan may be +elided via setrefs processing, this doesn't change the fact that only one +basic approach exists. Hence, scan advice applies mostly to relations, which +can be scanned in multiple ways. + +We tend to think of a scan as targeting a single relation, and that's +normally the case, but it doesn't have to be. For instance, if a join is +proven empty, the whole thing may be replaced with a single Result node +which, in effect, is a degenerate scan of every relation in the collapsed +portion of the join tree. Similarly, it's possible to inject a custom scan +in such a way that it replaces an entire join. If we ever emit advice +for these cases, it would target sets of relation identifiers surrounded +by parentheses, e.g. 
SOME_SORT_OF_SCAN(foo (bar baz)) would mean that +the given scan type would be used for foo as a single relation and also the +combination of bar and baz as a join product. We have no such cases at +present. + +For index and index-only scans, both the relation being scanned and the +index or indexes being used must be specified. For example, INDEX_SCAN(foo +foo_a_idx bar bar_b_idx) indicates that an index scan (not an index-only +scan) should be used on foo_a_idx when scanning foo, and that an index scan +should be used on bar_b_idx when scanning bar. + +Bitmap heap scans currently do not allow for an index specification: +BITMAP_HEAP_SCAN(foo bar) simply means that each of foo and bar should use +some sort of bitmap heap scan. + +Join Order Advice +================= + +The JOIN_ORDER tag specifies the order in which several tables that are +part of the same join problem should be joined. Each subquery (except for +those that are inlined) is a separate join problem. Within a subquery, +partitionwise joins can create additional, separate join problems. Hence, +queries involving partitionwise joins may use JOIN_ORDER() many times. + +We take the canonical join structure to be an outer-deep tree, so +JOIN_ORDER(t1 t2 t3) says that t1 is the driving table and should be joined +first to t2 and then to t3. If the join problem involves additional tables, +they can be joined in any order after the join between t1, t2, and t3 has +been constructed. Generated join advice always mentions all tables +in the join problem, but manually written join advice need not do so. + +For trees which are not outer-deep, parentheses can be used. For example, +JOIN_ORDER(t1 (t2 t3)) says that the top-level join should have t1 on the +outer side and a join between t2 and t3 on the inner side. That join should +be constructed so that t2 is on the outer side and t3 is on the inner side. + +In some cases, it's not possible to fully specify the join order in this way. 
+For example, if t2 and t3 are being scanned by a single custom scan or foreign +scan, or if a partitionwise join is being performed between those tables, then +it's impossible to say that t2 is the outer table and t3 is the inner table, +or the other way around; it's just undefined. In such cases, we generate +join advice that uses curly braces, intending to indicate a lack of ordering: +JOIN_ORDER(t1 {t2 t3}) says that the uppermost join should have t1 on the outer +side and some kind of join between t2 and t3 on the inner side, but without +saying how that join must be performed or anything about which relation should +appear on which side of the join, or even whether this kind of join has sides. + +Join Strategy Advice +==================== + +Tags such as NESTED_LOOP_PLAIN specify the method that should be used to +perform a certain join. More specifically, NESTED_LOOP_PLAIN(x (y z)) says +that the plan should put the relation whose identifier is "x" on the inner +side of a plain nested loop (one without materialization or memoization) +and that it should also put a join between the relation whose identifier is +"y" and the relation whose identifier is "z" on the inner side of a nested +loop. Hence, for an N-table join problem, there will be N-1 pieces of join +strategy advice; no join strategy advice is required for the outermost +table in the join problem. + +Considering that we have both join order advice and join strategy advice, +it might seem natural to say that NESTED_LOOP_PLAIN(x) should be redefined +to mean that x should appear by itself on one side or the other of a nested +loop, rather than specifically on the inner side, but this definition appears +useless in practice. It gives the planner too much freedom to do things that +bear little resemblance to what the user probably had in mind. 
This makes +only a limited amount of practical difference in the case of a merge join or +unparameterized nested loop, but for a parameterized nested loop or a hash +join, the two sides are treated very differently and saying that a certain +relation should be involved in one of those operations without saying which +role it should take isn't saying much. + +This choice of definition implies that join strategy advice also imposes some +join order constraints. For example, given a join between foo and bar, +HASH_JOIN(bar) implies that foo is the driving table. Otherwise, it would +be impossible to put bar beneath the inner side of a Hash Join. + +Note that, given this definition, it's reasonable to consider deleting the +join order advice but applying the join strategy advice. For example, +consider a star schema with tables fact, dim1, dim2, dim3, dim4, and dim5. +The automatically generated advice might specify JOIN_ORDER(fact dim1 dim3 +dim4 dim2 dim5) HASH_JOIN(dim2 dim4) NESTED_LOOP_PLAIN(dim1 dim3 dim5). +Deleting the JOIN_ORDER advice allows the planner to reorder the joins +however it likes while still forcing the same choice of join method. This +seems potentially useful, and is one reason why a unified syntax that controls +both join order and join method in a single locution was not chosen. + +Advice Completeness +=================== + +An essential guiding principle is that no inference may be made on the basis +of the absence of advice. The user is entitled to remove any portion of the +generated advice which they deem unsuitable or counterproductive and the +result should only be to increase the flexibility afforded to the planner. +This means that if advice can say that a certain optimization or technique +should be used, it should also be able to say that the optimization or +technique should not be used. We should never assume that the absence of an +instruction to do a certain thing means that it should not be done; all +instructions must be explicit. 
+ +Semijoin Uniqueness +=================== + +Faced with a semijoin, the planner considers both a direct implementation +and a plan where the one side is made unique and then an inner join is +performed. We emit SEMIJOIN_UNIQUE() advice when this transformation occurs +and SEMIJOIN_NON_UNIQUE() advice when it doesn't. These items work like +join strategy advice: the inner side of the relevant join is named, and the +chosen join order must be compatible with the advice having some effect. + +Partitionwise +============= + +PARTITIONWISE() advice can be used to specify both those partitionwise joins +which should be performed and those which should not be performed; the idea +is that each argument to PARTITIONWISE specifies a set of relations that +should be scanned partitionwise after being joined to each other and nothing +else. Hence, for example, PARTITIONWISE((t1 t2) t3) specifies that the +query should contain a partitionwise join between t1 and t2 and that t3 +should not be part of any partitionwise join. If there are no other rels +in the query, specifying just PARTITIONWISE((t1 t2)) would have the same +effect, since there would be no other rels to which t3 could be joined in +a partitionwise fashion. + +Parallel Query (Gather, etc.) +============================= + +Each argument to GATHER() or GATHER_MERGE() is a single relation or an +exact set of relations on top of which a Gather or Gather Merge node, +respectively, should be placed. Each argument to NO_GATHER() is a single +relation that should not appear beneath any Gather or Gather Merge node; +that is, parallelism should not be used. + +Implicit Join Order Constraints +=============================== + +When JOIN_ORDER() advice is not provided for a particular join problem, +other pieces of advice may still incidentally constrain the join order. 
+For example, a user who specifies HASH_JOIN((foo bar)) is explicitly saying +that there should be a hash join with exactly foo and bar on the inner +side of it, but that also implies that foo and bar must be joined to +each other before either of them is joined to anything else. Otherwise, +the join the user is attempting to constrain won't actually occur in the +query, which ends up looking like the system has just decided to ignore +the advice altogether. + +Future Work +=========== + +We don't handle choice of aggregation: it would be nice to be able to force +sorted or grouped aggregation. I'm guessing this can be left to future work. + +More seriously, we don't know anything about eager aggregation, which could +have a large impact on the shape of the plan tree. XXX: This needs some study +to determine how large a problem it is, and might need to be fixed sooner +rather than later. + +We don't offer any control over estimates, only outcomes. It seems like a +good idea to incorporate that ability at some future point, as pg_hint_plan +does. However, since the primary goal of the initial development work is to be +able to induce the planner to recreate a desired plan that worked well in +the past, this has not been included in the initial development effort. 
+ +XXX Need to investigate whether and how well supplying advice works with GEQO diff --git a/contrib/pg_plan_advice/expected/gather.out b/contrib/pg_plan_advice/expected/gather.out new file mode 100644 index 00000000000..0cc0dedf859 --- /dev/null +++ b/contrib/pg_plan_advice/expected/gather.out @@ -0,0 +1,371 @@ +LOAD 'pg_plan_advice'; +SET max_parallel_workers_per_gather = 1; +SET parallel_setup_cost = 0; +SET parallel_tuple_cost = 0; +SET min_parallel_table_scan_size = 0; +SET debug_parallel_query = off; +CREATE TABLE gt_dim (id serial primary key, dim text) + WITH (autovacuum_enabled = false); +INSERT INTO gt_dim (dim) SELECT random()::text FROM generate_series(1,100) g; +VACUUM ANALYZE gt_dim; +CREATE TABLE gt_fact ( + id int not null, + dim_id integer not null references gt_dim (id) +) WITH (autovacuum_enabled = false); +INSERT INTO gt_fact + SELECT g, (g%3)+1 FROM generate_series(1,100000) g; +VACUUM ANALYZE gt_fact; +-- By default, we expect Gather Merge with a parallel hash join. +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; + QUERY PLAN +------------------------------------------------------- + Gather Merge + Workers Planned: 1 + -> Sort + Sort Key: f.dim_id + -> Parallel Hash Join + Hash Cond: (f.dim_id = d.id) + -> Parallel Seq Scan on gt_fact f + -> Parallel Hash + -> Parallel Seq Scan on gt_dim d + Generated Plan Advice: + JOIN_ORDER(f d) + HASH_JOIN(d) + SEQ_SCAN(f d) + GATHER_MERGE((f d)) +(14 rows) + +-- Force Gather or Gather Merge of both relations together. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'gather_merge((f d))'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; + QUERY PLAN +------------------------------------------------------- + Gather Merge + Workers Planned: 1 + -> Sort + Sort Key: f.dim_id + -> Parallel Hash Join + Hash Cond: (f.dim_id = d.id) + -> Parallel Seq Scan on gt_fact f + -> Parallel Hash + -> Parallel Seq Scan on gt_dim d + Supplied Plan Advice: + GATHER_MERGE((f d)) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d) + HASH_JOIN(d) + SEQ_SCAN(f d) + GATHER_MERGE((f d)) +(16 rows) + +SET LOCAL pg_plan_advice.advice = 'gather((f d))'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; + QUERY PLAN +------------------------------------------------------- + Sort + Sort Key: f.dim_id + -> Gather + Workers Planned: 1 + -> Parallel Hash Join + Hash Cond: (f.dim_id = d.id) + -> Parallel Seq Scan on gt_fact f + -> Parallel Hash + -> Parallel Seq Scan on gt_dim d + Supplied Plan Advice: + GATHER((f d)) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d) + HASH_JOIN(d) + SEQ_SCAN(f d) + GATHER((f d)) +(16 rows) + +COMMIT; +-- Force a separate Gather or Gather Merge operation for each relation. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'gather_merge(f d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; + QUERY PLAN +-------------------------------------------------- + Merge Join + Merge Cond: (f.dim_id = d.id) + -> Gather Merge + Workers Planned: 1 + -> Sort + Sort Key: f.dim_id + -> Parallel Seq Scan on gt_fact f + -> Gather Merge + Workers Planned: 1 + -> Sort + Sort Key: d.id + -> Parallel Seq Scan on gt_dim d + Supplied Plan Advice: + GATHER_MERGE(f) /* matched */ + GATHER_MERGE(d) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d) + MERGE_JOIN_PLAIN(d) + SEQ_SCAN(f d) + GATHER_MERGE(f d) +(20 rows) + +SET LOCAL pg_plan_advice.advice = 'gather(f d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; + QUERY PLAN +-------------------------------------------------- + Merge Join + Merge Cond: (f.dim_id = d.id) + -> Sort + Sort Key: f.dim_id + -> Gather + Workers Planned: 1 + -> Parallel Seq Scan on gt_fact f + -> Sort + Sort Key: d.id + -> Gather + Workers Planned: 1 + -> Parallel Seq Scan on gt_dim d + Supplied Plan Advice: + GATHER(f) /* matched */ + GATHER(d) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d) + MERGE_JOIN_PLAIN(d) + SEQ_SCAN(f d) + GATHER(f d) +(20 rows) + +SET LOCAL pg_plan_advice.advice = 'gather((d d/d.d))'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; + QUERY PLAN +-------------------------------------------------- + Merge Join + Merge Cond: (f.dim_id = d.id) + -> Gather Merge + Workers Planned: 1 + -> Sort + Sort Key: f.dim_id + -> Parallel Seq Scan on gt_fact f + -> Index Scan using gt_dim_pkey on gt_dim d + Supplied Plan Advice: + GATHER((d d/d.d)) /* partially matched */ + Generated Plan Advice: + JOIN_ORDER(f d) + MERGE_JOIN_PLAIN(d) + SEQ_SCAN(f) + INDEX_SCAN(d public.gt_dim_pkey) + GATHER_MERGE(f) + NO_GATHER(d) +(17 rows) + 
+COMMIT; +-- Force a Gather or Gather Merge on one relation but no parallelism on other. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'gather_merge(f) no_gather(d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; + QUERY PLAN +-------------------------------------------------- + Merge Join + Merge Cond: (f.dim_id = d.id) + -> Gather Merge + Workers Planned: 1 + -> Sort + Sort Key: f.dim_id + -> Parallel Seq Scan on gt_fact f + -> Index Scan using gt_dim_pkey on gt_dim d + Supplied Plan Advice: + GATHER_MERGE(f) /* matched */ + NO_GATHER(d) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d) + MERGE_JOIN_PLAIN(d) + SEQ_SCAN(f) + INDEX_SCAN(d public.gt_dim_pkey) + GATHER_MERGE(f) + NO_GATHER(d) +(18 rows) + +SET LOCAL pg_plan_advice.advice = 'gather_merge(d) no_gather(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; + QUERY PLAN +------------------------------------------------- + Merge Join + Merge Cond: (f.dim_id = d.id) + -> Sort + Sort Key: f.dim_id + -> Seq Scan on gt_fact f + -> Gather Merge + Workers Planned: 1 + -> Sort + Sort Key: d.id + -> Parallel Seq Scan on gt_dim d + Supplied Plan Advice: + GATHER_MERGE(d) /* matched */ + NO_GATHER(f) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d) + MERGE_JOIN_PLAIN(d) + SEQ_SCAN(f d) + GATHER_MERGE(d) + NO_GATHER(f) +(19 rows) + +SET LOCAL pg_plan_advice.advice = 'gather(f) no_gather(d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; + QUERY PLAN +-------------------------------------------------- + Merge Join + Merge Cond: (d.id = f.dim_id) + -> Index Scan using gt_dim_pkey on gt_dim d + -> Sort + Sort Key: f.dim_id + -> Gather + Workers Planned: 1 + -> Parallel Seq Scan on gt_fact f + Supplied Plan Advice: + GATHER(f) /* matched */ + NO_GATHER(d) /* matched */ + Generated Plan Advice: + JOIN_ORDER(d f) + 
MERGE_JOIN_PLAIN(f) + SEQ_SCAN(f) + INDEX_SCAN(d public.gt_dim_pkey) + GATHER(f) + NO_GATHER(d) +(18 rows) + +SET LOCAL pg_plan_advice.advice = 'gather(d) no_gather(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; + QUERY PLAN +------------------------------------------------- + Merge Join + Merge Cond: (f.dim_id = d.id) + -> Sort + Sort Key: f.dim_id + -> Seq Scan on gt_fact f + -> Sort + Sort Key: d.id + -> Gather + Workers Planned: 1 + -> Parallel Seq Scan on gt_dim d + Supplied Plan Advice: + GATHER(d) /* matched */ + NO_GATHER(f) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d) + MERGE_JOIN_PLAIN(d) + SEQ_SCAN(f d) + GATHER(d) + NO_GATHER(f) +(19 rows) + +COMMIT; +-- Force no Gather or Gather Merge use at all. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'no_gather(f d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; + QUERY PLAN +------------------------------------------------ + Merge Join + Merge Cond: (d.id = f.dim_id) + -> Index Scan using gt_dim_pkey on gt_dim d + -> Sort + Sort Key: f.dim_id + -> Seq Scan on gt_fact f + Supplied Plan Advice: + NO_GATHER(f) /* matched */ + NO_GATHER(d) /* matched */ + Generated Plan Advice: + JOIN_ORDER(d f) + MERGE_JOIN_PLAIN(f) + SEQ_SCAN(f) + INDEX_SCAN(d public.gt_dim_pkey) + NO_GATHER(f d) +(15 rows) + +COMMIT; +-- Can't force Gather Merge without the ORDER BY clause, but just Gather is OK. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'gather_merge((f d))'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id; + QUERY PLAN +------------------------------------------------- + Gather + Disabled: true + Workers Planned: 1 + -> Parallel Hash Join + Hash Cond: (f.dim_id = d.id) + -> Parallel Seq Scan on gt_fact f + -> Parallel Hash + -> Parallel Seq Scan on gt_dim d + Supplied Plan Advice: + GATHER_MERGE((f d)) /* matched, failed */ + Generated Plan Advice: + JOIN_ORDER(f d) + HASH_JOIN(d) + SEQ_SCAN(f d) + GATHER((f d)) +(15 rows) + +SET LOCAL pg_plan_advice.advice = 'gather((f d))'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id; + QUERY PLAN +------------------------------------------------- + Gather + Workers Planned: 1 + -> Parallel Hash Join + Hash Cond: (f.dim_id = d.id) + -> Parallel Seq Scan on gt_fact f + -> Parallel Hash + -> Parallel Seq Scan on gt_dim d + Supplied Plan Advice: + GATHER((f d)) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d) + HASH_JOIN(d) + SEQ_SCAN(f d) + GATHER((f d)) +(14 rows) + +COMMIT; +-- Test conflicting advice. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'gather((f d)) no_gather(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; + QUERY PLAN +------------------------------------------------------- + Gather Merge + Workers Planned: 1 + -> Sort + Sort Key: f.dim_id + -> Parallel Hash Join + Hash Cond: (f.dim_id = d.id) + -> Parallel Seq Scan on gt_fact f + -> Parallel Hash + -> Parallel Seq Scan on gt_dim d + Supplied Plan Advice: + GATHER((f d)) /* matched, conflicting, failed */ + NO_GATHER(f) /* matched, conflicting, failed */ + Generated Plan Advice: + JOIN_ORDER(f d) + HASH_JOIN(d) + SEQ_SCAN(f d) + GATHER_MERGE((f d)) +(17 rows) + +COMMIT; diff --git a/contrib/pg_plan_advice/expected/join_order.out b/contrib/pg_plan_advice/expected/join_order.out new file mode 100644 index 00000000000..db0dcef7012 --- /dev/null +++ b/contrib/pg_plan_advice/expected/join_order.out @@ -0,0 +1,509 @@ +LOAD 'pg_plan_advice'; +SET max_parallel_workers_per_gather = 0; +CREATE TABLE jo_dim1 (id integer primary key, dim1 text, val1 int) + WITH (autovacuum_enabled = false); +INSERT INTO jo_dim1 (id, dim1, val1) + SELECT g, 'some filler text ' || g, (g % 3) + 1 + FROM generate_series(1,100) g; +VACUUM ANALYZE jo_dim1; +CREATE TABLE jo_dim2 (id integer primary key, dim2 text, val2 int) + WITH (autovacuum_enabled = false); +INSERT INTO jo_dim2 (id, dim2, val2) + SELECT g, 'some filler text ' || g, (g % 7) + 1 + FROM generate_series(1,1000) g; +VACUUM ANALYZE jo_dim2; +CREATE TABLE jo_fact ( + id int primary key, + dim1_id integer not null references jo_dim1 (id), + dim2_id integer not null references jo_dim2 (id) +) WITH (autovacuum_enabled = false); +INSERT INTO jo_fact + SELECT g, (g%100)+1, (g%100)+1 FROM generate_series(1,100000) g; +VACUUM ANALYZE jo_fact; +-- We expect to join to d2 first and then d1, since the condition on d2 +-- is more selective. 
+EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; + QUERY PLAN +------------------------------------------ + Hash Join + Hash Cond: (f.dim1_id = d1.id) + -> Hash Join + Hash Cond: (f.dim2_id = d2.id) + -> Seq Scan on jo_fact f + -> Hash + -> Seq Scan on jo_dim2 d2 + Filter: (val2 = 1) + -> Hash + -> Seq Scan on jo_dim1 d1 + Filter: (val1 = 1) + Generated Plan Advice: + JOIN_ORDER(f d2 d1) + HASH_JOIN(d2 d1) + SEQ_SCAN(f d2 d1) + NO_GATHER(f d1 d2) +(16 rows) + +-- Force a few different join orders. Some of these are very inefficient, +-- but the planner considers them all viable. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'join_order(f d1 d2)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; + QUERY PLAN +------------------------------------------ + Hash Join + Hash Cond: (f.dim2_id = d2.id) + -> Hash Join + Hash Cond: (f.dim1_id = d1.id) + -> Seq Scan on jo_fact f + -> Hash + -> Seq Scan on jo_dim1 d1 + Filter: (val1 = 1) + -> Hash + -> Seq Scan on jo_dim2 d2 + Filter: (val2 = 1) + Supplied Plan Advice: + JOIN_ORDER(f d1 d2) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d1 d2) + HASH_JOIN(d1 d2) + SEQ_SCAN(f d1 d2) + NO_GATHER(f d1 d2) +(18 rows) + +SET LOCAL pg_plan_advice.advice = 'join_order(f d2 d1)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; + QUERY PLAN +------------------------------------------ + Hash Join + Hash Cond: (f.dim1_id = d1.id) + -> Hash Join + Hash Cond: (f.dim2_id = d2.id) + -> Seq Scan on jo_fact f + -> Hash + -> Seq Scan on jo_dim2 d2 + Filter: (val2 = 1) + -> Hash + -> Seq Scan on jo_dim1 d1 + Filter: (val1 = 1) + Supplied Plan Advice: + 
JOIN_ORDER(f d2 d1) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d2 d1) + HASH_JOIN(d2 d1) + SEQ_SCAN(f d2 d1) + NO_GATHER(f d1 d2) +(18 rows) + +SET LOCAL pg_plan_advice.advice = 'join_order(d1 f d2)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; + QUERY PLAN +----------------------------------------- + Hash Join + Hash Cond: (f.dim2_id = d2.id) + -> Hash Join + Hash Cond: (d1.id = f.dim1_id) + -> Seq Scan on jo_dim1 d1 + Filter: (val1 = 1) + -> Hash + -> Seq Scan on jo_fact f + -> Hash + -> Seq Scan on jo_dim2 d2 + Filter: (val2 = 1) + Supplied Plan Advice: + JOIN_ORDER(d1 f d2) /* matched */ + Generated Plan Advice: + JOIN_ORDER(d1 f d2) + HASH_JOIN(f d2) + SEQ_SCAN(d1 f d2) + NO_GATHER(f d1 d2) +(18 rows) + +SET LOCAL pg_plan_advice.advice = 'join_order(f (d1 d2))'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; + QUERY PLAN +------------------------------------------------------------- + Merge Join + Merge Cond: ((f.dim2_id = d2.id) AND (f.dim1_id = d1.id)) + -> Sort + Sort Key: f.dim2_id, f.dim1_id + -> Seq Scan on jo_fact f + -> Sort + Sort Key: d2.id, d1.id + -> Nested Loop + -> Seq Scan on jo_dim1 d1 + Filter: (val1 = 1) + -> Materialize + -> Seq Scan on jo_dim2 d2 + Filter: (val2 = 1) + Supplied Plan Advice: + JOIN_ORDER(f (d1 d2)) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f (d1 d2)) + MERGE_JOIN_PLAIN((d1 d2)) + NESTED_LOOP_MATERIALIZE(d2) + SEQ_SCAN(f d1 d2) + NO_GATHER(f d1 d2) +(21 rows) + +SET LOCAL pg_plan_advice.advice = 'join_order(f {d1 d2})'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; + QUERY PLAN 
+------------------------------------------------------------- + Merge Join + Merge Cond: ((f.dim2_id = d2.id) AND (f.dim1_id = d1.id)) + -> Sort + Sort Key: f.dim2_id, f.dim1_id + -> Seq Scan on jo_fact f + -> Sort + Sort Key: d2.id, d1.id + -> Nested Loop + -> Seq Scan on jo_dim2 d2 + Filter: (val2 = 1) + -> Materialize + -> Seq Scan on jo_dim1 d1 + Filter: (val1 = 1) + Supplied Plan Advice: + JOIN_ORDER(f {d1 d2}) /* matched, failed */ + Generated Plan Advice: + JOIN_ORDER(f (d2 d1)) + MERGE_JOIN_PLAIN((d1 d2)) + NESTED_LOOP_MATERIALIZE(d1) + SEQ_SCAN(f d2 d1) + NO_GATHER(f d1 d2) +(21 rows) + +COMMIT; +-- Force a join order by mentioning just a prefix of the join list. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'join_order(d2)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; + QUERY PLAN +------------------------------------------------ + Hash Join + Hash Cond: (d2.id = f.dim2_id) + -> Seq Scan on jo_dim2 d2 + Filter: (val2 = 1) + -> Hash + -> Hash Join + Hash Cond: (f.dim1_id = d1.id) + -> Seq Scan on jo_fact f + -> Hash + -> Seq Scan on jo_dim1 d1 + Filter: (val1 = 1) + Supplied Plan Advice: + JOIN_ORDER(d2) /* matched */ + Generated Plan Advice: + JOIN_ORDER(d2 (f d1)) + HASH_JOIN(d1 (f d1)) + SEQ_SCAN(d2 f d1) + NO_GATHER(f d1 d2) +(18 rows) + +SET LOCAL pg_plan_advice.advice = 'join_order(d2 d1)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; + QUERY PLAN +------------------------------------------------------------- + Merge Join + Merge Cond: ((d2.id = f.dim2_id) AND (d1.id = f.dim1_id)) + -> Sort + Sort Key: d2.id, d1.id + -> Nested Loop + -> Seq Scan on jo_dim2 d2 + Filter: (val2 = 1) + -> Materialize + -> Seq Scan on jo_dim1 d1 + Filter: (val1 = 1) + -> Sort + Sort Key: f.dim2_id, f.dim1_id + 
-> Seq Scan on jo_fact f + Supplied Plan Advice: + JOIN_ORDER(d2 d1) /* matched */ + Generated Plan Advice: + JOIN_ORDER(d2 d1 f) + MERGE_JOIN_PLAIN(f) + NESTED_LOOP_MATERIALIZE(d1) + SEQ_SCAN(d2 d1 f) + NO_GATHER(f d1 d2) +(21 rows) + +COMMIT; +-- jo_fact is not partitioned, but let's try pretending that it is and +-- verifying that the advice does not apply. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'join_order(f/d1 d1 d2)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; + QUERY PLAN +------------------------------------------------------------- + Nested Loop + Disabled: true + -> Nested Loop + Disabled: true + -> Seq Scan on jo_fact f + -> Index Scan using jo_dim1_pkey on jo_dim1 d1 + Index Cond: (id = f.dim1_id) + Filter: (val1 = 1) + -> Index Scan using jo_dim2_pkey on jo_dim2 d2 + Index Cond: (id = f.dim2_id) + Filter: (val2 = 1) + Supplied Plan Advice: + JOIN_ORDER(f/d1 d1 d2) /* partially matched */ + Generated Plan Advice: + JOIN_ORDER(f d1 d2) + NESTED_LOOP_PLAIN(d1 d2) + SEQ_SCAN(f) + INDEX_SCAN(d1 public.jo_dim1_pkey d2 public.jo_dim2_pkey) + NO_GATHER(f d1 d2) +(19 rows) + +SET LOCAL pg_plan_advice.advice = 'join_order(f/d1 (d1 d2))'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; + QUERY PLAN +-------------------------------------------------------------- + Nested Loop + Disabled: true + Join Filter: ((d1.id = f.dim1_id) AND (d2.id = f.dim2_id)) + -> Nested Loop + -> Seq Scan on jo_dim1 d1 + Filter: (val1 = 1) + -> Materialize + -> Seq Scan on jo_dim2 d2 + Filter: (val2 = 1) + -> Seq Scan on jo_fact f + Supplied Plan Advice: + JOIN_ORDER(f/d1 (d1 d2)) /* partially matched */ + Generated Plan Advice: + JOIN_ORDER(d1 d2 f) + NESTED_LOOP_PLAIN(f) + NESTED_LOOP_MATERIALIZE(d2) + SEQ_SCAN(d1 d2 f) 
+ NO_GATHER(f d1 d2) +(18 rows) + +COMMIT; +-- The unusual formulation of this query is intended to prevent the query +-- planner from reducing the FULL JOIN to some other join type, so that we +-- can test what happens with a join type that cannot be reordered. +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_dim1 d1 + INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0) + ON d1.id = f.dim1_id OR f.dim1_id IS NULL; + QUERY PLAN +------------------------------------------------------------- + Nested Loop + Join Filter: ((d1.id = f.dim1_id) OR (f.dim1_id IS NULL)) + -> Merge Full Join + Merge Cond: (((d2.id + 0)) = ((f.dim2_id + 0))) + -> Sort + Sort Key: ((d2.id + 0)) + -> Seq Scan on jo_dim2 d2 + -> Sort + Sort Key: ((f.dim2_id + 0)) + -> Seq Scan on jo_fact f + -> Materialize + -> Seq Scan on jo_dim1 d1 + Generated Plan Advice: + JOIN_ORDER(d2 f d1) + MERGE_JOIN_PLAIN(f) + NESTED_LOOP_MATERIALIZE(d1) + SEQ_SCAN(d2 f d1) + NO_GATHER(d1 f d2) +(18 rows) + +-- We should not be able to force the planner to join f to d1 first, because +-- that is not a valid join order, but we should be able to force the planner +-- to make either d2 or f the driving table. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'join_order(f d1 d2)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_dim1 d1 + INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0) + ON d1.id = f.dim1_id OR f.dim1_id IS NULL; + QUERY PLAN +------------------------------------------------------------- + Nested Loop + Disabled: true + Join Filter: ((d1.id = f.dim1_id) OR (f.dim1_id IS NULL)) + -> Merge Full Join + Disabled: true + Merge Cond: (((d2.id + 0)) = ((f.dim2_id + 0))) + -> Sort + Sort Key: ((d2.id + 0)) + -> Seq Scan on jo_dim2 d2 + -> Sort + Sort Key: ((f.dim2_id + 0)) + -> Seq Scan on jo_fact f + -> Seq Scan on jo_dim1 d1 + Supplied Plan Advice: + JOIN_ORDER(f d1 d2) /* partially matched */ + Generated Plan Advice: + JOIN_ORDER(d2 f d1) + MERGE_JOIN_PLAIN(f) + NESTED_LOOP_PLAIN(d1) + SEQ_SCAN(d2 f d1) + NO_GATHER(d1 f d2) +(21 rows) + +SET LOCAL pg_plan_advice.advice = 'join_order(f d2 d1)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_dim1 d1 + INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0) + ON d1.id = f.dim1_id OR f.dim1_id IS NULL; + QUERY PLAN +------------------------------------------------------------- + Nested Loop + Join Filter: ((d1.id = f.dim1_id) OR (f.dim1_id IS NULL)) + -> Merge Full Join + Merge Cond: (((f.dim2_id + 0)) = ((d2.id + 0))) + -> Sort + Sort Key: ((f.dim2_id + 0)) + -> Seq Scan on jo_fact f + -> Sort + Sort Key: ((d2.id + 0)) + -> Seq Scan on jo_dim2 d2 + -> Materialize + -> Seq Scan on jo_dim1 d1 + Supplied Plan Advice: + JOIN_ORDER(f d2 d1) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d2 d1) + MERGE_JOIN_PLAIN(d2) + NESTED_LOOP_MATERIALIZE(d1) + SEQ_SCAN(f d2 d1) + NO_GATHER(d1 f d2) +(20 rows) + +SET LOCAL pg_plan_advice.advice = 'join_order(d2 f d1)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_dim1 d1 + INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0) + ON d1.id = f.dim1_id OR f.dim1_id IS NULL; + QUERY PLAN 
+------------------------------------------------------------- + Nested Loop + Join Filter: ((d1.id = f.dim1_id) OR (f.dim1_id IS NULL)) + -> Merge Full Join + Merge Cond: (((d2.id + 0)) = ((f.dim2_id + 0))) + -> Sort + Sort Key: ((d2.id + 0)) + -> Seq Scan on jo_dim2 d2 + -> Sort + Sort Key: ((f.dim2_id + 0)) + -> Seq Scan on jo_fact f + -> Materialize + -> Seq Scan on jo_dim1 d1 + Supplied Plan Advice: + JOIN_ORDER(d2 f d1) /* matched */ + Generated Plan Advice: + JOIN_ORDER(d2 f d1) + MERGE_JOIN_PLAIN(f) + NESTED_LOOP_MATERIALIZE(d1) + SEQ_SCAN(d2 f d1) + NO_GATHER(d1 f d2) +(20 rows) + +COMMIT; +-- Two incompatible join orders should conflict. In the second case, +-- the conflict is implicit: if d1 is on the inner side of a join of any +-- type, it cannot also be the driving table. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'join_order(f) join_order(d1)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_dim1 d1 + INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0) + ON d1.id = f.dim1_id OR f.dim1_id IS NULL; + QUERY PLAN +------------------------------------------------------------- + Nested Loop + Join Filter: ((d1.id = f.dim1_id) OR (f.dim1_id IS NULL)) + -> Merge Full Join + Merge Cond: (((f.dim2_id + 0)) = ((d2.id + 0))) + -> Sort + Sort Key: ((f.dim2_id + 0)) + -> Seq Scan on jo_fact f + -> Sort + Sort Key: ((d2.id + 0)) + -> Seq Scan on jo_dim2 d2 + -> Materialize + -> Seq Scan on jo_dim1 d1 + Supplied Plan Advice: + JOIN_ORDER(f) /* matched, conflicting */ + JOIN_ORDER(d1) /* matched, conflicting, failed */ + Generated Plan Advice: + JOIN_ORDER(f d2 d1) + MERGE_JOIN_PLAIN(d2) + NESTED_LOOP_MATERIALIZE(d1) + SEQ_SCAN(f d2 d1) + NO_GATHER(d1 f d2) +(21 rows) + +SET LOCAL pg_plan_advice.advice = 'join_order(d1) hash_join(d1)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_dim1 d1 + INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0) + ON d1.id = f.dim1_id OR f.dim1_id IS NULL; + QUERY PLAN 
+--------------------------------------------------------------- + Nested Loop + Join Filter: ((d1.id = f.dim1_id) OR (f.dim1_id IS NULL)) + -> Seq Scan on jo_dim1 d1 + -> Materialize + -> Merge Full Join + Merge Cond: (((d2.id + 0)) = ((f.dim2_id + 0))) + -> Sort + Sort Key: ((d2.id + 0)) + -> Seq Scan on jo_dim2 d2 + -> Sort + Sort Key: ((f.dim2_id + 0)) + -> Seq Scan on jo_fact f + Supplied Plan Advice: + JOIN_ORDER(d1) /* matched, conflicting */ + HASH_JOIN(d1) /* matched, conflicting, failed */ + Generated Plan Advice: + JOIN_ORDER(d1 (d2 f)) + MERGE_JOIN_PLAIN(f) + NESTED_LOOP_MATERIALIZE((f d2)) + SEQ_SCAN(d1 d2 f) + NO_GATHER(d1 f d2) +(21 rows) + +COMMIT; diff --git a/contrib/pg_plan_advice/expected/join_strategy.out b/contrib/pg_plan_advice/expected/join_strategy.out new file mode 100644 index 00000000000..0f9db692190 --- /dev/null +++ b/contrib/pg_plan_advice/expected/join_strategy.out @@ -0,0 +1,339 @@ +LOAD 'pg_plan_advice'; +SET max_parallel_workers_per_gather = 0; +CREATE TABLE join_dim (id serial primary key, dim text) + WITH (autovacuum_enabled = false); +INSERT INTO join_dim (dim) SELECT random()::text FROM generate_series(1,100) g; +VACUUM ANALYZE join_dim; +CREATE TABLE join_fact ( + id int primary key, + dim_id integer not null references join_dim (id) +) WITH (autovacuum_enabled = false); +INSERT INTO join_fact + SELECT g, (g%3)+1 FROM generate_series(1,100000) g; +CREATE INDEX join_fact_dim_id ON join_fact (dim_id); +VACUUM ANALYZE join_fact; +-- We expect a hash join by default. +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +------------------------------------ + Hash Join + Hash Cond: (f.dim_id = d.id) + -> Seq Scan on join_fact f + -> Hash + -> Seq Scan on join_dim d + Generated Plan Advice: + JOIN_ORDER(f d) + HASH_JOIN(d) + SEQ_SCAN(f d) + NO_GATHER(f d) +(10 rows) + +-- Try forcing each join method in turn with join_dim as the inner table. 
+-- All of these should work except for MERGE_JOIN_MATERIALIZE; that will +-- fail, because the planner knows that join_dim (id) is unique, and will +-- refuse to add mark/restore overhead. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'HASH_JOIN(d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +------------------------------------ + Hash Join + Hash Cond: (f.dim_id = d.id) + -> Seq Scan on join_fact f + -> Hash + -> Seq Scan on join_dim d + Supplied Plan Advice: + HASH_JOIN(d) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d) + HASH_JOIN(d) + SEQ_SCAN(f d) + NO_GATHER(f d) +(12 rows) + +SET LOCAL pg_plan_advice.advice = 'MERGE_JOIN_MATERIALIZE(d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +---------------------------------------------------------------- + Merge Join + Disabled: true + Merge Cond: (f.dim_id = d.id) + -> Index Scan using join_fact_dim_id on join_fact f + -> Index Scan using join_dim_pkey on join_dim d + Supplied Plan Advice: + MERGE_JOIN_MATERIALIZE(d) /* matched, failed */ + Generated Plan Advice: + JOIN_ORDER(f d) + MERGE_JOIN_PLAIN(d) + INDEX_SCAN(f public.join_fact_dim_id d public.join_dim_pkey) + NO_GATHER(f d) +(12 rows) + +SET LOCAL pg_plan_advice.advice = 'MERGE_JOIN_PLAIN(d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +---------------------------------------------------------------- + Merge Join + Merge Cond: (f.dim_id = d.id) + -> Index Scan using join_fact_dim_id on join_fact f + -> Index Scan using join_dim_pkey on join_dim d + Supplied Plan Advice: + MERGE_JOIN_PLAIN(d) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d) + MERGE_JOIN_PLAIN(d) + INDEX_SCAN(f public.join_fact_dim_id d public.join_dim_pkey) + NO_GATHER(f d) +(11 rows) + +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_MATERIALIZE(d)'; +EXPLAIN (COSTS OFF, 
PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +-------------------------------------------- + Nested Loop + Join Filter: (f.dim_id = d.id) + -> Seq Scan on join_fact f + -> Materialize + -> Seq Scan on join_dim d + Supplied Plan Advice: + NESTED_LOOP_MATERIALIZE(d) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d) + NESTED_LOOP_MATERIALIZE(d) + SEQ_SCAN(f d) + NO_GATHER(f d) +(12 rows) + +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_MEMOIZE(d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +---------------------------------------------------------- + Nested Loop + -> Seq Scan on join_fact f + -> Memoize + Cache Key: f.dim_id + Cache Mode: logical + -> Index Scan using join_dim_pkey on join_dim d + Index Cond: (id = f.dim_id) + Supplied Plan Advice: + NESTED_LOOP_MEMOIZE(d) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d) + NESTED_LOOP_MEMOIZE(d) + SEQ_SCAN(f) + INDEX_SCAN(d public.join_dim_pkey) + NO_GATHER(f d) +(15 rows) + +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_PLAIN(d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +---------------------------------------------------- + Nested Loop + -> Seq Scan on join_fact f + -> Index Scan using join_dim_pkey on join_dim d + Index Cond: (id = f.dim_id) + Supplied Plan Advice: + NESTED_LOOP_PLAIN(d) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d) + NESTED_LOOP_PLAIN(d) + SEQ_SCAN(f) + INDEX_SCAN(d public.join_dim_pkey) + NO_GATHER(f d) +(12 rows) + +COMMIT; +-- Now try forcing each join method in turn with join_fact as the inner +-- table. All of these should work. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'HASH_JOIN(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +------------------------------------- + Hash Join + Hash Cond: (d.id = f.dim_id) + -> Seq Scan on join_dim d + -> Hash + -> Seq Scan on join_fact f + Supplied Plan Advice: + HASH_JOIN(f) /* matched */ + Generated Plan Advice: + JOIN_ORDER(d f) + HASH_JOIN(f) + SEQ_SCAN(d f) + NO_GATHER(f d) +(12 rows) + +SET LOCAL pg_plan_advice.advice = 'MERGE_JOIN_MATERIALIZE(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +---------------------------------------------------------------- + Merge Join + Merge Cond: (d.id = f.dim_id) + -> Index Scan using join_dim_pkey on join_dim d + -> Materialize + -> Index Scan using join_fact_dim_id on join_fact f + Supplied Plan Advice: + MERGE_JOIN_MATERIALIZE(f) /* matched */ + Generated Plan Advice: + JOIN_ORDER(d f) + MERGE_JOIN_MATERIALIZE(f) + INDEX_SCAN(d public.join_dim_pkey f public.join_fact_dim_id) + NO_GATHER(f d) +(12 rows) + +SET LOCAL pg_plan_advice.advice = 'MERGE_JOIN_PLAIN(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +---------------------------------------------------------------- + Merge Join + Merge Cond: (d.id = f.dim_id) + -> Index Scan using join_dim_pkey on join_dim d + -> Index Scan using join_fact_dim_id on join_fact f + Supplied Plan Advice: + MERGE_JOIN_PLAIN(f) /* matched */ + Generated Plan Advice: + JOIN_ORDER(d f) + MERGE_JOIN_PLAIN(f) + INDEX_SCAN(d public.join_dim_pkey f public.join_fact_dim_id) + NO_GATHER(f d) +(11 rows) + +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_MATERIALIZE(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +-------------------------------------------- + Nested Loop + Join Filter: (f.dim_id = d.id) + -> Seq Scan 
on join_dim d + -> Materialize + -> Seq Scan on join_fact f + Supplied Plan Advice: + NESTED_LOOP_MATERIALIZE(f) /* matched */ + Generated Plan Advice: + JOIN_ORDER(d f) + NESTED_LOOP_MATERIALIZE(f) + SEQ_SCAN(d f) + NO_GATHER(f d) +(12 rows) + +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_MEMOIZE(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +-------------------------------------------------------------- + Nested Loop + -> Seq Scan on join_dim d + -> Memoize + Cache Key: d.id + Cache Mode: logical + -> Index Scan using join_fact_dim_id on join_fact f + Index Cond: (dim_id = d.id) + Supplied Plan Advice: + NESTED_LOOP_MEMOIZE(f) /* matched */ + Generated Plan Advice: + JOIN_ORDER(d f) + NESTED_LOOP_MEMOIZE(f) + SEQ_SCAN(d) + INDEX_SCAN(f public.join_fact_dim_id) + NO_GATHER(f d) +(15 rows) + +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_PLAIN(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +-------------------------------------------------------- + Nested Loop + -> Seq Scan on join_dim d + -> Index Scan using join_fact_dim_id on join_fact f + Index Cond: (dim_id = d.id) + Supplied Plan Advice: + NESTED_LOOP_PLAIN(f) /* matched */ + Generated Plan Advice: + JOIN_ORDER(d f) + NESTED_LOOP_PLAIN(f) + SEQ_SCAN(d) + INDEX_SCAN(f public.join_fact_dim_id) + NO_GATHER(f d) +(12 rows) + +COMMIT; +-- Non-working cases. We can't force a foreign join between these tables, +-- because they aren't foreign tables. We also can't use two different +-- strategies on the same table, nor can we put both tables on the inner +-- side of the same join. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'FOREIGN_JOIN((f d))'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +---------------------------------------------------- + Nested Loop + Disabled: true + -> Seq Scan on join_fact f + -> Index Scan using join_dim_pkey on join_dim d + Index Cond: (id = f.dim_id) + Supplied Plan Advice: + FOREIGN_JOIN((f d)) /* matched, failed */ + Generated Plan Advice: + JOIN_ORDER(f d) + NESTED_LOOP_PLAIN(d) + SEQ_SCAN(f) + INDEX_SCAN(d public.join_dim_pkey) + NO_GATHER(f d) +(13 rows) + +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_PLAIN(f) NESTED_LOOP_MATERIALIZE(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +----------------------------------------------------------------- + Merge Join + Merge Cond: (d.id = f.dim_id) + -> Index Scan using join_dim_pkey on join_dim d + -> Index Scan using join_fact_dim_id on join_fact f + Supplied Plan Advice: + NESTED_LOOP_PLAIN(f) /* matched, conflicting, failed */ + NESTED_LOOP_MATERIALIZE(f) /* matched, conflicting, failed */ + Generated Plan Advice: + JOIN_ORDER(d f) + MERGE_JOIN_PLAIN(f) + INDEX_SCAN(d public.join_dim_pkey f public.join_fact_dim_id) + NO_GATHER(f d) +(12 rows) + +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_PLAIN(f d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +---------------------------------------------------- + Nested Loop + Disabled: true + -> Seq Scan on join_fact f + -> Index Scan using join_dim_pkey on join_dim d + Index Cond: (id = f.dim_id) + Supplied Plan Advice: + NESTED_LOOP_PLAIN(f) /* matched, failed */ + NESTED_LOOP_PLAIN(d) /* matched */ + Generated Plan Advice: + JOIN_ORDER(f d) + NESTED_LOOP_PLAIN(d) + SEQ_SCAN(f) + INDEX_SCAN(d public.join_dim_pkey) + NO_GATHER(f d) +(14 rows) + +COMMIT; diff --git a/contrib/pg_plan_advice/expected/local_collector.out 
b/contrib/pg_plan_advice/expected/local_collector.out new file mode 100644 index 00000000000..f2adef39ed8 --- /dev/null +++ b/contrib/pg_plan_advice/expected/local_collector.out @@ -0,0 +1,67 @@ +CREATE EXTENSION pg_plan_advice; +SET debug_parallel_query = off; +-- Try clearing advice before we've collected any. +SELECT pg_clear_collected_local_advice(); + pg_clear_collected_local_advice +--------------------------------- + +(1 row) + +-- Set a small advice collection limit so that we'll exceed it. +SET pg_plan_advice.local_collection_limit = 2; +-- Enable the collector. +SET pg_plan_advice.local_collector = on; +-- Set up a dummy table. +CREATE TABLE dummy_table (a int primary key, b text) + WITH (autovacuum_enabled = false, parallel_workers = 0); +-- Test queries. +SELECT * FROM dummy_table a, dummy_table b; + a | b | a | b +---+---+---+--- +(0 rows) + +SELECT * FROM dummy_table; + a | b +---+--- +(0 rows) + +-- Should return the advice from the second test query. +SELECT advice FROM pg_get_collected_local_advice() ORDER BY id DESC LIMIT 1; + advice +------------------------ + SEQ_SCAN(dummy_table) + + NO_GATHER(dummy_table) +(1 row) + +-- Now try clearing advice again. +SELECT pg_clear_collected_local_advice(); + pg_clear_collected_local_advice +--------------------------------- + +(1 row) + +-- Raise the collection limit so that the collector uses multiple chunks. +SET pg_plan_advice.local_collection_limit = 2000; +-- Push a bunch of queries through the collector. +DO $$ +BEGIN + FOR x IN 1..2000 LOOP + EXECUTE 'SELECT * FROM dummy_table'; + END LOOP; +END +$$; +-- Check that the collector worked. +SELECT COUNT(*) FROM pg_get_collected_local_advice(); + count +------- + 2000 +(1 row) + +-- And clear one more time, to verify that this doesn't cause a problem +-- even with a larger number of entries. 
+SELECT pg_clear_collected_local_advice(); + pg_clear_collected_local_advice +--------------------------------- + +(1 row) + diff --git a/contrib/pg_plan_advice/expected/partitionwise.out b/contrib/pg_plan_advice/expected/partitionwise.out new file mode 100644 index 00000000000..2b3d0a82443 --- /dev/null +++ b/contrib/pg_plan_advice/expected/partitionwise.out @@ -0,0 +1,426 @@ +LOAD 'pg_plan_advice'; +SET max_parallel_workers_per_gather = 0; +SET enable_partitionwise_join = true; +CREATE TABLE pt1 (id integer primary key, dim1 text, val1 int) + PARTITION BY RANGE (id); +CREATE TABLE pt1a PARTITION OF pt1 FOR VALUES FROM (1) to (1001) + WITH (autovacuum_enabled = false); +CREATE TABLE pt1b PARTITION OF pt1 FOR VALUES FROM (1001) to (2001) + WITH (autovacuum_enabled = false); +CREATE TABLE pt1c PARTITION OF pt1 FOR VALUES FROM (2001) to (3001) + WITH (autovacuum_enabled = false); +INSERT INTO pt1 (id, dim1, val1) + SELECT g, 'some filler text ' || g, (g % 3) + 1 + FROM generate_series(1,3000) g; +VACUUM ANALYZE pt1; +CREATE TABLE pt2 (id integer primary key, dim2 text, val2 int) + PARTITION BY RANGE (id); +CREATE TABLE pt2a PARTITION OF pt2 FOR VALUES FROM (1) to (1001) + WITH (autovacuum_enabled = false); +CREATE TABLE pt2b PARTITION OF pt2 FOR VALUES FROM (1001) to (2001) + WITH (autovacuum_enabled = false); +CREATE TABLE pt2c PARTITION OF pt2 FOR VALUES FROM (2001) to (3001) + WITH (autovacuum_enabled = false); +INSERT INTO pt2 (id, dim2, val2) + SELECT g, 'some other text ' || g, (g % 5) + 1 + FROM generate_series(1,3000,2) g; +VACUUM ANALYZE pt2; +CREATE TABLE pt3 (id integer primary key, dim3 text, val3 int) + PARTITION BY RANGE (id); +CREATE TABLE pt3a PARTITION OF pt3 FOR VALUES FROM (1) to (1001) + WITH (autovacuum_enabled = false); +CREATE TABLE pt3b PARTITION OF pt3 FOR VALUES FROM (1001) to (2001) + WITH (autovacuum_enabled = false); +CREATE TABLE pt3c PARTITION OF pt3 FOR VALUES FROM (2001) to (3001) + WITH (autovacuum_enabled = false); +INSERT INTO pt3 
(id, dim3, val3) + SELECT g, 'a third random text ' || g, (g % 7) + 1 + FROM generate_series(1,3000,3) g; +VACUUM ANALYZE pt3; +CREATE TABLE ptmismatch (id integer primary key, dimm text, valm int) + PARTITION BY RANGE (id); +CREATE TABLE ptmismatcha PARTITION OF ptmismatch + FOR VALUES FROM (1) to (1501) + WITH (autovacuum_enabled = false); +CREATE TABLE ptmismatchb PARTITION OF ptmismatch + FOR VALUES FROM (1501) to (3001) + WITH (autovacuum_enabled = false); +INSERT INTO ptmismatch (id, dimm, valm) + SELECT g, 'yet another text ' || g, (g % 2) + 1 + FROM generate_series(1,3000) g; +VACUUM ANALYZE ptmismatch; +EXPLAIN (PLAN_ADVICE, COSTS OFF) +SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id + AND val1 = 1 AND val2 = 1 AND val3 = 1; + QUERY PLAN +------------------------------------------------------------------------------------- + Append + -> Nested Loop + -> Hash Join + Hash Cond: (pt2_1.id = pt3_1.id) + -> Seq Scan on pt2a pt2_1 + Filter: (val2 = 1) + -> Hash + -> Seq Scan on pt3a pt3_1 + Filter: (val3 = 1) + -> Index Scan using pt1a_pkey on pt1a pt1_1 + Index Cond: (id = pt2_1.id) + Filter: (val1 = 1) + -> Nested Loop + -> Hash Join + Hash Cond: (pt2_2.id = pt3_2.id) + -> Seq Scan on pt2b pt2_2 + Filter: (val2 = 1) + -> Hash + -> Seq Scan on pt3b pt3_2 + Filter: (val3 = 1) + -> Index Scan using pt1b_pkey on pt1b pt1_2 + Index Cond: (id = pt2_2.id) + Filter: (val1 = 1) + -> Nested Loop + -> Hash Join + Hash Cond: (pt2_3.id = pt3_3.id) + -> Seq Scan on pt2c pt2_3 + Filter: (val2 = 1) + -> Hash + -> Seq Scan on pt3c pt3_3 + Filter: (val3 = 1) + -> Index Scan using pt1c_pkey on pt1c pt1_3 + Index Cond: (id = pt2_3.id) + Filter: (val1 = 1) + Generated Plan Advice: + JOIN_ORDER(pt2/public.pt2a pt3/public.pt3a pt1/public.pt1a) + JOIN_ORDER(pt2/public.pt2b pt3/public.pt3b pt1/public.pt1b) + JOIN_ORDER(pt2/public.pt2c pt3/public.pt3c pt1/public.pt1c) + NESTED_LOOP_PLAIN(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c) + HASH_JOIN(pt3/public.pt3a 
pt3/public.pt3b pt3/public.pt3c) + SEQ_SCAN(pt2/public.pt2a pt3/public.pt3a pt2/public.pt2b pt3/public.pt3b + pt2/public.pt2c pt3/public.pt3c) + INDEX_SCAN(pt1/public.pt1a public.pt1a_pkey pt1/public.pt1b public.pt1b_pkey + pt1/public.pt1c public.pt1c_pkey) + PARTITIONWISE((pt1 pt2 pt3)) + NO_GATHER(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c pt2/public.pt2a + pt2/public.pt2b pt2/public.pt2c pt3/public.pt3a pt3/public.pt3b pt3/public.pt3c) +(47 rows) + +-- Suppress partitionwise join, or do it just partially. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'PARTITIONWISE(pt1 pt2 pt3)'; +EXPLAIN (PLAN_ADVICE, COSTS OFF) +SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id + AND val1 = 1 AND val2 = 1 AND val3 = 1; + QUERY PLAN +------------------------------------------------------------------------------------- + Nested Loop + -> Hash Join + Hash Cond: (pt2.id = pt3.id) + -> Append + -> Seq Scan on pt2a pt2_1 + Filter: (val2 = 1) + -> Seq Scan on pt2b pt2_2 + Filter: (val2 = 1) + -> Seq Scan on pt2c pt2_3 + Filter: (val2 = 1) + -> Hash + -> Append + -> Seq Scan on pt3a pt3_1 + Filter: (val3 = 1) + -> Seq Scan on pt3b pt3_2 + Filter: (val3 = 1) + -> Seq Scan on pt3c pt3_3 + Filter: (val3 = 1) + -> Append + -> Index Scan using pt1a_pkey on pt1a pt1_1 + Index Cond: (id = pt2.id) + Filter: (val1 = 1) + -> Index Scan using pt1b_pkey on pt1b pt1_2 + Index Cond: (id = pt2.id) + Filter: (val1 = 1) + -> Index Scan using pt1c_pkey on pt1c pt1_3 + Index Cond: (id = pt2.id) + Filter: (val1 = 1) + Supplied Plan Advice: + PARTITIONWISE(pt1) /* matched */ + PARTITIONWISE(pt2) /* matched */ + PARTITIONWISE(pt3) /* matched */ + Generated Plan Advice: + JOIN_ORDER(pt2 pt3 pt1) + NESTED_LOOP_PLAIN(pt1) + HASH_JOIN(pt3) + SEQ_SCAN(pt2/public.pt2a pt2/public.pt2b pt2/public.pt2c pt3/public.pt3a + pt3/public.pt3b pt3/public.pt3c) + INDEX_SCAN(pt1/public.pt1a public.pt1a_pkey pt1/public.pt1b public.pt1b_pkey + pt1/public.pt1c public.pt1c_pkey) + PARTITIONWISE(pt2 pt3 pt1) 
+ NO_GATHER(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c pt2/public.pt2a + pt2/public.pt2b pt2/public.pt2c pt3/public.pt3a pt3/public.pt3b pt3/public.pt3c) +(43 rows) + +SET LOCAL pg_plan_advice.advice = 'PARTITIONWISE((pt1 pt2) pt3)'; +EXPLAIN (PLAN_ADVICE, COSTS OFF) +SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id + AND val1 = 1 AND val2 = 1 AND val3 = 1; + QUERY PLAN +------------------------------------------------------------------------------------- + Hash Join + Hash Cond: (pt1.id = pt3.id) + -> Append + -> Hash Join + Hash Cond: (pt1_1.id = pt2_1.id) + -> Seq Scan on pt1a pt1_1 + Filter: (val1 = 1) + -> Hash + -> Seq Scan on pt2a pt2_1 + Filter: (val2 = 1) + -> Hash Join + Hash Cond: (pt1_2.id = pt2_2.id) + -> Seq Scan on pt1b pt1_2 + Filter: (val1 = 1) + -> Hash + -> Seq Scan on pt2b pt2_2 + Filter: (val2 = 1) + -> Hash Join + Hash Cond: (pt1_3.id = pt2_3.id) + -> Seq Scan on pt1c pt1_3 + Filter: (val1 = 1) + -> Hash + -> Seq Scan on pt2c pt2_3 + Filter: (val2 = 1) + -> Hash + -> Append + -> Seq Scan on pt3a pt3_1 + Filter: (val3 = 1) + -> Seq Scan on pt3b pt3_2 + Filter: (val3 = 1) + -> Seq Scan on pt3c pt3_3 + Filter: (val3 = 1) + Supplied Plan Advice: + PARTITIONWISE((pt1 pt2)) /* matched */ + PARTITIONWISE(pt3) /* matched */ + Generated Plan Advice: + JOIN_ORDER(pt1/public.pt1a pt2/public.pt2a) + JOIN_ORDER(pt1/public.pt1b pt2/public.pt2b) + JOIN_ORDER(pt1/public.pt1c pt2/public.pt2c) + JOIN_ORDER({pt1 pt2} pt3) + HASH_JOIN(pt2/public.pt2a pt2/public.pt2b pt2/public.pt2c pt3) + SEQ_SCAN(pt1/public.pt1a pt2/public.pt2a pt1/public.pt1b pt2/public.pt2b + pt1/public.pt1c pt2/public.pt2c pt3/public.pt3a pt3/public.pt3b + pt3/public.pt3c) + PARTITIONWISE((pt1 pt2) pt3) + NO_GATHER(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c pt2/public.pt2a + pt2/public.pt2b pt2/public.pt2c pt3/public.pt3a pt3/public.pt3b pt3/public.pt3c) +(47 rows) + +COMMIT; +-- Test conflicting advice. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'PARTITIONWISE((pt1 pt2) (pt1 pt3))'; +EXPLAIN (PLAN_ADVICE, COSTS OFF) +SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id + AND val1 = 1 AND val2 = 1 AND val3 = 1; + QUERY PLAN +------------------------------------------------------------------------------------- + Append + Disabled: true + -> Nested Loop + -> Hash Join + Hash Cond: (pt2_1.id = pt3_1.id) + -> Seq Scan on pt2a pt2_1 + Filter: (val2 = 1) + -> Hash + -> Seq Scan on pt3a pt3_1 + Filter: (val3 = 1) + -> Index Scan using pt1a_pkey on pt1a pt1_1 + Index Cond: (id = pt2_1.id) + Filter: (val1 = 1) + -> Nested Loop + -> Hash Join + Hash Cond: (pt2_2.id = pt3_2.id) + -> Seq Scan on pt2b pt2_2 + Filter: (val2 = 1) + -> Hash + -> Seq Scan on pt3b pt3_2 + Filter: (val3 = 1) + -> Index Scan using pt1b_pkey on pt1b pt1_2 + Index Cond: (id = pt2_2.id) + Filter: (val1 = 1) + -> Nested Loop + -> Hash Join + Hash Cond: (pt2_3.id = pt3_3.id) + -> Seq Scan on pt2c pt2_3 + Filter: (val2 = 1) + -> Hash + -> Seq Scan on pt3c pt3_3 + Filter: (val3 = 1) + -> Index Scan using pt1c_pkey on pt1c pt1_3 + Index Cond: (id = pt2_3.id) + Filter: (val1 = 1) + Supplied Plan Advice: + PARTITIONWISE((pt1 pt2)) /* matched, conflicting, failed */ + PARTITIONWISE((pt1 pt3)) /* matched, conflicting, failed */ + Generated Plan Advice: + JOIN_ORDER(pt2/public.pt2a pt3/public.pt3a pt1/public.pt1a) + JOIN_ORDER(pt2/public.pt2b pt3/public.pt3b pt1/public.pt1b) + JOIN_ORDER(pt2/public.pt2c pt3/public.pt3c pt1/public.pt1c) + NESTED_LOOP_PLAIN(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c) + HASH_JOIN(pt3/public.pt3a pt3/public.pt3b pt3/public.pt3c) + SEQ_SCAN(pt2/public.pt2a pt3/public.pt3a pt2/public.pt2b pt3/public.pt3b + pt2/public.pt2c pt3/public.pt3c) + INDEX_SCAN(pt1/public.pt1a public.pt1a_pkey pt1/public.pt1b public.pt1b_pkey + pt1/public.pt1c public.pt1c_pkey) + PARTITIONWISE((pt1 pt2 pt3)) + NO_GATHER(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c pt2/public.pt2a + 
pt2/public.pt2b pt2/public.pt2c pt3/public.pt3a pt3/public.pt3b pt3/public.pt3c) +(51 rows) + +COMMIT; +-- Can't force a partitionwise join with a mismatched table. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'PARTITIONWISE((pt1 ptmismatch))'; +EXPLAIN (PLAN_ADVICE, COSTS OFF) +SELECT * FROM pt1, ptmismatch WHERE pt1.id = ptmismatch.id; + QUERY PLAN +--------------------------------------------------------------------------- + Nested Loop + Disabled: true + -> Append + -> Seq Scan on pt1a pt1_1 + -> Seq Scan on pt1b pt1_2 + -> Seq Scan on pt1c pt1_3 + -> Append + -> Index Scan using ptmismatcha_pkey on ptmismatcha ptmismatch_1 + Index Cond: (id = pt1.id) + -> Index Scan using ptmismatchb_pkey on ptmismatchb ptmismatch_2 + Index Cond: (id = pt1.id) + Supplied Plan Advice: + PARTITIONWISE((pt1 ptmismatch)) /* matched, failed */ + Generated Plan Advice: + JOIN_ORDER(pt1 ptmismatch) + NESTED_LOOP_PLAIN(ptmismatch) + SEQ_SCAN(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c) + INDEX_SCAN(ptmismatch/public.ptmismatcha public.ptmismatcha_pkey + ptmismatch/public.ptmismatchb public.ptmismatchb_pkey) + PARTITIONWISE(pt1 ptmismatch) + NO_GATHER(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c + ptmismatch/public.ptmismatcha ptmismatch/public.ptmismatchb) +(22 rows) + +COMMIT; +-- Force join order for a particular branch of the partitionwise join with +-- and without mentioning the schema name. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'JOIN_ORDER(pt3/public.pt3a pt2/public.pt2a pt1/public.pt1a)'; +EXPLAIN (PLAN_ADVICE, COSTS OFF) +SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id + AND val1 = 1 AND val2 = 1 AND val3 = 1; + QUERY PLAN +------------------------------------------------------------------------------------- + Append + -> Nested Loop + -> Hash Join + Hash Cond: (pt3_1.id = pt2_1.id) + -> Seq Scan on pt3a pt3_1 + Filter: (val3 = 1) + -> Hash + -> Seq Scan on pt2a pt2_1 + Filter: (val2 = 1) + -> Index Scan using pt1a_pkey on pt1a pt1_1 + Index Cond: (id = pt2_1.id) + Filter: (val1 = 1) + -> Nested Loop + -> Hash Join + Hash Cond: (pt2_2.id = pt3_2.id) + -> Seq Scan on pt2b pt2_2 + Filter: (val2 = 1) + -> Hash + -> Seq Scan on pt3b pt3_2 + Filter: (val3 = 1) + -> Index Scan using pt1b_pkey on pt1b pt1_2 + Index Cond: (id = pt2_2.id) + Filter: (val1 = 1) + -> Nested Loop + -> Hash Join + Hash Cond: (pt2_3.id = pt3_3.id) + -> Seq Scan on pt2c pt2_3 + Filter: (val2 = 1) + -> Hash + -> Seq Scan on pt3c pt3_3 + Filter: (val3 = 1) + -> Index Scan using pt1c_pkey on pt1c pt1_3 + Index Cond: (id = pt2_3.id) + Filter: (val1 = 1) + Supplied Plan Advice: + JOIN_ORDER(pt3/public.pt3a pt2/public.pt2a pt1/public.pt1a) /* matched */ + Generated Plan Advice: + JOIN_ORDER(pt3/public.pt3a pt2/public.pt2a pt1/public.pt1a) + JOIN_ORDER(pt2/public.pt2b pt3/public.pt3b pt1/public.pt1b) + JOIN_ORDER(pt2/public.pt2c pt3/public.pt3c pt1/public.pt1c) + NESTED_LOOP_PLAIN(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c) + HASH_JOIN(pt2/public.pt2a pt3/public.pt3b pt3/public.pt3c) + SEQ_SCAN(pt3/public.pt3a pt2/public.pt2a pt2/public.pt2b pt3/public.pt3b + pt2/public.pt2c pt3/public.pt3c) + INDEX_SCAN(pt1/public.pt1a public.pt1a_pkey pt1/public.pt1b public.pt1b_pkey + pt1/public.pt1c public.pt1c_pkey) + PARTITIONWISE((pt1 pt2 pt3)) + NO_GATHER(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c pt2/public.pt2a + pt2/public.pt2b pt2/public.pt2c 
pt3/public.pt3a pt3/public.pt3b pt3/public.pt3c) +(49 rows) + +SET LOCAL pg_plan_advice.advice = 'JOIN_ORDER(pt3/pt3a pt2/pt2a pt1/pt1a)'; +EXPLAIN (PLAN_ADVICE, COSTS OFF) +SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id + AND val1 = 1 AND val2 = 1 AND val3 = 1; + QUERY PLAN +------------------------------------------------------------------------------------- + Append + -> Nested Loop + -> Hash Join + Hash Cond: (pt3_1.id = pt2_1.id) + -> Seq Scan on pt3a pt3_1 + Filter: (val3 = 1) + -> Hash + -> Seq Scan on pt2a pt2_1 + Filter: (val2 = 1) + -> Index Scan using pt1a_pkey on pt1a pt1_1 + Index Cond: (id = pt2_1.id) + Filter: (val1 = 1) + -> Nested Loop + -> Hash Join + Hash Cond: (pt2_2.id = pt3_2.id) + -> Seq Scan on pt2b pt2_2 + Filter: (val2 = 1) + -> Hash + -> Seq Scan on pt3b pt3_2 + Filter: (val3 = 1) + -> Index Scan using pt1b_pkey on pt1b pt1_2 + Index Cond: (id = pt2_2.id) + Filter: (val1 = 1) + -> Nested Loop + -> Hash Join + Hash Cond: (pt2_3.id = pt3_3.id) + -> Seq Scan on pt2c pt2_3 + Filter: (val2 = 1) + -> Hash + -> Seq Scan on pt3c pt3_3 + Filter: (val3 = 1) + -> Index Scan using pt1c_pkey on pt1c pt1_3 + Index Cond: (id = pt2_3.id) + Filter: (val1 = 1) + Supplied Plan Advice: + JOIN_ORDER(pt3/pt3a pt2/pt2a pt1/pt1a) /* matched */ + Generated Plan Advice: + JOIN_ORDER(pt3/public.pt3a pt2/public.pt2a pt1/public.pt1a) + JOIN_ORDER(pt2/public.pt2b pt3/public.pt3b pt1/public.pt1b) + JOIN_ORDER(pt2/public.pt2c pt3/public.pt3c pt1/public.pt1c) + NESTED_LOOP_PLAIN(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c) + HASH_JOIN(pt2/public.pt2a pt3/public.pt3b pt3/public.pt3c) + SEQ_SCAN(pt3/public.pt3a pt2/public.pt2a pt2/public.pt2b pt3/public.pt3b + pt2/public.pt2c pt3/public.pt3c) + INDEX_SCAN(pt1/public.pt1a public.pt1a_pkey pt1/public.pt1b public.pt1b_pkey + pt1/public.pt1c public.pt1c_pkey) + PARTITIONWISE((pt1 pt2 pt3)) + NO_GATHER(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c pt2/public.pt2a + pt2/public.pt2b pt2/public.pt2c 
pt3/public.pt3a pt3/public.pt3b pt3/public.pt3c) +(49 rows) + +COMMIT; diff --git a/contrib/pg_plan_advice/expected/prepared.out b/contrib/pg_plan_advice/expected/prepared.out new file mode 100644 index 00000000000..07a7c623659 --- /dev/null +++ b/contrib/pg_plan_advice/expected/prepared.out @@ -0,0 +1,67 @@ +LOAD 'pg_plan_advice'; +SET max_parallel_workers_per_gather = 0; +CREATE TABLE ptab (id integer, val text) WITH (autovacuum_enabled = false); +SET pg_plan_advice.always_store_advice_details = false; +-- Not prepared, so advice should be generated. +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM ptab; + QUERY PLAN +------------------------ + Seq Scan on ptab + Generated Plan Advice: + SEQ_SCAN(ptab) + NO_GATHER(ptab) +(4 rows) + +-- Prepared, so advice should not be generated. +PREPARE pt1 AS SELECT * FROM ptab; +EXPLAIN (COSTS OFF, PLAN_ADVICE) EXECUTE pt1; + QUERY PLAN +------------------ + Seq Scan on ptab +(1 row) + +SET pg_plan_advice.always_store_advice_details = true; +-- Prepared, but always_store_advice_details = true, so should show advice. +PREPARE pt2 AS SELECT * FROM ptab; +EXPLAIN (COSTS OFF, PLAN_ADVICE) EXECUTE pt2; + QUERY PLAN +------------------------ + Seq Scan on ptab + Generated Plan Advice: + SEQ_SCAN(ptab) + NO_GATHER(ptab) +(4 rows) + +-- Not prepared, so feedback should be generated. +SET pg_plan_advice.always_store_advice_details = false; +SET pg_plan_advice.advice = 'SEQ_SCAN(ptab)'; +EXPLAIN (COSTS OFF) +SELECT * FROM ptab; + QUERY PLAN +-------------------------------- + Seq Scan on ptab + Supplied Plan Advice: + SEQ_SCAN(ptab) /* matched */ +(3 rows) + +-- Prepared, so advice should not be generated. +PREPARE pt3 AS SELECT * FROM ptab; +EXPLAIN (COSTS OFF) EXECUTE pt1; + QUERY PLAN +------------------ + Seq Scan on ptab +(1 row) + +SET pg_plan_advice.always_store_advice_details = true; +-- Prepared, but always_store_advice_details = true, so should show feedback. 
+PREPARE pt4 AS SELECT * FROM ptab; +EXPLAIN (COSTS OFF, PLAN_ADVICE) EXECUTE pt2; + QUERY PLAN +------------------------ + Seq Scan on ptab + Generated Plan Advice: + SEQ_SCAN(ptab) + NO_GATHER(ptab) +(4 rows) + diff --git a/contrib/pg_plan_advice/expected/scan.out b/contrib/pg_plan_advice/expected/scan.out new file mode 100644 index 00000000000..d05ead369b4 --- /dev/null +++ b/contrib/pg_plan_advice/expected/scan.out @@ -0,0 +1,757 @@ +LOAD 'pg_plan_advice'; +SET max_parallel_workers_per_gather = 0; +SET seq_page_cost = 0.1; +SET random_page_cost = 0.1; +SET cpu_tuple_cost = 0; +SET cpu_index_tuple_cost = 0; +CREATE TABLE scan_table (a int primary key, b text) + WITH (autovacuum_enabled = false); +INSERT INTO scan_table + SELECT g, 'some text ' || g FROM generate_series(1, 100000) g; +CREATE INDEX scan_table_b ON scan_table USING brin (b); +VACUUM ANALYZE scan_table; +-- Sequential scan +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table; + QUERY PLAN +------------------------- + Seq Scan on scan_table + Generated Plan Advice: + SEQ_SCAN(scan_table) + NO_GATHER(scan_table) +(4 rows) + +-- Index scan +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; + QUERY PLAN +------------------------------------------------- + Index Scan using scan_table_pkey on scan_table + Index Cond: (a = 1) + Generated Plan Advice: + INDEX_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(5 rows) + +-- Index-only scan +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1; + QUERY PLAN +------------------------------------------------------ + Index Only Scan using scan_table_pkey on scan_table + Index Cond: (a = 1) + Generated Plan Advice: + INDEX_ONLY_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(5 rows) + +-- Bitmap heap scan +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table + WHERE b > 'some text 8'; + QUERY PLAN +----------------------------------------------- + Bitmap Heap Scan on scan_table + 
Recheck Cond: (b > 'some text 8'::text) + -> Bitmap Index Scan on scan_table_b + Index Cond: (b > 'some text 8'::text) + Generated Plan Advice: + BITMAP_HEAP_SCAN(scan_table) + NO_GATHER(scan_table) +(7 rows) + +-- TID scan +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE ctid = '(0,1)'; + QUERY PLAN +----------------------------------- + Tid Scan on scan_table + TID Cond: (ctid = '(0,1)'::tid) + Generated Plan Advice: + TID_SCAN(scan_table) + NO_GATHER(scan_table) +(5 rows) + +-- TID range scan +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table + WHERE ctid > '(1,1)' AND ctid < '(2,1)'; + QUERY PLAN +--------------------------------------------------------------- + Tid Range Scan on scan_table + TID Cond: ((ctid > '(1,1)'::tid) AND (ctid < '(2,1)'::tid)) + Generated Plan Advice: + TID_SCAN(scan_table) + NO_GATHER(scan_table) +(5 rows) + +-- Try forcing each of our test queries to use the scan type they +-- wanted to use anyway. This should succeed. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(scan_table)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table; + QUERY PLAN +-------------------------------------- + Seq Scan on scan_table + Supplied Plan Advice: + SEQ_SCAN(scan_table) /* matched */ + Generated Plan Advice: + SEQ_SCAN(scan_table) + NO_GATHER(scan_table) +(6 rows) + +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; + QUERY PLAN +-------------------------------------------------------- + Index Scan using scan_table_pkey on scan_table + Index Cond: (a = 1) + Supplied Plan Advice: + INDEX_SCAN(scan_table scan_table_pkey) /* matched */ + Generated Plan Advice: + INDEX_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(7 rows) + +SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(scan_table scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1; + QUERY PLAN 
+------------------------------------------------------------- + Index Only Scan using scan_table_pkey on scan_table + Index Cond: (a = 1) + Supplied Plan Advice: + INDEX_ONLY_SCAN(scan_table scan_table_pkey) /* matched */ + Generated Plan Advice: + INDEX_ONLY_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(7 rows) + +SET LOCAL pg_plan_advice.advice = 'BITMAP_HEAP_SCAN(scan_table)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table + WHERE b > 'some text 8'; + QUERY PLAN +----------------------------------------------- + Bitmap Heap Scan on scan_table + Recheck Cond: (b > 'some text 8'::text) + -> Bitmap Index Scan on scan_table_b + Index Cond: (b > 'some text 8'::text) + Supplied Plan Advice: + BITMAP_HEAP_SCAN(scan_table) /* matched */ + Generated Plan Advice: + BITMAP_HEAP_SCAN(scan_table) + NO_GATHER(scan_table) +(9 rows) + +SET LOCAL pg_plan_advice.advice = 'TID_SCAN(scan_table)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE ctid = '(0,1)'; + QUERY PLAN +-------------------------------------- + Tid Scan on scan_table + TID Cond: (ctid = '(0,1)'::tid) + Supplied Plan Advice: + TID_SCAN(scan_table) /* matched */ + Generated Plan Advice: + TID_SCAN(scan_table) + NO_GATHER(scan_table) +(7 rows) + +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table + WHERE ctid > '(1,1)' AND ctid < '(2,1)'; + QUERY PLAN +--------------------------------------------------------------- + Tid Range Scan on scan_table + TID Cond: ((ctid > '(1,1)'::tid) AND (ctid < '(2,1)'::tid)) + Supplied Plan Advice: + TID_SCAN(scan_table) /* matched */ + Generated Plan Advice: + TID_SCAN(scan_table) + NO_GATHER(scan_table) +(7 rows) + +COMMIT; +-- Try to force a full scan of the table to use some other scan type. All +-- of these will fail. An index scan or bitmap heap scan could potentially +-- generate the correct answer, but the planner does not even consider these +-- possibilities due to the lack of a WHERE clause. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table; + QUERY PLAN +---------------------------------------------------------------- + Seq Scan on scan_table + Disabled: true + Supplied Plan Advice: + INDEX_SCAN(scan_table scan_table_pkey) /* matched, failed */ + Generated Plan Advice: + SEQ_SCAN(scan_table) + NO_GATHER(scan_table) +(7 rows) + +SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(scan_table scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table; + QUERY PLAN +--------------------------------------------------------------------- + Seq Scan on scan_table + Disabled: true + Supplied Plan Advice: + INDEX_ONLY_SCAN(scan_table scan_table_pkey) /* matched, failed */ + Generated Plan Advice: + SEQ_SCAN(scan_table) + NO_GATHER(scan_table) +(7 rows) + +SET LOCAL pg_plan_advice.advice = 'BITMAP_HEAP_SCAN(scan_table)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table; + QUERY PLAN +------------------------------------------------------ + Seq Scan on scan_table + Disabled: true + Supplied Plan Advice: + BITMAP_HEAP_SCAN(scan_table) /* matched, failed */ + Generated Plan Advice: + SEQ_SCAN(scan_table) + NO_GATHER(scan_table) +(7 rows) + +SET LOCAL pg_plan_advice.advice = 'TID_SCAN(scan_table)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table; + QUERY PLAN +---------------------------------------------- + Seq Scan on scan_table + Disabled: true + Supplied Plan Advice: + TID_SCAN(scan_table) /* matched, failed */ + Generated Plan Advice: + SEQ_SCAN(scan_table) + NO_GATHER(scan_table) +(7 rows) + +COMMIT; +-- Try again to force index use. This should now succeed for the INDEX_SCAN +-- and BITMAP_HEAP_SCAN, but the INDEX_ONLY_SCAN can't be forced because the +-- query fetches columns not included in the index. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a > 0; + QUERY PLAN +-------------------------------------------------------- + Index Scan using scan_table_pkey on scan_table + Index Cond: (a > 0) + Supplied Plan Advice: + INDEX_SCAN(scan_table scan_table_pkey) /* matched */ + Generated Plan Advice: + INDEX_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(7 rows) + +SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(scan_table scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a > 0; + QUERY PLAN +--------------------------------------------------------------------- + Seq Scan on scan_table + Disabled: true + Filter: (a > 0) + Supplied Plan Advice: + INDEX_ONLY_SCAN(scan_table scan_table_pkey) /* matched, failed */ + Generated Plan Advice: + SEQ_SCAN(scan_table) + NO_GATHER(scan_table) +(8 rows) + +SET LOCAL pg_plan_advice.advice = 'BITMAP_HEAP_SCAN(scan_table)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a > 0; + QUERY PLAN +---------------------------------------------- + Bitmap Heap Scan on scan_table + Recheck Cond: (a > 0) + -> Bitmap Index Scan on scan_table_pkey + Index Cond: (a > 0) + Supplied Plan Advice: + BITMAP_HEAP_SCAN(scan_table) /* matched */ + Generated Plan Advice: + BITMAP_HEAP_SCAN(scan_table) + NO_GATHER(scan_table) +(9 rows) + +COMMIT; +-- We can force a primary key lookup to use a sequential scan, but we +-- can't force it to use an index-only scan (due to the column list) +-- or a TID scan (due to the absence of a TID qual). 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(scan_table)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; + QUERY PLAN +-------------------------------------- + Seq Scan on scan_table + Filter: (a = 1) + Supplied Plan Advice: + SEQ_SCAN(scan_table) /* matched */ + Generated Plan Advice: + SEQ_SCAN(scan_table) + NO_GATHER(scan_table) +(7 rows) + +SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(scan_table scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; + QUERY PLAN +--------------------------------------------------------------------- + Index Scan using scan_table_pkey on scan_table + Disabled: true + Index Cond: (a = 1) + Supplied Plan Advice: + INDEX_ONLY_SCAN(scan_table scan_table_pkey) /* matched, failed */ + Generated Plan Advice: + INDEX_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(8 rows) + +SET LOCAL pg_plan_advice.advice = 'TID_SCAN(scan_table)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; + QUERY PLAN +------------------------------------------------- + Index Scan using scan_table_pkey on scan_table + Disabled: true + Index Cond: (a = 1) + Supplied Plan Advice: + TID_SCAN(scan_table) /* matched, failed */ + Generated Plan Advice: + INDEX_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(8 rows) + +COMMIT; +-- We can forcibly downgrade an index-only scan to an index scan, but we can't +-- force the use of an index that the planner thinks is inapplicable. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; + QUERY PLAN +-------------------------------------------------------- + Index Scan using scan_table_pkey on scan_table + Index Cond: (a = 1) + Supplied Plan Advice: + INDEX_SCAN(scan_table scan_table_pkey) /* matched */ + Generated Plan Advice: + INDEX_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(7 rows) + +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table public.scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; + QUERY PLAN +--------------------------------------------------------------- + Index Scan using scan_table_pkey on scan_table + Index Cond: (a = 1) + Supplied Plan Advice: + INDEX_SCAN(scan_table public.scan_table_pkey) /* matched */ + Generated Plan Advice: + INDEX_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(7 rows) + +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_b)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; + QUERY PLAN +------------------------------------------------------------- + Index Scan using scan_table_pkey on scan_table + Disabled: true + Index Cond: (a = 1) + Supplied Plan Advice: + INDEX_SCAN(scan_table scan_table_b) /* matched, failed */ + Generated Plan Advice: + INDEX_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(8 rows) + +COMMIT; +-- We can force the use of a sequential scan in place of a bitmap heap scan, +-- but a plain index scan on a BRIN index is not possible. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(scan_table)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table + WHERE b > 'some text 8'; + QUERY PLAN +-------------------------------------- + Seq Scan on scan_table + Filter: (b > 'some text 8'::text) + Supplied Plan Advice: + SEQ_SCAN(scan_table) /* matched */ + Generated Plan Advice: + SEQ_SCAN(scan_table) + NO_GATHER(scan_table) +(7 rows) + +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_b)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; + QUERY PLAN +------------------------------------------------------------- + Index Scan using scan_table_pkey on scan_table + Disabled: true + Index Cond: (a = 1) + Supplied Plan Advice: + INDEX_SCAN(scan_table scan_table_b) /* matched, failed */ + Generated Plan Advice: + INDEX_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(8 rows) + +COMMIT; +-- We can force the use of a sequential scan rather than a TID scan or +-- TID range scan. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(scan_table)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE ctid = '(0,1)'; + QUERY PLAN +-------------------------------------- + Seq Scan on scan_table + Filter: (ctid = '(0,1)'::tid) + Supplied Plan Advice: + SEQ_SCAN(scan_table) /* matched */ + Generated Plan Advice: + SEQ_SCAN(scan_table) + NO_GATHER(scan_table) +(7 rows) + +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table + WHERE ctid > '(1,1)' AND ctid < '(2,1)'; + QUERY PLAN +------------------------------------------------------------- + Seq Scan on scan_table + Filter: ((ctid > '(1,1)'::tid) AND (ctid < '(2,1)'::tid)) + Supplied Plan Advice: + SEQ_SCAN(scan_table) /* matched */ + Generated Plan Advice: + SEQ_SCAN(scan_table) + NO_GATHER(scan_table) +(7 rows) + +COMMIT; +-- Test more complex scenarios with index scans. +BEGIN; +-- Should still work if we mention the schema. 
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table public.scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; + QUERY PLAN +--------------------------------------------------------------- + Index Scan using scan_table_pkey on scan_table + Index Cond: (a = 1) + Supplied Plan Advice: + INDEX_SCAN(scan_table public.scan_table_pkey) /* matched */ + Generated Plan Advice: + INDEX_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(7 rows) + +-- But not if we mention the wrong schema. +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table cilbup.scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; + QUERY PLAN +------------------------------------------------------------------------------------- + Index Scan using scan_table_pkey on scan_table + Index Cond: (a = 1) + Supplied Plan Advice: + INDEX_SCAN(scan_table cilbup.scan_table_pkey) /* matched, inapplicable, failed */ + Generated Plan Advice: + INDEX_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(7 rows) + +-- It's OK to repeat the same advice. +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey scan_table scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; + QUERY PLAN +-------------------------------------------------------- + Index Scan using scan_table_pkey on scan_table + Index Cond: (a = 1) + Supplied Plan Advice: + INDEX_SCAN(scan_table scan_table_pkey) /* matched */ + INDEX_SCAN(scan_table scan_table_pkey) /* matched */ + Generated Plan Advice: + INDEX_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(8 rows) + +-- But it doesn't work if the index target is even notionally different. 
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey scan_table public.scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; + QUERY PLAN +---------------------------------------------------------------------------- + Index Scan using scan_table_pkey on scan_table + Index Cond: (a = 1) + Supplied Plan Advice: + INDEX_SCAN(scan_table scan_table_pkey) /* matched, conflicting */ + INDEX_SCAN(scan_table public.scan_table_pkey) /* matched, conflicting */ + Generated Plan Advice: + INDEX_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(8 rows) + +COMMIT; +-- Test assorted incorrect advice. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(nothing)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1; + QUERY PLAN +------------------------------------------------------ + Index Only Scan using scan_table_pkey on scan_table + Index Cond: (a = 1) + Supplied Plan Advice: + SEQ_SCAN(nothing) /* not matched */ + Generated Plan Advice: + INDEX_ONLY_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(7 rows) + +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(nothing whatsoever)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1; + QUERY PLAN +------------------------------------------------------ + Index Only Scan using scan_table_pkey on scan_table + Index Cond: (a = 1) + Supplied Plan Advice: + INDEX_SCAN(nothing whatsoever) /* not matched */ + Generated Plan Advice: + INDEX_ONLY_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(7 rows) + +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table bogus)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1; + QUERY PLAN +-------------------------------------------------------------------- + Index Only Scan using scan_table_pkey on scan_table + Index Cond: (a = 1) + Supplied Plan Advice: + INDEX_SCAN(scan_table bogus) /* matched, inapplicable, failed */ + Generated 
Plan Advice: + INDEX_ONLY_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(7 rows) + +SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(nothing whatsoever)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1; + QUERY PLAN +--------------------------------------------------------- + Index Only Scan using scan_table_pkey on scan_table + Index Cond: (a = 1) + Supplied Plan Advice: + INDEX_ONLY_SCAN(nothing whatsoever) /* not matched */ + Generated Plan Advice: + INDEX_ONLY_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(7 rows) + +SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(scan_table bogus)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1; + QUERY PLAN +------------------------------------------------------------------------- + Index Only Scan using scan_table_pkey on scan_table + Index Cond: (a = 1) + Supplied Plan Advice: + INDEX_ONLY_SCAN(scan_table bogus) /* matched, inapplicable, failed */ + Generated Plan Advice: + INDEX_ONLY_SCAN(scan_table public.scan_table_pkey) + NO_GATHER(scan_table) +(7 rows) + +COMMIT; +-- Test our ability to refer to multiple instances of the same alias. 
+BEGIN; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (generate_series(1,10) g LEFT JOIN scan_table s ON g = s.a) x + LEFT JOIN scan_table s ON g = s.a; + QUERY PLAN +------------------------------------------------------------------- + Nested Loop Left Join + -> Nested Loop Left Join + -> Function Scan on generate_series g + -> Index Scan using scan_table_pkey on scan_table s + Index Cond: (a = g.g) + -> Index Scan using scan_table_pkey on scan_table s_1 + Index Cond: (a = g.g) + Generated Plan Advice: + JOIN_ORDER(g s s#2) + NESTED_LOOP_PLAIN(s s#2) + INDEX_SCAN(s public.scan_table_pkey s#2 public.scan_table_pkey) + NO_GATHER(s s#2) +(12 rows) + +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (generate_series(1,10) g LEFT JOIN scan_table s ON g = s.a) x + LEFT JOIN scan_table s ON g = s.a; + QUERY PLAN +---------------------------------------------------------- + Nested Loop Left Join + -> Hash Left Join + Hash Cond: (g.g = s.a) + -> Function Scan on generate_series g + -> Hash + -> Seq Scan on scan_table s + -> Index Scan using scan_table_pkey on scan_table s_1 + Index Cond: (a = g.g) + Supplied Plan Advice: + SEQ_SCAN(s) /* matched */ + Generated Plan Advice: + JOIN_ORDER(g s s#2) + NESTED_LOOP_PLAIN(s#2) + HASH_JOIN(s) + SEQ_SCAN(s) + INDEX_SCAN(s#2 public.scan_table_pkey) + NO_GATHER(s s#2) +(17 rows) + +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s#2)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (generate_series(1,10) g LEFT JOIN scan_table s ON g = s.a) x + LEFT JOIN scan_table s ON g = s.a; + QUERY PLAN +-------------------------------------------------------------- + Hash Left Join + Hash Cond: (g.g = s_1.a) + -> Nested Loop Left Join + -> Function Scan on generate_series g + -> Index Scan using scan_table_pkey on scan_table s + Index Cond: (a = g.g) + -> Hash + -> Seq Scan on scan_table s_1 + Supplied Plan Advice: + SEQ_SCAN(s#2) /* matched */ + Generated Plan Advice: + JOIN_ORDER(g s s#2) + 
NESTED_LOOP_PLAIN(s) + HASH_JOIN(s#2) + SEQ_SCAN(s#2) + INDEX_SCAN(s public.scan_table_pkey) + NO_GATHER(s s#2) +(17 rows) + +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s) SEQ_SCAN(s#2)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (generate_series(1,10) g LEFT JOIN scan_table s ON g = s.a) x + LEFT JOIN scan_table s ON g = s.a; + QUERY PLAN +------------------------------------------------ + Hash Left Join + Hash Cond: (g.g = s_1.a) + -> Hash Left Join + Hash Cond: (g.g = s.a) + -> Function Scan on generate_series g + -> Hash + -> Seq Scan on scan_table s + -> Hash + -> Seq Scan on scan_table s_1 + Supplied Plan Advice: + SEQ_SCAN(s) /* matched */ + SEQ_SCAN(s#2) /* matched */ + Generated Plan Advice: + JOIN_ORDER(g s s#2) + HASH_JOIN(s s#2) + SEQ_SCAN(s s#2) + NO_GATHER(s s#2) +(17 rows) + +COMMIT; +-- Test our ability to refer to scans within a subquery. +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0) x; + QUERY PLAN +-------------------------------------------------- + Index Scan using scan_table_pkey on scan_table s + Index Cond: (a = 1) + Generated Plan Advice: + INDEX_SCAN(s@x public.scan_table_pkey) + NO_GATHER(s@x) +(5 rows) + +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0); + QUERY PLAN +--------------------------------------------------------- + Index Scan using scan_table_pkey on scan_table s + Index Cond: (a = 1) + Generated Plan Advice: + INDEX_SCAN(s@unnamed_subquery public.scan_table_pkey) + NO_GATHER(s@unnamed_subquery) +(5 rows) + +BEGIN; +-- Should not match. 
+SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0) x; + QUERY PLAN +-------------------------------------------------- + Index Scan using scan_table_pkey on scan_table s + Index Cond: (a = 1) + Supplied Plan Advice: + SEQ_SCAN(s) /* not matched */ + Generated Plan Advice: + INDEX_SCAN(s@x public.scan_table_pkey) + NO_GATHER(s@x) +(7 rows) + +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0); + QUERY PLAN +--------------------------------------------------------- + Index Scan using scan_table_pkey on scan_table s + Index Cond: (a = 1) + Supplied Plan Advice: + SEQ_SCAN(s) /* not matched */ + Generated Plan Advice: + INDEX_SCAN(s@unnamed_subquery public.scan_table_pkey) + NO_GATHER(s@unnamed_subquery) +(7 rows) + +-- Should match first query only. +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s@x)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0) x; + QUERY PLAN +------------------------------- + Seq Scan on scan_table s + Filter: (a = 1) + Supplied Plan Advice: + SEQ_SCAN(s@x) /* matched */ + Generated Plan Advice: + SEQ_SCAN(s@x) + NO_GATHER(s@x) +(7 rows) + +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0); + QUERY PLAN +--------------------------------------------------------- + Index Scan using scan_table_pkey on scan_table s + Index Cond: (a = 1) + Supplied Plan Advice: + SEQ_SCAN(s@x) /* not matched */ + Generated Plan Advice: + INDEX_SCAN(s@unnamed_subquery public.scan_table_pkey) + NO_GATHER(s@unnamed_subquery) +(7 rows) + +-- Should match second query only. 
+SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s@unnamed_subquery)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0) x; + QUERY PLAN +-------------------------------------------------- + Index Scan using scan_table_pkey on scan_table s + Index Cond: (a = 1) + Supplied Plan Advice: + SEQ_SCAN(s@unnamed_subquery) /* not matched */ + Generated Plan Advice: + INDEX_SCAN(s@x public.scan_table_pkey) + NO_GATHER(s@x) +(7 rows) + +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0); + QUERY PLAN +---------------------------------------------- + Seq Scan on scan_table s + Filter: (a = 1) + Supplied Plan Advice: + SEQ_SCAN(s@unnamed_subquery) /* matched */ + Generated Plan Advice: + SEQ_SCAN(s@unnamed_subquery) + NO_GATHER(s@unnamed_subquery) +(7 rows) + +COMMIT; diff --git a/contrib/pg_plan_advice/expected/semijoin.out b/contrib/pg_plan_advice/expected/semijoin.out new file mode 100644 index 00000000000..6f203c5a68e --- /dev/null +++ b/contrib/pg_plan_advice/expected/semijoin.out @@ -0,0 +1,377 @@ +LOAD 'pg_plan_advice'; +SET max_parallel_workers_per_gather = 0; +CREATE TABLE sj_wide ( + id integer primary key, + val1 integer, + padding text storage plain +) WITH (autovacuum_enabled = false); +INSERT INTO sj_wide + SELECT g, g%10+1, repeat(' ', 300) FROM generate_series(1, 1000) g; +CREATE INDEX ON sj_wide (val1); +VACUUM ANALYZE sj_wide; +CREATE TABLE sj_narrow ( + id integer primary key, + val1 integer +) WITH (autovacuum_enabled = false); +INSERT INTO sj_narrow + SELECT g, g%10+1 FROM generate_series(1, 1000) g; +CREATE INDEX ON sj_narrow (val1); +VACUUM ANALYZE sj_narrow; +-- We expect this to make the VALUES list unique and use index lookups to +-- find the rows in sj_wide, so as to avoid a full scan of sj_wide. 
+EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM sj_wide + WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)); + QUERY PLAN +----------------------------------------------------------- + Nested Loop + -> HashAggregate + Group Key: "*VALUES*".column1, "*VALUES*".column2 + -> Values Scan on "*VALUES*" + -> Index Scan using sj_wide_pkey on sj_wide + Index Cond: (id = "*VALUES*".column1) + Filter: (val1 = "*VALUES*".column2) + Generated Plan Advice: + JOIN_ORDER("*VALUES*" sj_wide) + NESTED_LOOP_PLAIN(sj_wide) + INDEX_SCAN(sj_wide public.sj_wide_pkey) + SEMIJOIN_UNIQUE("*VALUES*") + NO_GATHER(sj_wide) +(13 rows) + +-- If we ask for a unique semijoin, we should get the same plan as with +-- no advice. If we ask for a non-unique semijoin, we should see a Semi +-- Join operation in the plan tree. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'semijoin_unique("*VALUES*")'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM sj_wide + WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)); + QUERY PLAN +----------------------------------------------------------- + Nested Loop + -> HashAggregate + Group Key: "*VALUES*".column1, "*VALUES*".column2 + -> Values Scan on "*VALUES*" + -> Index Scan using sj_wide_pkey on sj_wide + Index Cond: (id = "*VALUES*".column1) + Filter: (val1 = "*VALUES*".column2) + Supplied Plan Advice: + SEMIJOIN_UNIQUE("*VALUES*") /* matched */ + Generated Plan Advice: + JOIN_ORDER("*VALUES*" sj_wide) + NESTED_LOOP_PLAIN(sj_wide) + INDEX_SCAN(sj_wide public.sj_wide_pkey) + SEMIJOIN_UNIQUE("*VALUES*") + NO_GATHER(sj_wide) +(15 rows) + +SET LOCAL pg_plan_advice.advice = 'semijoin_non_unique("*VALUES*")'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM sj_wide + WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)); + QUERY PLAN +------------------------------------------------------------------------------------------ + Hash Semi Join + Hash Cond: ((sj_wide.id = "*VALUES*".column1) AND (sj_wide.val1 = 
"*VALUES*".column2)) + -> Seq Scan on sj_wide + -> Hash + -> Values Scan on "*VALUES*" + Supplied Plan Advice: + SEMIJOIN_NON_UNIQUE("*VALUES*") /* matched */ + Generated Plan Advice: + JOIN_ORDER(sj_wide "*VALUES*") + HASH_JOIN("*VALUES*") + SEQ_SCAN(sj_wide) + SEMIJOIN_NON_UNIQUE("*VALUES*") + NO_GATHER(sj_wide) +(13 rows) + +COMMIT; +-- Because this table is narrower than the previous one, a sequential scan +-- is less expensive, and we choose a straightforward Semi Join plan by +-- default. (Note that this is also very sensitive to the length of the IN +-- list, which affects how many index lookups the alternative plan will need.) +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM sj_narrow + WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)); + QUERY PLAN +---------------------------------------------------------------------------------------------- + Hash Semi Join + Hash Cond: ((sj_narrow.id = "*VALUES*".column1) AND (sj_narrow.val1 = "*VALUES*".column2)) + -> Seq Scan on sj_narrow + -> Hash + -> Values Scan on "*VALUES*" + Generated Plan Advice: + JOIN_ORDER(sj_narrow "*VALUES*") + HASH_JOIN("*VALUES*") + SEQ_SCAN(sj_narrow) + SEMIJOIN_NON_UNIQUE("*VALUES*") + NO_GATHER(sj_narrow) +(11 rows) + +-- Here, we expect advising a unique semijoin to switch to the same plan that +-- we got with sj_wide, and advising a non-unique semijoin should not change +-- the plan. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'semijoin_unique("*VALUES*")'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM sj_narrow + WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)); + QUERY PLAN +---------------------------------------------------------------------------------------------- + Hash Join + Hash Cond: ((sj_narrow.id = "*VALUES*".column1) AND (sj_narrow.val1 = "*VALUES*".column2)) + -> Seq Scan on sj_narrow + -> Hash + -> HashAggregate + Group Key: "*VALUES*".column1, "*VALUES*".column2 + -> Values Scan on "*VALUES*" + Supplied Plan Advice: + SEMIJOIN_UNIQUE("*VALUES*") /* matched */ + Generated Plan Advice: + JOIN_ORDER(sj_narrow "*VALUES*") + HASH_JOIN("*VALUES*") + SEQ_SCAN(sj_narrow) + SEMIJOIN_UNIQUE("*VALUES*") + NO_GATHER(sj_narrow) +(15 rows) + +SET LOCAL pg_plan_advice.advice = 'semijoin_non_unique("*VALUES*")'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM sj_narrow + WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)); + QUERY PLAN +---------------------------------------------------------------------------------------------- + Hash Semi Join + Hash Cond: ((sj_narrow.id = "*VALUES*".column1) AND (sj_narrow.val1 = "*VALUES*".column2)) + -> Seq Scan on sj_narrow + -> Hash + -> Values Scan on "*VALUES*" + Supplied Plan Advice: + SEMIJOIN_NON_UNIQUE("*VALUES*") /* matched */ + Generated Plan Advice: + JOIN_ORDER(sj_narrow "*VALUES*") + HASH_JOIN("*VALUES*") + SEQ_SCAN(sj_narrow) + SEMIJOIN_NON_UNIQUE("*VALUES*") + NO_GATHER(sj_narrow) +(13 rows) + +COMMIT; +-- In the above example, we made the outer side of the join unique, but here, +-- we should make the inner side unique. 
+EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g + WHERE g in (select val1 from sj_narrow); + QUERY PLAN +------------------------------------------ + Hash Join + Hash Cond: (g.g = sj_narrow.val1) + -> Function Scan on generate_series g + -> Hash + -> HashAggregate + Group Key: sj_narrow.val1 + -> Seq Scan on sj_narrow + Generated Plan Advice: + JOIN_ORDER(g sj_narrow) + HASH_JOIN(sj_narrow) + SEQ_SCAN(sj_narrow) + SEMIJOIN_UNIQUE(sj_narrow) + NO_GATHER(sj_narrow) +(13 rows) + +-- We should be able to force a plan with or without the make-unique strategy, +-- with either side as the driving table. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'semijoin_unique(sj_narrow)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g + WHERE g in (select val1 from sj_narrow); + QUERY PLAN +-------------------------------------------- + Hash Join + Hash Cond: (g.g = sj_narrow.val1) + -> Function Scan on generate_series g + -> Hash + -> HashAggregate + Group Key: sj_narrow.val1 + -> Seq Scan on sj_narrow + Supplied Plan Advice: + SEMIJOIN_UNIQUE(sj_narrow) /* matched */ + Generated Plan Advice: + JOIN_ORDER(g sj_narrow) + HASH_JOIN(sj_narrow) + SEQ_SCAN(sj_narrow) + SEMIJOIN_UNIQUE(sj_narrow) + NO_GATHER(sj_narrow) +(15 rows) + +SET LOCAL pg_plan_advice.advice = 'semijoin_non_unique(sj_narrow)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g + WHERE g in (select val1 from sj_narrow); + QUERY PLAN +------------------------------------------------ + Hash Semi Join + Hash Cond: (g.g = sj_narrow.val1) + -> Function Scan on generate_series g + -> Hash + -> Seq Scan on sj_narrow + Supplied Plan Advice: + SEMIJOIN_NON_UNIQUE(sj_narrow) /* matched */ + Generated Plan Advice: + JOIN_ORDER(g sj_narrow) + HASH_JOIN(sj_narrow) + SEQ_SCAN(sj_narrow) + SEMIJOIN_NON_UNIQUE(sj_narrow) + NO_GATHER(sj_narrow) +(13 rows) + +SET LOCAL pg_plan_advice.advice = 'semijoin_unique(sj_narrow) join_order(sj_narrow)'; +EXPLAIN (COSTS 
OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g + WHERE g in (select val1 from sj_narrow); + QUERY PLAN +------------------------------------------------ + Hash Join + Hash Cond: (sj_narrow.val1 = g.g) + -> HashAggregate + Group Key: sj_narrow.val1 + -> Seq Scan on sj_narrow + -> Hash + -> Function Scan on generate_series g + Supplied Plan Advice: + SEMIJOIN_UNIQUE(sj_narrow) /* matched */ + JOIN_ORDER(sj_narrow) /* matched */ + Generated Plan Advice: + JOIN_ORDER(sj_narrow g) + HASH_JOIN(g) + SEQ_SCAN(sj_narrow) + SEMIJOIN_UNIQUE(sj_narrow) + NO_GATHER(sj_narrow) +(16 rows) + +SET LOCAL pg_plan_advice.advice = 'semijoin_non_unique(sj_narrow) join_order(sj_narrow)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g + WHERE g in (select val1 from sj_narrow); + QUERY PLAN +------------------------------------------------ + Hash Right Semi Join + Hash Cond: (sj_narrow.val1 = g.g) + -> Seq Scan on sj_narrow + -> Hash + -> Function Scan on generate_series g + Supplied Plan Advice: + SEMIJOIN_NON_UNIQUE(sj_narrow) /* matched */ + JOIN_ORDER(sj_narrow) /* matched */ + Generated Plan Advice: + JOIN_ORDER(sj_narrow g) + HASH_JOIN(g) + SEQ_SCAN(sj_narrow) + SEMIJOIN_NON_UNIQUE(sj_narrow) + NO_GATHER(sj_narrow) +(14 rows) + +COMMIT; +-- However, mentioning the wrong side of the join should result in an advice +-- failure. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'semijoin_unique(g)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g + WHERE g in (select val1 from sj_narrow); + QUERY PLAN +-------------------------------------------- + Nested Loop + Disabled: true + Join Filter: (g.g = sj_narrow.val1) + -> HashAggregate + Group Key: sj_narrow.val1 + -> Seq Scan on sj_narrow + -> Function Scan on generate_series g + Supplied Plan Advice: + SEMIJOIN_UNIQUE(g) /* matched, failed */ + Generated Plan Advice: + JOIN_ORDER(sj_narrow g) + NESTED_LOOP_PLAIN(g) + SEQ_SCAN(sj_narrow) + SEMIJOIN_UNIQUE(sj_narrow) + NO_GATHER(sj_narrow) +(15 rows) + +SET LOCAL pg_plan_advice.advice = 'semijoin_non_unique(g)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g + WHERE g in (select val1 from sj_narrow); + QUERY PLAN +------------------------------------------------ + Nested Loop + Disabled: true + Join Filter: (g.g = sj_narrow.val1) + -> HashAggregate + Group Key: sj_narrow.val1 + -> Seq Scan on sj_narrow + -> Function Scan on generate_series g + Supplied Plan Advice: + SEMIJOIN_NON_UNIQUE(g) /* matched, failed */ + Generated Plan Advice: + JOIN_ORDER(sj_narrow g) + NESTED_LOOP_PLAIN(g) + SEQ_SCAN(sj_narrow) + SEMIJOIN_UNIQUE(sj_narrow) + NO_GATHER(sj_narrow) +(15 rows) + +COMMIT; +-- Test conflicting advice. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'semijoin_unique(sj_narrow) semijoin_non_unique(sj_narrow)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g + WHERE g in (select val1 from sj_narrow); + QUERY PLAN +--------------------------------------------------------------------- + Hash Join + Hash Cond: (g.g = sj_narrow.val1) + -> Function Scan on generate_series g + -> Hash + -> HashAggregate + Group Key: sj_narrow.val1 + -> Seq Scan on sj_narrow + Supplied Plan Advice: + SEMIJOIN_UNIQUE(sj_narrow) /* matched, conflicting */ + SEMIJOIN_NON_UNIQUE(sj_narrow) /* matched, conflicting, failed */ + Generated Plan Advice: + JOIN_ORDER(g sj_narrow) + HASH_JOIN(sj_narrow) + SEQ_SCAN(sj_narrow) + SEMIJOIN_UNIQUE(sj_narrow) + NO_GATHER(sj_narrow) +(16 rows) + +COMMIT; +-- Try applying SEMIJOIN_UNIQUE() to a non-semijoin. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'semijoin_unique(g)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g, sj_narrow s WHERE g = s.val1; + QUERY PLAN +---------------------------------------------------------- + Merge Join + Merge Cond: (s.val1 = g.g) + -> Index Scan using sj_narrow_val1_idx on sj_narrow s + -> Sort + Sort Key: g.g + -> Function Scan on generate_series g + Supplied Plan Advice: + SEMIJOIN_UNIQUE(g) /* matched, inapplicable, failed */ + Generated Plan Advice: + JOIN_ORDER(s g) + MERGE_JOIN_PLAIN(g) + INDEX_SCAN(s public.sj_narrow_val1_idx) + NO_GATHER(s) +(13 rows) + +COMMIT; diff --git a/contrib/pg_plan_advice/expected/syntax.out b/contrib/pg_plan_advice/expected/syntax.out new file mode 100644 index 00000000000..be61402b569 --- /dev/null +++ b/contrib/pg_plan_advice/expected/syntax.out @@ -0,0 +1,192 @@ +LOAD 'pg_plan_advice'; +-- An empty string is allowed. Empty target lists are allowed for most advice +-- tags, but not for JOIN_ORDER. "Supplied Plan Advice" should be omitted in +-- text format when there is no actual advice, but not in non-text format. 
+SET pg_plan_advice.advice = ''; +EXPLAIN (COSTS OFF) SELECT 1; + QUERY PLAN +------------ + Result +(1 row) + +SET pg_plan_advice.advice = 'SEQ_SCAN()'; +EXPLAIN (COSTS OFF) SELECT 1; + QUERY PLAN +------------ + Result +(1 row) + +SET pg_plan_advice.advice = 'NESTED_LOOP_PLAIN()'; +EXPLAIN (COSTS OFF, FORMAT JSON) SELECT 1; + QUERY PLAN +-------------------------------- + [ + + { + + "Plan": { + + "Node Type": "Result", + + "Parallel Aware": false,+ + "Async Capable": false, + + "Disabled": false + + }, + + "Supplied Plan Advice": ""+ + } + + ] +(1 row) + +SET pg_plan_advice.advice = 'JOIN_ORDER()'; +ERROR: invalid value for parameter "pg_plan_advice.advice": "JOIN_ORDER()" +DETAIL: Could not parse advice: JOIN_ORDER must have at least one target at or near ")" +-- Test assorted variations in capitalization, whitespace, and which parts of +-- the relation identifier are included. These should all work. +SET pg_plan_advice.advice = 'SEQ_SCAN(x)'; +EXPLAIN (COSTS OFF) SELECT 1; + QUERY PLAN +--------------------------------- + Result + Supplied Plan Advice: + SEQ_SCAN(x) /* not matched */ +(3 rows) + +SET pg_plan_advice.advice = 'seq_scan(x@y)'; +EXPLAIN (COSTS OFF) SELECT 1; + QUERY PLAN +----------------------------------- + Result + Supplied Plan Advice: + SEQ_SCAN(x@y) /* not matched */ +(3 rows) + +SET pg_plan_advice.advice = 'SEQ_scan(x#2)'; +EXPLAIN (COSTS OFF) SELECT 1; + QUERY PLAN +----------------------------------- + Result + Supplied Plan Advice: + SEQ_SCAN(x#2) /* not matched */ +(3 rows) + +SET pg_plan_advice.advice = 'SEQ_SCAN (x/y)'; +EXPLAIN (COSTS OFF) SELECT 1; + QUERY PLAN +----------------------------------- + Result + Supplied Plan Advice: + SEQ_SCAN(x/y) /* not matched */ +(3 rows) + +SET pg_plan_advice.advice = ' SEQ_SCAN ( x / y . 
z ) '; +EXPLAIN (COSTS OFF) SELECT 1; + QUERY PLAN +------------------------------------- + Result + Supplied Plan Advice: + SEQ_SCAN(x/y.z) /* not matched */ +(3 rows) + +SET pg_plan_advice.advice = 'SEQ_SCAN("x"#2/"y"."z"@"t")'; +EXPLAIN (COSTS OFF) SELECT 1; + QUERY PLAN +----------------------------------------- + Result + Supplied Plan Advice: + SEQ_SCAN(x#2/y.z@t) /* not matched */ +(3 rows) + +-- Syntax errors. +SET pg_plan_advice.advice = 'SEQUENTIAL_SCAN(x)'; +ERROR: invalid value for parameter "pg_plan_advice.advice": "SEQUENTIAL_SCAN(x)" +DETAIL: Could not parse advice: syntax error at or near "SEQUENTIAL_SCAN" +SET pg_plan_advice.advice = 'SEQ_SCAN'; +ERROR: invalid value for parameter "pg_plan_advice.advice": "SEQ_SCAN" +DETAIL: Could not parse advice: syntax error at end of input +SET pg_plan_advice.advice = 'SEQ_SCAN('; +ERROR: invalid value for parameter "pg_plan_advice.advice": "SEQ_SCAN(" +DETAIL: Could not parse advice: syntax error at end of input +SET pg_plan_advice.advice = 'SEQ_SCAN("'; +ERROR: invalid value for parameter "pg_plan_advice.advice": "SEQ_SCAN("" +DETAIL: Could not parse advice: unterminated quoted identifier at end of input +SET pg_plan_advice.advice = 'SEQ_SCAN("")'; +ERROR: invalid value for parameter "pg_plan_advice.advice": "SEQ_SCAN("")" +DETAIL: Could not parse advice: zero-length delimited identifier at or near """ +SET pg_plan_advice.advice = 'SEQ_SCAN("a"'; +ERROR: invalid value for parameter "pg_plan_advice.advice": "SEQ_SCAN("a"" +DETAIL: Could not parse advice: syntax error at end of input +SET pg_plan_advice.advice = 'SEQ_SCAN(#'; +ERROR: invalid value for parameter "pg_plan_advice.advice": "SEQ_SCAN(#" +DETAIL: Could not parse advice: syntax error at or near "#" +SET pg_plan_advice.advice = '()'; +ERROR: invalid value for parameter "pg_plan_advice.advice": "()" +DETAIL: Could not parse advice: syntax error at or near "(" +SET pg_plan_advice.advice = '123'; +ERROR: invalid value for parameter 
"pg_plan_advice.advice": "123" +DETAIL: Could not parse advice: syntax error at or near "123" +-- Tags like SEQ_SCAN and NO_GATHER don't allow sublists at all; other tags, +-- except for JOIN_ORDER, allow at most one level of sublist. Hence, these +-- examples should error out. +SET pg_plan_advice.advice = 'SEQ_SCAN((x))'; +ERROR: invalid value for parameter "pg_plan_advice.advice": "SEQ_SCAN((x))" +DETAIL: Could not parse advice: syntax error at or near "(" +SET pg_plan_advice.advice = 'GATHER(((x)))'; +ERROR: invalid value for parameter "pg_plan_advice.advice": "GATHER(((x)))" +DETAIL: Could not parse advice: syntax error at or near "(" +-- Legal comments. +SET pg_plan_advice.advice = '/**/'; +EXPLAIN (COSTS OFF) SELECT 1; + QUERY PLAN +------------ + Result +(1 row) + +SET pg_plan_advice.advice = 'HASH_JOIN(_)/***/'; +EXPLAIN (COSTS OFF) SELECT 1; + QUERY PLAN +---------------------------------- + Result + Supplied Plan Advice: + HASH_JOIN(_) /* not matched */ +(3 rows) + +SET pg_plan_advice.advice = '/* comment */ HASH_JOIN(/*x*/y)'; +EXPLAIN (COSTS OFF) SELECT 1; + QUERY PLAN +---------------------------------- + Result + Supplied Plan Advice: + HASH_JOIN(y) /* not matched */ +(3 rows) + +SET pg_plan_advice.advice = '/* comment */ HASH_JOIN(y//*x*/z)'; +EXPLAIN (COSTS OFF) SELECT 1; + QUERY PLAN +------------------------------------ + Result + Supplied Plan Advice: + HASH_JOIN(y/z) /* not matched */ +(3 rows) + +-- Unterminated comments. +SET pg_plan_advice.advice = '/*'; +ERROR: invalid value for parameter "pg_plan_advice.advice": "/*" +DETAIL: Could not parse advice: unterminated comment at end of input +SET pg_plan_advice.advice = 'JOIN_ORDER("fOO") /* oops'; +ERROR: invalid value for parameter "pg_plan_advice.advice": "JOIN_ORDER("fOO") /* oops" +DETAIL: Could not parse advice: unterminated comment at end of input +-- Nested comments are not supported, so the first of these is legal and +-- the second is not. 
+SET pg_plan_advice.advice = '/*/*/'; +EXPLAIN (COSTS OFF) SELECT 1; + QUERY PLAN +------------ + Result +(1 row) + +SET pg_plan_advice.advice = '/*/* stuff */*/'; +ERROR: invalid value for parameter "pg_plan_advice.advice": "/*/* stuff */*/" +DETAIL: Could not parse advice: syntax error at or near "*" +-- Foreign join requires multiple relation identifiers. +SET pg_plan_advice.advice = 'FOREIGN_JOIN(a)'; +ERROR: invalid value for parameter "pg_plan_advice.advice": "FOREIGN_JOIN(a)" +DETAIL: Could not parse advice: FOREIGN_JOIN targets must contain more than one relation identifier at or near ")" +SET pg_plan_advice.advice = 'FOREIGN_JOIN((a))'; +ERROR: invalid value for parameter "pg_plan_advice.advice": "FOREIGN_JOIN((a))" +DETAIL: Could not parse advice: FOREIGN_JOIN targets must contain more than one relation identifier at or near ")" diff --git a/contrib/pg_plan_advice/meson.build b/contrib/pg_plan_advice/meson.build new file mode 100644 index 00000000000..f7229dddcef --- /dev/null +++ b/contrib/pg_plan_advice/meson.build @@ -0,0 +1,79 @@ +# Copyright (c) 2022-2024, PostgreSQL Global Development Group + +pg_plan_advice_sources = files( + 'pg_plan_advice.c', + 'pgpa_ast.c', + 'pgpa_collector.c', + 'pgpa_identifier.c', + 'pgpa_join.c', + 'pgpa_output.c', + 'pgpa_planner.c', + 'pgpa_scan.c', + 'pgpa_trove.c', + 'pgpa_walker.c', +) + +pgpa_scanner = custom_target('pgpa_scanner', + input: 'pgpa_scanner.l', + output: 'pgpa_scanner.c', + command: flex_cmd, +) +generated_sources += pgpa_scanner +pg_plan_advice_sources += pgpa_scanner + +pgpa_parser = custom_target('pgpa_parser', + input: 'pgpa_parser.y', + kwargs: bison_kw, +) +generated_sources += pgpa_parser.to_list() +pg_plan_advice_sources += pgpa_parser + +if host_system == 'windows' + pg_plan_advice_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'pg_plan_advice', + '--FILEDESC', 'pg_plan_advice - help the planner get the right plan',]) +endif + +pg_plan_advice_inc = include_directories('.') 
+ +pg_plan_advice = shared_module('pg_plan_advice', + pg_plan_advice_sources, + include_directories: pg_plan_advice_inc, + kwargs: contrib_mod_args, +) +contrib_targets += pg_plan_advice + +install_data( + 'pg_plan_advice--1.0.sql', + 'pg_plan_advice.control', + kwargs: contrib_data_args, +) + +install_headers( + 'pg_plan_advice.h', + install_dir: dir_include_extension / 'pg_plan_advice', +) + +tests += { + 'name': 'pg_plan_advice', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'gather', + 'join_order', + 'join_strategy', + 'local_collector', + 'partitionwise', + 'prepared', + 'scan', + 'semijoin', + 'syntax', + ], + }, + 'tap': { + 'tests': [ + 't/001_regress.pl', + ], + }, +} diff --git a/contrib/pg_plan_advice/pg_plan_advice--1.0.sql b/contrib/pg_plan_advice/pg_plan_advice--1.0.sql new file mode 100644 index 00000000000..450c42040fd --- /dev/null +++ b/contrib/pg_plan_advice/pg_plan_advice--1.0.sql @@ -0,0 +1,43 @@ +/* contrib/pg_plan_advice/pg_plan_advice--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION pg_plan_advice" to load this file. 
\quit + +CREATE FUNCTION pg_clear_collected_local_advice() +RETURNS void +AS 'MODULE_PATHNAME', 'pg_clear_collected_local_advice' +LANGUAGE C STRICT; + +CREATE FUNCTION pg_clear_collected_shared_advice() +RETURNS void +AS 'MODULE_PATHNAME', 'pg_clear_collected_shared_advice' +LANGUAGE C STRICT; + +CREATE FUNCTION pg_get_collected_local_advice( + OUT id bigint, + OUT userid oid, + OUT dbid oid, + OUT queryid bigint, + OUT collection_time timestamptz, + OUT query text, + OUT advice text +) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'pg_get_collected_local_advice' +LANGUAGE C STRICT; + +CREATE FUNCTION pg_get_collected_shared_advice( + OUT id bigint, + OUT userid oid, + OUT dbid oid, + OUT queryid bigint, + OUT collection_time timestamptz, + OUT query text, + OUT advice text +) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'pg_get_collected_shared_advice' +LANGUAGE C STRICT; + +REVOKE ALL ON FUNCTION pg_clear_collected_shared_advice() FROM PUBLIC; +REVOKE ALL ON FUNCTION pg_get_collected_shared_advice() FROM PUBLIC; diff --git a/contrib/pg_plan_advice/pg_plan_advice.c b/contrib/pg_plan_advice/pg_plan_advice.c new file mode 100644 index 00000000000..99b97843991 --- /dev/null +++ b/contrib/pg_plan_advice/pg_plan_advice.c @@ -0,0 +1,563 @@ +/*------------------------------------------------------------------------- + * + * pg_plan_advice.c + * main entrypoints for generating and applying planner advice + * + * Copyright (c) 2016-2024, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pg_plan_advice.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pg_plan_advice.h" +#include "pgpa_ast.h" +#include "pgpa_collector.h" +#include "pgpa_identifier.h" +#include "pgpa_output.h" +#include "pgpa_planner.h" +#include "pgpa_trove.h" +#include "pgpa_walker.h" + +#include "commands/defrem.h" +#include "commands/explain.h" +#include "commands/explain_format.h" +#include 
"commands/explain_state.h" +#include "funcapi.h" +#include "optimizer/planner.h" +#include "storage/dsm_registry.h" +#include "utils/guc.h" + +PG_MODULE_MAGIC; + +static pgpa_shared_state *pgpa_state = NULL; +static dsa_area *pgpa_dsa_area = NULL; +static List *advisor_hook_list = NIL; + +/* GUC variables */ +char *pg_plan_advice_advice = NULL; +bool pg_plan_advice_always_store_advice_details = false; +static bool pg_plan_advice_always_explain_supplied_advice = true; +bool pg_plan_advice_feedback_warnings = false; +bool pg_plan_advice_local_collector = false; +int pg_plan_advice_local_collection_limit = 0; +bool pg_plan_advice_shared_collector = false; +int pg_plan_advice_shared_collection_limit = 0; +bool pg_plan_advice_trace_mask = false; + +/* Saved hook value */ +static explain_per_plan_hook_type prev_explain_per_plan = NULL; + +/* Other file-level globals */ +static int es_extension_id; +static MemoryContext pgpa_memory_context = NULL; + +static void pgpa_init_shared_state(void *ptr, void *arg); +static void pg_plan_advice_explain_option_handler(ExplainState *es, + DefElem *opt, + ParseState *pstate); +static void pg_plan_advice_explain_per_plan_hook(PlannedStmt *plannedstmt, + IntoClause *into, + ExplainState *es, + const char *queryString, + ParamListInfo params, + QueryEnvironment *queryEnv); +static bool pg_plan_advice_advice_check_hook(char **newval, void **extra, + GucSource source); +static DefElem *find_defelem_by_defname(List *deflist, char *defname); + +/* + * Initialize this module. 
+ */ +void +_PG_init(void) +{ + DefineCustomStringVariable("pg_plan_advice.advice", + "advice to apply during query planning", + NULL, + &pg_plan_advice_advice, + NULL, + PGC_USERSET, + 0, + pg_plan_advice_advice_check_hook, + NULL, + NULL); + + DefineCustomBoolVariable("pg_plan_advice.always_explain_supplied_advice", + "EXPLAIN output includes supplied advice even without EXPLAIN (PLAN_ADVICE)", + NULL, + &pg_plan_advice_always_explain_supplied_advice, + true, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable("pg_plan_advice.always_store_advice_details", + "Generate advice strings even when seemingly not required", + "Use this option to see generated advice for prepared queries.", + &pg_plan_advice_always_store_advice_details, + false, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable("pg_plan_advice.feedback_warnings", + "Warn when supplied advice does not apply cleanly", + NULL, + &pg_plan_advice_feedback_warnings, + false, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable("pg_plan_advice.local_collector", + "Enable the local advice collector.", + NULL, + &pg_plan_advice_local_collector, + false, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomIntVariable("pg_plan_advice.local_collection_limit", + "# of advice entries to retain in per-backend memory", + NULL, + &pg_plan_advice_local_collection_limit, + 0, + 0, INT_MAX, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable("pg_plan_advice.shared_collector", + "Enable the shared advice collector.", + NULL, + &pg_plan_advice_shared_collector, + false, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomIntVariable("pg_plan_advice.shared_collection_limit", + "# of advice entries to retain in shared memory", + NULL, + &pg_plan_advice_shared_collection_limit, + 0, + 0, INT_MAX, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable("pg_plan_advice.trace_mask", + "Emit debugging messages 
showing the computed strategy mask for each relation", + NULL, + &pg_plan_advice_trace_mask, + false, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + MarkGUCPrefixReserved("pg_plan_advice"); + + /* Get an ID that we can use to cache data in an ExplainState. */ + es_extension_id = GetExplainExtensionId("pg_plan_advice"); + + /* Register the new EXPLAIN options implemented by this module. */ + RegisterExtensionExplainOption("plan_advice", + pg_plan_advice_explain_option_handler); + + /* Install hooks */ + pgpa_planner_install_hooks(); + prev_explain_per_plan = explain_per_plan_hook; + explain_per_plan_hook = pg_plan_advice_explain_per_plan_hook; +} + +/* + * Initialize shared state when first created. + */ +static void +pgpa_init_shared_state(void *ptr, void *arg) +{ + pgpa_shared_state *state = (pgpa_shared_state *) ptr; + + LWLockInitialize(&state->lock, LWLockNewTrancheId("pg_plan_advice_lock")); + state->dsa_tranche = LWLockNewTrancheId("pg_plan_advice_dsa"); + state->area = DSA_HANDLE_INVALID; + state->shared_collector = InvalidDsaPointer; +} + +/* + * Return a pointer to a memory context where long-lived data managed by this + * module can be stored. + */ +MemoryContext +pg_plan_advice_get_mcxt(void) +{ + if (pgpa_memory_context == NULL) + pgpa_memory_context = AllocSetContextCreate(TopMemoryContext, + "pg_plan_advice", + ALLOCSET_DEFAULT_SIZES); + + return pgpa_memory_context; +} + +/* + * Get a pointer to our shared state. + * + * If no shared state exists, create and initialize it. If it does exist but + * this backend has not yet accessed it, attach to it. Otherwise, just return + * our cached pointer. + * + * Along the way, make sure the relevant LWLock tranches are registered. 
+ */ +pgpa_shared_state * +pg_plan_advice_attach(void) +{ + if (pgpa_state == NULL) + { + bool found; + + pgpa_state = + GetNamedDSMSegment("pg_plan_advice", sizeof(pgpa_shared_state), + pgpa_init_shared_state, &found, NULL); + } + + return pgpa_state; +} + +/* + * Return a pointer to pg_plan_advice's DSA area, creating it if needed. + */ +dsa_area * +pg_plan_advice_dsa_area(void) +{ + if (pgpa_dsa_area == NULL) + { + pgpa_shared_state *state = pg_plan_advice_attach(); + dsa_handle area_handle; + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(pg_plan_advice_get_mcxt()); + + LWLockAcquire(&state->lock, LW_EXCLUSIVE); + area_handle = state->area; + if (area_handle == DSA_HANDLE_INVALID) + { + pgpa_dsa_area = dsa_create(state->dsa_tranche); + dsa_pin(pgpa_dsa_area); + state->area = dsa_get_handle(pgpa_dsa_area); + LWLockRelease(&state->lock); + } + else + { + LWLockRelease(&state->lock); + pgpa_dsa_area = dsa_attach(area_handle); + } + + dsa_pin_mapping(pgpa_dsa_area); + + MemoryContextSwitchTo(oldcontext); + } + + return pgpa_dsa_area; +} + +/* + * Was the PLAN_ADVICE option specified and not set to false? + */ +bool +pg_plan_advice_should_explain(ExplainState *es) +{ + bool *plan_advice = NULL; + + if (es != NULL) + plan_advice = GetExplainExtensionState(es, es_extension_id); + return plan_advice != NULL && *plan_advice; +} + +/* + * Get the advice that should be used while planning a particular query. + */ +char * +pg_plan_advice_get_supplied_query_advice(PlannerGlobal *glob, + Query *parse, + const char *query_string, + int cursorOptions, + ExplainState *es) +{ + ListCell *lc; + + /* + * If any advisors are loaded, consult them. The first one that produces a + * non-NULL string wins. 
+ */ + foreach(lc, advisor_hook_list) + { + pg_plan_advice_advisor_hook hook = lfirst(lc); + char *advice_string; + + advice_string = (*hook) (glob, parse, query_string, cursorOptions, es); + if (advice_string != NULL) + return advice_string; + } + + /* Otherwise, just use the value of the GUC. */ + return pg_plan_advice_advice; +} + +/* + * Add an advisor, which can supply advice strings to be used during future + * query planning operations. + * + * The advisor should return NULL if it has no advice string to offer for a + * given query. If multiple advisors are added, they will be consulted in the + * order added until one of them returns a non-NULL value. + */ +void +pg_plan_advice_add_advisor(pg_plan_advice_advisor_hook hook) +{ + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(pg_plan_advice_get_mcxt()); + advisor_hook_list = lappend(advisor_hook_list, hook); + MemoryContextSwitchTo(oldcontext); +} + +/* + * Remove an advisor. + */ +void +pg_plan_advice_remove_advisor(pg_plan_advice_advisor_hook hook) +{ + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(pg_plan_advice_get_mcxt()); + advisor_hook_list = list_delete_ptr(advisor_hook_list, hook); + MemoryContextSwitchTo(oldcontext); +} + +/* + * Handler for EXPLAIN (PLAN_ADVICE). + */ +static void +pg_plan_advice_explain_option_handler(ExplainState *es, DefElem *opt, + ParseState *pstate) +{ + bool *plan_advice; + + plan_advice = GetExplainExtensionState(es, es_extension_id); + + if (plan_advice == NULL) + { + plan_advice = palloc0_object(bool); + SetExplainExtensionState(es, es_extension_id, plan_advice); + } + + *plan_advice = defGetBoolean(opt); +} + +/* + * Display a string that is likely to consist of multiple lines in EXPLAIN + * output. + */ +static void +pg_plan_advice_explain_text_multiline(ExplainState *es, char *qlabel, + char *value) +{ + char *s; + + /* For non-text formats, it's best not to add any special handling. 
*/ + if (es->format != EXPLAIN_FORMAT_TEXT) + { + ExplainPropertyText(qlabel, value, es); + return; + } + + /* In text format, if there is no data, display nothing. */ + if (*value == '\0') + return; + + /* + * It looks nicest to indent each line of the advice separately, beginning + * on the line below the label. + */ + ExplainIndentText(es); + appendStringInfo(es->str, "%s:\n", qlabel); + es->indent++; + while ((s = strchr(value, '\n')) != NULL) + { + ExplainIndentText(es); + appendBinaryStringInfo(es->str, value, (s - value) + 1); + value = s + 1; + } + + /* Don't interpret a terminal newline as a request for an empty line. */ + if (*value != '\0') + { + ExplainIndentText(es); + appendStringInfo(es->str, "%s\n", value); + } + + es->indent--; +} + +/* + * Add advice feedback to the EXPLAIN output. + */ +static void +pg_plan_advice_explain_feedback(ExplainState *es, List *feedback) +{ + StringInfoData buf; + + initStringInfo(&buf); + foreach_node(DefElem, item, feedback) + { + int flags = defGetInt32(item); + + appendStringInfo(&buf, "%s /* ", item->defname); + pgpa_trove_append_flags(&buf, flags); + appendStringInfo(&buf, " */\n"); + } + + pg_plan_advice_explain_text_multiline(es, "Supplied Plan Advice", + buf.data); +} + +/* + * Add relevant details, if any, to the EXPLAIN output for a single plan. + */ +static void +pg_plan_advice_explain_per_plan_hook(PlannedStmt *plannedstmt, + IntoClause *into, + ExplainState *es, + const char *queryString, + ParamListInfo params, + QueryEnvironment *queryEnv) +{ + bool should_explain; + DefElem *pgpa_item; + List *pgpa_list; + + if (prev_explain_per_plan) + prev_explain_per_plan(plannedstmt, into, es, queryString, params, + queryEnv); + + /* Should an advice string be part of the EXPLAIN output? */ + should_explain = pg_plan_advice_should_explain(es); + + /* Find any data pgpa_planner_shutdown stashed in the PlannedStmt. 
*/ + pgpa_item = find_defelem_by_defname(plannedstmt->extension_state, + "pg_plan_advice"); + pgpa_list = pgpa_item == NULL ? NULL : (List *) pgpa_item->arg; + + /* + * By default, if there is a record of attempting to apply advice during + * query planning, we always output that information, but the user can set + * pg_plan_advice.always_explain_supplied_advice = false to suppress that + * behavior. If they do, we'll only display it when the PLAN_ADVICE option + * was specified and not set to false. + * + * NB: If we're explaining a query planned beforehand -- i.e. a prepared + * statement -- the application of query advice may not have been + * recorded, and therefore this won't be able to show anything. Use + * pg_plan_advice.always_store_advice_details = true to work around this. + */ + if (pgpa_list != NULL && (pg_plan_advice_always_explain_supplied_advice || + should_explain)) + { + DefElem *feedback; + + feedback = find_defelem_by_defname(pgpa_list, "feedback"); + if (feedback != NULL) + pg_plan_advice_explain_feedback(es, (List *) feedback->arg); + } + + /* + * If the PLAN_ADVICE option was specified -- and not set to FALSE -- + * show generated advice. 
+ */ + if (should_explain) + { + DefElem *advice_string_item; + char *advice_string = NULL; + + advice_string_item = + find_defelem_by_defname(pgpa_list, "advice_string"); + if (advice_string_item != NULL) + { + advice_string = strVal(advice_string_item->arg); + pg_plan_advice_explain_text_multiline(es, "Generated Plan Advice", + advice_string); + } + } +} + +/* + * Check hook for pg_plan_advice.advice + */ +static bool +pg_plan_advice_advice_check_hook(char **newval, void **extra, GucSource source) +{ + MemoryContext oldcontext; + MemoryContext tmpcontext; + char *error; + + if (*newval == NULL) + return true; + + tmpcontext = AllocSetContextCreate(CurrentMemoryContext, + "pg_plan_advice.advice", + ALLOCSET_DEFAULT_SIZES); + oldcontext = MemoryContextSwitchTo(tmpcontext); + + /* + * It would be nice to save the parse tree that we construct here for + * eventual use when planning with this advice, but *extra can only point + * to a single guc_malloc'd chunk, and our parse tree involves an + * arbitrary number of memory allocations. + */ + (void) pgpa_parse(*newval, &error); + + if (error != NULL) + { + GUC_check_errdetail("Could not parse advice: %s", error); + return false; + } + + MemoryContextSwitchTo(oldcontext); + MemoryContextDelete(tmpcontext); + + return true; +} + +/* + * Search a list of DefElem objects for a given defname. 
+ */ +static DefElem * +find_defelem_by_defname(List *deflist, char *defname) +{ + foreach_node(DefElem, item, deflist) + { + if (strcmp(item->defname, defname) == 0) + return item; + } + + return NULL; +} diff --git a/contrib/pg_plan_advice/pg_plan_advice.control b/contrib/pg_plan_advice/pg_plan_advice.control new file mode 100644 index 00000000000..aa6fdc9e7b2 --- /dev/null +++ b/contrib/pg_plan_advice/pg_plan_advice.control @@ -0,0 +1,5 @@ +# pg_plan_advice extension +comment = 'help the planner get the right plan' +default_version = '1.0' +module_pathname = '$libdir/pg_plan_advice' +relocatable = true diff --git a/contrib/pg_plan_advice/pg_plan_advice.h b/contrib/pg_plan_advice/pg_plan_advice.h new file mode 100644 index 00000000000..21f66092fa2 --- /dev/null +++ b/contrib/pg_plan_advice/pg_plan_advice.h @@ -0,0 +1,61 @@ +/*------------------------------------------------------------------------- + * + * pg_plan_advice.h + * main header file for pg_plan_advice contrib module + * + * Copyright (c) 2016-2024, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pg_plan_advice.h + * + *------------------------------------------------------------------------- + */ +#ifndef PG_PLAN_ADVICE_H +#define PG_PLAN_ADVICE_H + +#include "commands/explain_state.h" +#include "nodes/pathnodes.h" +#include "nodes/plannodes.h" +#include "storage/lwlock.h" +#include "utils/dsa.h" + +typedef struct pgpa_shared_state +{ + LWLock lock; + int dsa_tranche; + dsa_handle area; + dsa_pointer shared_collector; +} pgpa_shared_state; + +/* Hook for other plugins to supply advice strings */ +typedef char *(*pg_plan_advice_advisor_hook) (PlannerGlobal *glob, + Query *parse, + const char *query_string, + int cursorOptions, + ExplainState *es); + +/* GUC variables */ +extern char *pg_plan_advice_advice; +extern bool pg_plan_advice_always_store_advice_details; +extern bool pg_plan_advice_feedback_warnings; +extern bool pg_plan_advice_local_collector; +extern int 
pg_plan_advice_local_collection_limit; +extern bool pg_plan_advice_shared_collector; +extern int pg_plan_advice_shared_collection_limit; +extern bool pg_plan_advice_trace_mask; + +/* Function prototypes (for use by pg_plan_advice itself) */ +extern MemoryContext pg_plan_advice_get_mcxt(void); +extern pgpa_shared_state *pg_plan_advice_attach(void); +extern dsa_area *pg_plan_advice_dsa_area(void); +extern bool pg_plan_advice_should_explain(ExplainState *es); +extern char *pg_plan_advice_get_supplied_query_advice(PlannerGlobal *glob, + Query *parse, + const char *query_string, + int cursorOptions, + ExplainState *es); + +/* Function prototypes (for use by other plugins) */ +extern PGDLLEXPORT void pg_plan_advice_add_advisor(pg_plan_advice_advisor_hook hook); +extern PGDLLEXPORT void pg_plan_advice_remove_advisor(pg_plan_advice_advisor_hook hook); + +#endif diff --git a/contrib/pg_plan_advice/pgpa_ast.c b/contrib/pg_plan_advice/pgpa_ast.c new file mode 100644 index 00000000000..85bd74859df --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_ast.c @@ -0,0 +1,351 @@ +/*------------------------------------------------------------------------- + * + * pgpa_ast.c + * additional supporting code related to plan advice parsing + * + * Copyright (c) 2016-2024, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_ast.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "pgpa_ast.h" + +#include "funcapi.h" +#include "utils/array.h" +#include "utils/builtins.h" + +static bool pgpa_identifiers_cover_target(int nrids, pgpa_identifier *rids, + pgpa_advice_target *target, + bool *rids_used); + +/* + * Get a C string that corresponds to the specified advice tag. 
 */
char *
pgpa_cstring_advice_tag(pgpa_advice_tag_type advice_tag)
{
	switch (advice_tag)
	{
		case PGPA_TAG_BITMAP_HEAP_SCAN:
			return "BITMAP_HEAP_SCAN";
		case PGPA_TAG_FOREIGN_JOIN:
			return "FOREIGN_JOIN";
		case PGPA_TAG_GATHER:
			return "GATHER";
		case PGPA_TAG_GATHER_MERGE:
			return "GATHER_MERGE";
		case PGPA_TAG_HASH_JOIN:
			return "HASH_JOIN";
		case PGPA_TAG_INDEX_ONLY_SCAN:
			return "INDEX_ONLY_SCAN";
		case PGPA_TAG_INDEX_SCAN:
			return "INDEX_SCAN";
		case PGPA_TAG_JOIN_ORDER:
			return "JOIN_ORDER";
		case PGPA_TAG_MERGE_JOIN_MATERIALIZE:
			return "MERGE_JOIN_MATERIALIZE";
		case PGPA_TAG_MERGE_JOIN_PLAIN:
			return "MERGE_JOIN_PLAIN";
		case PGPA_TAG_NESTED_LOOP_MATERIALIZE:
			return "NESTED_LOOP_MATERIALIZE";
		case PGPA_TAG_NESTED_LOOP_MEMOIZE:
			return "NESTED_LOOP_MEMOIZE";
		case PGPA_TAG_NESTED_LOOP_PLAIN:
			return "NESTED_LOOP_PLAIN";
		case PGPA_TAG_NO_GATHER:
			return "NO_GATHER";
		case PGPA_TAG_PARTITIONWISE:
			return "PARTITIONWISE";
		case PGPA_TAG_SEMIJOIN_NON_UNIQUE:
			return "SEMIJOIN_NON_UNIQUE";
		case PGPA_TAG_SEMIJOIN_UNIQUE:
			return "SEMIJOIN_UNIQUE";
		case PGPA_TAG_SEQ_SCAN:
			return "SEQ_SCAN";
		case PGPA_TAG_TID_SCAN:
			return "TID_SCAN";
	}

	/* all enum values are handled above */
	pg_unreachable();
	return NULL;
}

/*
 * Convert an advice tag, formatted as a string that has already been
 * downcased as appropriate, to a pgpa_advice_tag_type.
 *
 * If we succeed, set *fail = false and return the result; if we fail,
 * set *fail = true and return an arbitrary value.
 */
pgpa_advice_tag_type
pgpa_parse_advice_tag(const char *tag, bool *fail)
{
	*fail = false;

	/* dispatch on the first character to limit the number of strcmps */
	switch (tag[0])
	{
		case 'b':
			if (strcmp(tag, "bitmap_heap_scan") == 0)
				return PGPA_TAG_BITMAP_HEAP_SCAN;
			break;
		case 'f':
			if (strcmp(tag, "foreign_join") == 0)
				return PGPA_TAG_FOREIGN_JOIN;
			break;
		case 'g':
			if (strcmp(tag, "gather") == 0)
				return PGPA_TAG_GATHER;
			if (strcmp(tag, "gather_merge") == 0)
				return PGPA_TAG_GATHER_MERGE;
			break;
		case 'h':
			if (strcmp(tag, "hash_join") == 0)
				return PGPA_TAG_HASH_JOIN;
			break;
		case 'i':
			if (strcmp(tag, "index_scan") == 0)
				return PGPA_TAG_INDEX_SCAN;
			if (strcmp(tag, "index_only_scan") == 0)
				return PGPA_TAG_INDEX_ONLY_SCAN;
			break;
		case 'j':
			if (strcmp(tag, "join_order") == 0)
				return PGPA_TAG_JOIN_ORDER;
			break;
		case 'm':
			if (strcmp(tag, "merge_join_materialize") == 0)
				return PGPA_TAG_MERGE_JOIN_MATERIALIZE;
			if (strcmp(tag, "merge_join_plain") == 0)
				return PGPA_TAG_MERGE_JOIN_PLAIN;
			break;
		case 'n':
			if (strcmp(tag, "nested_loop_materialize") == 0)
				return PGPA_TAG_NESTED_LOOP_MATERIALIZE;
			if (strcmp(tag, "nested_loop_memoize") == 0)
				return PGPA_TAG_NESTED_LOOP_MEMOIZE;
			if (strcmp(tag, "nested_loop_plain") == 0)
				return PGPA_TAG_NESTED_LOOP_PLAIN;
			if (strcmp(tag, "no_gather") == 0)
				return PGPA_TAG_NO_GATHER;
			break;
		case 'p':
			if (strcmp(tag, "partitionwise") == 0)
				return PGPA_TAG_PARTITIONWISE;
			break;
		case 's':
			if (strcmp(tag, "semijoin_non_unique") == 0)
				return PGPA_TAG_SEMIJOIN_NON_UNIQUE;
			if (strcmp(tag, "semijoin_unique") == 0)
				return PGPA_TAG_SEMIJOIN_UNIQUE;
			if (strcmp(tag, "seq_scan") == 0)
				return PGPA_TAG_SEQ_SCAN;
			break;
		case 't':
			if (strcmp(tag, "tid_scan") == 0)
				return PGPA_TAG_TID_SCAN;
			break;
	}

	/* didn't work out */
	*fail = true;

	/* return an arbitrary value to unwind the call stack */
	return PGPA_TAG_SEQ_SCAN;
}

/*
 * Format a pgpa_advice_target as a string and append result to a
StringInfo. + */ +void +pgpa_format_advice_target(StringInfo str, pgpa_advice_target *target) +{ + if (target->ttype != PGPA_TARGET_IDENTIFIER) + { + bool first = true; + char *delims; + + if (target->ttype == PGPA_TARGET_UNORDERED_LIST) + delims = "{}"; + else + delims = "()"; + + appendStringInfoChar(str, delims[0]); + foreach_ptr(pgpa_advice_target, child_target, target->children) + { + if (first) + first = false; + else + appendStringInfoChar(str, ' '); + pgpa_format_advice_target(str, child_target); + } + appendStringInfoChar(str, delims[1]); + } + else + { + const char *rt_identifier; + + rt_identifier = pgpa_identifier_string(&target->rid); + appendStringInfoString(str, rt_identifier); + } +} + +/* + * Format a pgpa_index_target as a string and append result to a StringInfo. + */ +void +pgpa_format_index_target(StringInfo str, pgpa_index_target *itarget) +{ + if (itarget->indnamespace != NULL) + appendStringInfo(str, "%s.", + quote_identifier(itarget->indnamespace)); + appendStringInfoString(str, quote_identifier(itarget->indname)); +} + +/* + * Determine whether two pgpa_index_target objects are exactly identical. + */ +bool +pgpa_index_targets_equal(pgpa_index_target *i1, pgpa_index_target *i2) +{ + /* indnamespace can be NULL, and two NULL values are equal */ + if ((i1->indnamespace != NULL || i2->indnamespace != NULL) && + (i1->indnamespace == NULL || i2->indnamespace == NULL || + strcmp(i1->indnamespace, i2->indnamespace) != 0)) + return false; + if (strcmp(i1->indname, i2->indname) != 0) + return false; + + return true; +} + +/* + * Check whether an identifier matches an any part of an advice target. + */ +bool +pgpa_identifier_matches_target(pgpa_identifier *rid, pgpa_advice_target *target) +{ + /* For non-identifiers, check all descendents. 
*/ + if (target->ttype != PGPA_TARGET_IDENTIFIER) + { + foreach_ptr(pgpa_advice_target, child_target, target->children) + { + if (pgpa_identifier_matches_target(rid, child_target)) + return true; + } + return false; + } + + /* Straightforward comparisons of alias name and occcurrence number. */ + if (strcmp(rid->alias_name, target->rid.alias_name) != 0) + return false; + if (rid->occurrence != target->rid.occurrence) + return false; + + /* + * If a relation identifer mentions a partition name, it should also + * specify a partition schema. But the target may leave the schema NULL to + * match anything. + */ + Assert(rid->partnsp != NULL || rid->partrel == NULL); + if (rid->partnsp != NULL && target->rid.partnsp != NULL && + strcmp(rid->partnsp, target->rid.partnsp) != 0) + return false; + + /* + * These fields can be NULL on either side, but NULL only matches another + * NULL. + */ + if (!strings_equal_or_both_null(rid->partrel, target->rid.partrel)) + return false; + if (!strings_equal_or_both_null(rid->plan_name, target->rid.plan_name)) + return false; + + return true; +} + +/* + * Match identifiers to advice targets and return an enum value indicating + * the relationship between the set of keys and the set of targets. + * + * See the comments for pgpa_itm_type. 
+ */ +pgpa_itm_type +pgpa_identifiers_match_target(int nrids, pgpa_identifier *rids, + pgpa_advice_target *target) +{ + bool all_rids_used = true; + bool any_rids_used = false; + bool all_targets_used; + bool *rids_used = palloc0_array(bool, nrids); + + all_targets_used = + pgpa_identifiers_cover_target(nrids, rids, target, rids_used); + + for (int i = 0; i < nrids; ++i) + { + if (rids_used[i]) + any_rids_used = true; + else + all_rids_used = false; + } + + if (all_rids_used) + { + if (all_targets_used) + return PGPA_ITM_EQUAL; + else + return PGPA_ITM_KEYS_ARE_SUBSET; + } + else + { + if (all_targets_used) + return PGPA_ITM_TARGETS_ARE_SUBSET; + else if (any_rids_used) + return PGPA_ITM_INTERSECTING; + else + return PGPA_ITM_DISJOINT; + } +} + +/* + * Returns true if every target or sub-target is matched by at least one + * identifier, and otherwise false. + * + * Also sets rids_used[i] = true for each idenifier that matches at least one + * target. + */ +static bool +pgpa_identifiers_cover_target(int nrids, pgpa_identifier *rids, + pgpa_advice_target *target, bool *rids_used) +{ + bool result = false; + + if (target->ttype != PGPA_TARGET_IDENTIFIER) + { + result = true; + + foreach_ptr(pgpa_advice_target, child_target, target->children) + { + if (!pgpa_identifiers_cover_target(nrids, rids, child_target, + rids_used)) + result = false; + } + } + else + { + for (int i = 0; i < nrids; ++i) + { + if (pgpa_identifier_matches_target(&rids[i], target)) + { + rids_used[i] = true; + result = true; + } + } + } + + return result; +} diff --git a/contrib/pg_plan_advice/pgpa_ast.h b/contrib/pg_plan_advice/pgpa_ast.h new file mode 100644 index 00000000000..5d3f8d58a71 --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_ast.h @@ -0,0 +1,185 @@ +/*------------------------------------------------------------------------- + * + * pgpa_ast.h + * abstract syntax trees for plan advice, plus parser/scanner support + * + * Copyright (c) 2016-2024, PostgreSQL Global Development Group + * + 
 * contrib/pg_plan_advice/pgpa_ast.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef PGPA_AST_H
#define PGPA_AST_H

#include "pgpa_identifier.h"

#include "nodes/pg_list.h"

/*
 * Advice items generally take the form SOME_TAG(item [...]), where an item
 * can take various forms. The simplest case is a relation identifier, but
 * some tags allow sublists, and JOIN_ORDER() allows both ordered and unordered
 * sublists.
 */
typedef enum
{
	PGPA_TARGET_IDENTIFIER,		/* relation identifier */
	PGPA_TARGET_ORDERED_LIST,	/* (item ...) */
	PGPA_TARGET_UNORDERED_LIST	/* {item ...} */
} pgpa_target_type;

/*
 * An index specification.
 */
typedef struct pgpa_index_target
{
	/* Index schema and name */
	char	   *indnamespace;
	char	   *indname;
} pgpa_index_target;

/*
 * A single item about which advice is being given, which could be either
 * a relation identifier that we want to break out into its constituent fields,
 * or a sublist of some kind.
 */
typedef struct pgpa_advice_target
{
	pgpa_target_type ttype;

	/*
	 * This field is meaningful when ttype is PGPA_TARGET_IDENTIFIER.
	 *
	 * All identifiers must have an alias name and an occurrence number; the
	 * remaining fields can be NULL. Note that it's possible to specify a
	 * partition name without a partition schema, but not the reverse.
	 */
	pgpa_identifier rid;

	/*
	 * This field is set when ttype is PGPA_TARGET_IDENTIFIER and the advice
	 * tag is PGPA_TAG_INDEX_SCAN or PGPA_TAG_INDEX_ONLY_SCAN.
	 */
	pgpa_index_target *itarget;

	/*
	 * When the ttype is PGPA_TARGET_ORDERED_LIST or
	 * PGPA_TARGET_UNORDERED_LIST, this field contains a list of additional
	 * pgpa_advice_target objects. Otherwise, it is unused.
	 */
	List	   *children;
} pgpa_advice_target;

/*
 * These are all the kinds of advice that we know how to parse. If a keyword
 * is found at the top level, it must be in this list.
 *
 * If you change anything here, also update pgpa_parse_advice_tag and
 * pgpa_cstring_advice_tag.
 */
typedef enum pgpa_advice_tag_type
{
	PGPA_TAG_BITMAP_HEAP_SCAN,
	PGPA_TAG_FOREIGN_JOIN,
	PGPA_TAG_GATHER,
	PGPA_TAG_GATHER_MERGE,
	PGPA_TAG_HASH_JOIN,
	PGPA_TAG_INDEX_ONLY_SCAN,
	PGPA_TAG_INDEX_SCAN,
	PGPA_TAG_JOIN_ORDER,
	PGPA_TAG_MERGE_JOIN_MATERIALIZE,
	PGPA_TAG_MERGE_JOIN_PLAIN,
	PGPA_TAG_NESTED_LOOP_MATERIALIZE,
	PGPA_TAG_NESTED_LOOP_MEMOIZE,
	PGPA_TAG_NESTED_LOOP_PLAIN,
	PGPA_TAG_NO_GATHER,
	PGPA_TAG_PARTITIONWISE,
	PGPA_TAG_SEMIJOIN_NON_UNIQUE,
	PGPA_TAG_SEMIJOIN_UNIQUE,
	PGPA_TAG_SEQ_SCAN,
	PGPA_TAG_TID_SCAN
} pgpa_advice_tag_type;

/*
 * An item of advice, meaning a tag and the list of all targets to which
 * it is being applied.
 *
 * "targets" is a list of pgpa_advice_target objects.
 *
 * The List returned from pgpa_yyparse is list of pgpa_advice_item objects.
 */
typedef struct pgpa_advice_item
{
	pgpa_advice_tag_type tag;
	List	   *targets;
} pgpa_advice_item;

/*
 * Result of comparing an array of pgpa_relation_identifier objects to a
 * pgpa_advice_target.
 *
 * PGPA_ITM_EQUAL means all targets are matched by some identifier, and
 * all identifiers were matched to a target.
 *
 * PGPA_ITM_KEYS_ARE_SUBSET means that all identifiers matched to a target,
 * but there were leftover targets. Generally, this means that the advice is
 * looking to apply to all of the rels we have plus some additional ones that
 * we don't have.
 *
 * PGPA_ITM_TARGETS_ARE_SUBSET means that all targets are matched by an
 * identifier, but there were leftover identifiers. Generally, this means
 * that the advice is looking to apply to some but not all of the rels we have.
 *
 * PGPA_ITM_INTERSECTING means that some identifiers and targets were matched,
 * but neither all identifiers nor all targets could be matched to items in
 * the other set.
 *
 * PGPA_ITM_DISJOINT means that no matches between identifiers and targets were
 * found.
 */
typedef enum
{
	PGPA_ITM_EQUAL,
	PGPA_ITM_KEYS_ARE_SUBSET,
	PGPA_ITM_TARGETS_ARE_SUBSET,
	PGPA_ITM_INTERSECTING,
	PGPA_ITM_DISJOINT
} pgpa_itm_type;

/* for pgpa_scanner.l and pgpa_parser.y */
union YYSTYPE;
#ifndef YY_TYPEDEF_YY_SCANNER_T
#define YY_TYPEDEF_YY_SCANNER_T
typedef void *yyscan_t;
#endif

/* in pgpa_scanner.l */
extern int	pgpa_yylex(union YYSTYPE *yylval_param, List **result,
					   char **parse_error_msg_p, yyscan_t yyscanner);
extern void pgpa_yyerror(List **result, char **parse_error_msg_p,
						 yyscan_t yyscanner,
						 const char *message);
extern void pgpa_scanner_init(const char *str, yyscan_t *yyscannerp);
extern void pgpa_scanner_finish(yyscan_t yyscanner);

/* in pgpa_parser.y */
extern int	pgpa_yyparse(List **result, char **parse_error_msg_p,
						 yyscan_t yyscanner);
extern List *pgpa_parse(const char *advice_string, char **error_p);

/* in pgpa_ast.c */
extern char *pgpa_cstring_advice_tag(pgpa_advice_tag_type advice_tag);
extern bool pgpa_identifier_matches_target(pgpa_identifier *rid,
										   pgpa_advice_target *target);
extern pgpa_itm_type pgpa_identifiers_match_target(int nrids,
												   pgpa_identifier *rids,
												   pgpa_advice_target *target);
extern bool pgpa_index_targets_equal(pgpa_index_target *i1,
									 pgpa_index_target *i2);
extern pgpa_advice_tag_type pgpa_parse_advice_tag(const char *tag, bool *fail);
extern void pgpa_format_advice_target(StringInfo str,
									  pgpa_advice_target *target);
extern void pgpa_format_index_target(StringInfo str,
									 pgpa_index_target *itarget);

#endif

/*-------------------------------------------------------------------------
 *
 * pgpa_collector.c
 *	  collect advice into
 *	  backend-local or shared memory
 *
 * Copyright (c) 2016-2025, PostgreSQL Global Development Group
 *
 * contrib/pg_plan_advice/pgpa_collector.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "pg_plan_advice.h"
#include "pgpa_collector.h"

#include "datatype/timestamp.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "nodes/pg_list.h"
#include "utils/acl.h"
#include "utils/builtins.h"
#include "utils/timestamp.h"

PG_FUNCTION_INFO_V1(pg_clear_collected_local_advice);
PG_FUNCTION_INFO_V1(pg_clear_collected_shared_advice);
PG_FUNCTION_INFO_V1(pg_get_collected_local_advice);
PG_FUNCTION_INFO_V1(pg_get_collected_shared_advice);

/* number of advice entries per storage chunk */
#define ADVICE_CHUNK_SIZE 1024
/* initial (and incremental) size of the chunk-pointer array */
#define ADVICE_CHUNK_ARRAY_SIZE 64

/* number of output columns in the pg_get_collected_*_advice SRFs */
#define PG_GET_ADVICE_COLUMNS 7

/*
 * Advice extracted from one query plan, together with the query string
 * and various other identifying details.
 */
typedef struct pgpa_collected_advice
{
	Oid			userid;			/* user OID */
	Oid			dbid;			/* database OID */
	uint64		queryid;		/* query identifier */
	TimestampTz timestamp;		/* query timestamp */
	int			advice_offset;	/* start of advice in textual data */
	char		textual_data[FLEXIBLE_ARRAY_MEMBER];	/* query string followed
														 * by advice string,
														 * both NUL-terminated */
} pgpa_collected_advice;

/*
 * A bunch of pointers to pgpa_collected_advice objects, stored in
 * backend-local memory.
 */
typedef struct pgpa_local_advice_chunk
{
	pgpa_collected_advice *entries[ADVICE_CHUNK_SIZE];
} pgpa_local_advice_chunk;

/*
 * Information about all of the pgpa_collected_advice objects that we're
 * storing in local memory.
 *
 * We assign consecutive IDs, starting from 0, to each pgpa_collected_advice
 * object that we store. The actual storage is an array of chunks, which
 * helps keep memcpy() overhead low when we start discarding older data.
 */
typedef struct pgpa_local_advice
{
	uint64		next_id;		/* ID to assign to the next entry stored */
	uint64		oldest_id;		/* ID of the oldest entry still stored */
	uint64		base_id;		/* ID corresponding to chunks[0], slot 0 */
	int			chunk_array_allocated_size; /* allocated length of chunks[] */
	pgpa_local_advice_chunk **chunks;
} pgpa_local_advice;

/*
 * Just like pgpa_local_advice_chunk, but stored in a dynamic shared area,
 * so we must use dsa_pointer instead of native pointers.
 */
typedef struct pgpa_shared_advice_chunk
{
	dsa_pointer entries[ADVICE_CHUNK_SIZE];
} pgpa_shared_advice_chunk;

/*
 * Just like pgpa_local_advice, but stored in a dynamic shared area, so
 * we must use dsa_pointer instead of native pointers.
 */
typedef struct pgpa_shared_advice
{
	uint64		next_id;
	uint64		oldest_id;
	uint64		base_id;
	int			chunk_array_allocated_size;
	dsa_pointer chunks;			/* array of dsa_pointer to chunk objects */
} pgpa_shared_advice;

/* Pointers to local and shared collectors */
static pgpa_local_advice *local_collector = NULL;
static pgpa_shared_advice *shared_collector = NULL;

/* Static functions */
static pgpa_collected_advice *pgpa_make_collected_advice(Oid userid,
														 Oid dbid,
														 uint64 queryId,
														 TimestampTz timestamp,
														 const char *query_string,
														 const char *advice_string,
														 dsa_area *area,
														 dsa_pointer *result);
static void pgpa_store_local_advice(pgpa_collected_advice *ca);
static void pgpa_trim_local_advice(int limit);
static void pgpa_store_shared_advice(dsa_pointer ca_pointer);
static void pgpa_trim_shared_advice(dsa_area *area, int limit);

/* Helper function to extract the query string from pgpa_collected_advice */
static inline const char *
query_string(pgpa_collected_advice *ca)
{
	return ca->textual_data;
}

/* Helper function to extract the advice string from pgpa_collected_advice */
static inline const char *
advice_string(pgpa_collected_advice *ca)
{
	return ca->textual_data + ca->advice_offset;
}

/*
 * Store collected query advice into the local or shared advice collector,
 * as appropriate.
+ */ +void +pgpa_collect_advice(uint64 queryId, const char *query_string, + const char *advice_string) +{ + Oid userid = GetUserId(); + Oid dbid = MyDatabaseId; + TimestampTz now = GetCurrentTimestamp(); + + if (pg_plan_advice_local_collector && + pg_plan_advice_local_collection_limit > 0) + { + pgpa_collected_advice *ca; + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(pg_plan_advice_get_mcxt()); + ca = pgpa_make_collected_advice(userid, dbid, queryId, now, + query_string, advice_string, + NULL, NULL); + pgpa_store_local_advice(ca); + MemoryContextSwitchTo(oldcontext); + } + + if (pg_plan_advice_shared_collector && + pg_plan_advice_shared_collection_limit > 0) + { + dsa_area *area = pg_plan_advice_dsa_area(); + dsa_pointer ca_pointer = InvalidDsaPointer; /* placate compiler */ + + pgpa_make_collected_advice(userid, dbid, queryId, now, + query_string, advice_string, area, + &ca_pointer); + pgpa_store_shared_advice(ca_pointer); + } +} + +/* + * Allocate and fill a new pgpa_collected_advice object. + * + * If area != NULL, it is used to allocate the new object, and the resulting + * dsa_pointer is returned via *result. + * + * If area == NULL, the new object is allocated in the current memory context, + * and result is not examined or modified. 
+ */ +static pgpa_collected_advice * +pgpa_make_collected_advice(Oid userid, Oid dbid, uint64 queryId, + TimestampTz timestamp, + const char *query_string, + const char *advice_string, + dsa_area *area, dsa_pointer *result) +{ + size_t query_string_length = strlen(query_string) + 1; + size_t advice_string_length = strlen(advice_string) + 1; + size_t total_length; + pgpa_collected_advice *ca; + + total_length = offsetof(pgpa_collected_advice, textual_data) + + query_string_length + advice_string_length; + + if (area == NULL) + ca = palloc(total_length); + else + { + *result = dsa_allocate(area, total_length); + ca = dsa_get_address(area, *result); + } + + ca->userid = userid; + ca->dbid = dbid; + ca->queryid = queryId; + ca->timestamp = timestamp; + ca->advice_offset = query_string_length; + + memcpy(ca->textual_data, query_string, query_string_length); + memcpy(&ca->textual_data[ca->advice_offset], + advice_string, advice_string_length); + + return ca; +} + +/* + * Add a pg_collected_advice object to our backend-local advice collection. + * + * Caller is responsible for switching to the appropriate memory context; + * the provided object should have been allocated in that same context. + */ +static void +pgpa_store_local_advice(pgpa_collected_advice *ca) +{ + uint64 chunk_number; + uint64 chunk_offset; + pgpa_local_advice *la = local_collector; + + /* If the local advice collector isn't initialized yet, do that now. */ + if (la == NULL) + { + la = palloc0(sizeof(pgpa_local_advice)); + la->chunk_array_allocated_size = ADVICE_CHUNK_ARRAY_SIZE; + la->chunks = palloc0_array(pgpa_local_advice_chunk *, + la->chunk_array_allocated_size); + local_collector = la; + } + + /* Compute chunk and offset at which to store this advice. */ + chunk_number = (la->next_id - la->base_id) / ADVICE_CHUNK_SIZE; + chunk_offset = (la->next_id - la->base_id) % ADVICE_CHUNK_SIZE; + + /* Extend chunk array, if needed. 
*/ + if (chunk_number >= la->chunk_array_allocated_size) + { + int new_size; + + new_size = la->chunk_array_allocated_size + ADVICE_CHUNK_ARRAY_SIZE; + la->chunks = repalloc0_array(la->chunks, + pgpa_local_advice_chunk *, + la->chunk_array_allocated_size, + new_size); + la->chunk_array_allocated_size = new_size; + } + + /* Allocate new chunk, if needed. */ + if (la->chunks[chunk_number] == NULL) + la->chunks[chunk_number] = palloc0_object(pgpa_local_advice_chunk); + + /* Save pointer and bump next-id counter. */ + Assert(la->chunks[chunk_number]->entries[chunk_offset] == NULL); + la->chunks[chunk_number]->entries[chunk_offset] = ca; + ++la->next_id; + + /* If we've exceeded the storage limit, discard old data. */ + pgpa_trim_local_advice(pg_plan_advice_local_collection_limit); +} + +/* + * Add a pg_collected_advice object to the shared advice collection. + * + * 'ca_pointer' should have been allocated from the pg_plan_advice DSA area + * and should point to an object of type pgpa_collected_advice. + */ +static void +pgpa_store_shared_advice(dsa_pointer ca_pointer) +{ + uint64 chunk_number; + uint64 chunk_offset; + pgpa_shared_state *state = pg_plan_advice_attach(); + dsa_area *area = pg_plan_advice_dsa_area(); + pgpa_shared_advice *sa = shared_collector; + dsa_pointer *chunk_array; + pgpa_shared_advice_chunk *chunk; + + /* Lock the shared state. */ + LWLockAcquire(&state->lock, LW_EXCLUSIVE); + + /* + * If we're not attached to the shared advice collector yet, fix that now. + * If we're the first ones to attach, we may need to create the object. 
+ */ + if (sa == NULL) + { + if (state->shared_collector == InvalidDsaPointer) + state->shared_collector = + dsa_allocate0(area, sizeof(pgpa_shared_advice)); + shared_collector = sa = dsa_get_address(area, state->shared_collector); + } + + /* + * It's possible that some other backend may have succeeded in creating + * the main collector object but failed to allocate an initial chunk + * array, so we must be prepared to allocate the chunk array here whether + * or not we created the collector object. + */ + if (shared_collector->chunk_array_allocated_size == 0) + { + sa->chunks = + dsa_allocate0(area, + sizeof(dsa_pointer) * ADVICE_CHUNK_ARRAY_SIZE); + sa->chunk_array_allocated_size = ADVICE_CHUNK_ARRAY_SIZE; + } + + /* Compute chunk and offset at which to store this advice. */ + chunk_number = (sa->next_id - sa->base_id) / ADVICE_CHUNK_SIZE; + chunk_offset = (sa->next_id - sa->base_id) % ADVICE_CHUNK_SIZE; + + /* Get the address of the chunk array and, if needed, extend it. */ + if (chunk_number >= sa->chunk_array_allocated_size) + { + int new_size; + dsa_pointer new_chunks; + + /* + * DSA can't enlarge an existing allocation, so we must make a new + * allocation and copy data over. + */ + new_size = sa->chunk_array_allocated_size + ADVICE_CHUNK_ARRAY_SIZE; + new_chunks = dsa_allocate0(area, sizeof(dsa_pointer) * new_size); + chunk_array = dsa_get_address(area, new_chunks); + memcpy(chunk_array, dsa_get_address(area, sa->chunks), + sizeof(dsa_pointer) * sa->chunk_array_allocated_size); + dsa_free(area, sa->chunks); + sa->chunks = new_chunks; + sa->chunk_array_allocated_size = new_size; + } + else + chunk_array = dsa_get_address(area, sa->chunks); + + /* Get the address of the desired chunk, allocating it if needed. */ + if (chunk_array[chunk_number] == InvalidDsaPointer) + chunk_array[chunk_number] = + dsa_allocate0(area, sizeof(pgpa_shared_advice_chunk)); + chunk = dsa_get_address(area, chunk_array[chunk_number]); + + /* Save pointer and bump next-id counter. 
*/ + Assert(chunk->entries[chunk_offset] == InvalidDsaPointer); + chunk->entries[chunk_offset] = ca_pointer; + ++sa->next_id; + + /* If we've exceeded the storage limit, discard old data. */ + pgpa_trim_shared_advice(area, pg_plan_advice_shared_collection_limit); + + /* Release lock on shared state. */ + LWLockRelease(&state->lock); +} + +/* + * Discard collected advice stored in backend-local memory in excess of the + * specified limit. + */ +static void +pgpa_trim_local_advice(int limit) +{ + pgpa_local_advice *la = local_collector; + uint64 current_count; + uint64 trim_count; + uint64 total_chunk_count; + uint64 trim_chunk_count; + uint64 remaining_chunk_count; + + /* If we haven't yet reached the limit, there's nothing to do. */ + current_count = la->next_id - la->oldest_id; + if (current_count <= limit) + return; + + /* Free enough entries to get us back down to the limit. */ + trim_count = current_count - limit; + while (trim_count > 0) + { + uint64 chunk_number; + uint64 chunk_offset; + + chunk_number = (la->oldest_id - la->base_id) / ADVICE_CHUNK_SIZE; + chunk_offset = (la->oldest_id - la->base_id) % ADVICE_CHUNK_SIZE; + + Assert(la->chunks[chunk_number]->entries[chunk_offset] != NULL); + pfree(la->chunks[chunk_number]->entries[chunk_offset]); + la->chunks[chunk_number]->entries[chunk_offset] = NULL; + ++la->oldest_id; + --trim_count; + } + + /* Free any chunks that are now entirely unused. */ + trim_chunk_count = (la->oldest_id - la->base_id) / ADVICE_CHUNK_SIZE; + for (uint64 n = 0; n < trim_chunk_count; ++n) + pfree(la->chunks[n]); + + /* Slide remaining chunk pointers back toward the base of the array. */ + total_chunk_count = (la->next_id - la->base_id + + ADVICE_CHUNK_SIZE - 1) / ADVICE_CHUNK_SIZE; + remaining_chunk_count = total_chunk_count - trim_chunk_count; + if (remaining_chunk_count > 0) + memmove(&la->chunks[0], &la->chunks[trim_chunk_count], + sizeof(pgpa_local_advice_chunk *) * remaining_chunk_count); + + /* Don't leave stale pointers around. 
*/ + memset(&la->chunks[remaining_chunk_count], 0, + sizeof(pgpa_local_advice_chunk *) + * (total_chunk_count - remaining_chunk_count)); + + /* Adjust base ID value accordingly. */ + la->base_id += trim_chunk_count * ADVICE_CHUNK_SIZE; +} + +/* + * Discard collected advice stored in shared memory in excess of the + * specified limit. + */ +static void +pgpa_trim_shared_advice(dsa_area *area, int limit) +{ + pgpa_shared_advice *sa = shared_collector; + uint64 current_count; + uint64 trim_count; + uint64 total_chunk_count; + uint64 trim_chunk_count; + uint64 remaining_chunk_count; + dsa_pointer *chunk_array; + + /* If we haven't yet reached the limit, there's nothing to do. */ + current_count = sa->next_id - sa->oldest_id; + if (current_count <= limit) + return; + + /* Get a pointer to the chunk array. */ + chunk_array = dsa_get_address(area, sa->chunks); + + /* Free enough entries to get us back down to the limit. */ + trim_count = current_count - limit; + while (trim_count > 0) + { + uint64 chunk_number; + uint64 chunk_offset; + pgpa_shared_advice_chunk *chunk; + + chunk_number = (sa->oldest_id - sa->base_id) / ADVICE_CHUNK_SIZE; + chunk_offset = (sa->oldest_id - sa->base_id) % ADVICE_CHUNK_SIZE; + + chunk = dsa_get_address(area, chunk_array[chunk_number]); + Assert(chunk->entries[chunk_offset] != InvalidDsaPointer); + dsa_free(area, chunk->entries[chunk_offset]); + chunk->entries[chunk_offset] = InvalidDsaPointer; + ++sa->oldest_id; + --trim_count; + } + + /* Free any chunks that are now entirely unused. */ + trim_chunk_count = (sa->oldest_id - sa->base_id) / ADVICE_CHUNK_SIZE; + for (uint64 n = 0; n < trim_chunk_count; ++n) + dsa_free(area, chunk_array[n]); + + /* Slide remaining chunk pointers back toward the base of the array. 
*/ + total_chunk_count = (sa->next_id - sa->base_id + + ADVICE_CHUNK_SIZE - 1) / ADVICE_CHUNK_SIZE; + remaining_chunk_count = total_chunk_count - trim_chunk_count; + if (remaining_chunk_count > 0) + memmove(&chunk_array[0], &chunk_array[trim_chunk_count], + sizeof(dsa_pointer) * remaining_chunk_count); + + /* Don't leave stale pointers around. */ + memset(&chunk_array[remaining_chunk_count], 0, + sizeof(pgpa_shared_advice_chunk *) + * (total_chunk_count - remaining_chunk_count)); + + /* Adjust base ID value accordingly. */ + sa->base_id += trim_chunk_count * ADVICE_CHUNK_SIZE; +} + +/* + * SQL-callable function to discard advice collected in backend-local memory + */ +Datum +pg_clear_collected_local_advice(PG_FUNCTION_ARGS) +{ + if (local_collector != NULL) + pgpa_trim_local_advice(0); + + PG_RETURN_VOID(); +} + +/* + * SQL-callable function to discard advice collected in backend-local memory + */ +Datum +pg_clear_collected_shared_advice(PG_FUNCTION_ARGS) +{ + pgpa_shared_state *state = pg_plan_advice_attach(); + dsa_area *area = pg_plan_advice_dsa_area(); + + LWLockAcquire(&state->lock, LW_EXCLUSIVE); + + /* + * If we're not attached to the shared advice collector yet, fix that now; + * but if the collector doesn't even exist, we can return without doing + * anything else. 
+ */ + if (shared_collector == NULL) + { + if (state->shared_collector == InvalidDsaPointer) + { + LWLockRelease(&state->lock); + return (Datum) 0; + } + shared_collector = dsa_get_address(area, state->shared_collector); + } + + /* Do the real work */ + pgpa_trim_shared_advice(area, 0); + + LWLockRelease(&state->lock); + + PG_RETURN_VOID(); +} + +/* + * SQL-callable SRF to return advice collected in backend-local memory + */ +Datum +pg_get_collected_local_advice(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + pgpa_local_advice *la = local_collector; + Oid userid = GetUserId(); + + InitMaterializedSRF(fcinfo, 0); + + if (la == NULL) + return (Datum) 0; + + /* Loop over all entries. */ + for (uint64 id = la->oldest_id; id < la->next_id; ++id) + { + uint64 chunk_number; + uint64 chunk_offset; + pgpa_collected_advice *ca; + Datum values[PG_GET_ADVICE_COLUMNS]; + bool nulls[PG_GET_ADVICE_COLUMNS] = {0}; + + chunk_number = (id - la->base_id) / ADVICE_CHUNK_SIZE; + chunk_offset = (id - la->base_id) % ADVICE_CHUNK_SIZE; + + ca = la->chunks[chunk_number]->entries[chunk_offset]; + + if (!member_can_set_role(userid, ca->userid)) + continue; + + values[0] = UInt64GetDatum(id); + values[1] = ObjectIdGetDatum(ca->userid); + values[2] = ObjectIdGetDatum(ca->dbid); + values[3] = UInt64GetDatum(ca->queryid); + values[4] = TimestampGetDatum(ca->timestamp); + values[5] = CStringGetTextDatum(query_string(ca)); + values[6] = CStringGetTextDatum(advice_string(ca)); + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, + values, nulls); + } + + return (Datum) 0; +} + +/* + * SQL-callable SRF to return advice collected in shared memory + */ +Datum +pg_get_collected_shared_advice(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + pgpa_shared_state *state = pg_plan_advice_attach(); + dsa_area *area = pg_plan_advice_dsa_area(); + dsa_pointer *chunk_array; + pgpa_shared_advice *sa = shared_collector; + + 
InitMaterializedSRF(fcinfo, 0); + + /* Lock the shared state. */ + LWLockAcquire(&state->lock, LW_SHARED); + + /* + * If we're not attached to the shared advice collector yet, fix that now; + * but if the collector doesn't even exist, we can return without doing + * anything else. + */ + if (sa == NULL) + { + if (state->shared_collector == InvalidDsaPointer) + { + LWLockRelease(&state->lock); + return (Datum) 0; + } + shared_collector = sa = dsa_get_address(area, state->shared_collector); + } + + /* Get a pointer to the chunk array. */ + chunk_array = dsa_get_address(area, sa->chunks); + + /* Loop over all entries. */ + for (uint64 id = sa->oldest_id; id < sa->next_id; ++id) + { + uint64 chunk_number; + uint64 chunk_offset; + pgpa_shared_advice_chunk *chunk; + pgpa_collected_advice *ca; + Datum values[PG_GET_ADVICE_COLUMNS]; + bool nulls[PG_GET_ADVICE_COLUMNS] = {0}; + + chunk_number = (id - sa->base_id) / ADVICE_CHUNK_SIZE; + chunk_offset = (id - sa->base_id) % ADVICE_CHUNK_SIZE; + + chunk = dsa_get_address(area, chunk_array[chunk_number]); + ca = dsa_get_address(area, chunk->entries[chunk_offset]); + + values[0] = UInt64GetDatum(id); + values[1] = ObjectIdGetDatum(ca->userid); + values[2] = ObjectIdGetDatum(ca->dbid); + values[3] = UInt64GetDatum(ca->queryid); + values[4] = TimestampGetDatum(ca->timestamp); + values[5] = CStringGetTextDatum(query_string(ca)); + values[6] = CStringGetTextDatum(advice_string(ca)); + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, + values, nulls); + } + + /* Release lock on shared state. 
*/ + LWLockRelease(&state->lock); + + return (Datum) 0; +} diff --git a/contrib/pg_plan_advice/pgpa_collector.h b/contrib/pg_plan_advice/pgpa_collector.h new file mode 100644 index 00000000000..b6e746a06d7 --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_collector.h @@ -0,0 +1,18 @@ +/*------------------------------------------------------------------------- + * + * pgpa_collector.h + * collect advice into backend-local or shared memory + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_collector.h + * + *------------------------------------------------------------------------- + */ +#ifndef PGPA_COLLECTOR_H +#define PGPA_COLLECTOR_H + +extern void pgpa_collect_advice(uint64 queryId, const char *query_string, + const char *advice_string); + +#endif diff --git a/contrib/pg_plan_advice/pgpa_identifier.c b/contrib/pg_plan_advice/pgpa_identifier.c new file mode 100644 index 00000000000..51b4b0c60a6 --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_identifier.c @@ -0,0 +1,476 @@ +/*------------------------------------------------------------------------- + * + * pgpa_identifier.c + * create appropriate identifiers for range table entries + * + * The goal of this module is to be able to produce identifiers for range + * table entries that are unique, understandable to human beings, and + * able to be reconstructed during future planning cycles. As an + * exception, we do not care about, or want to produce, identifiers for + * RTE_JOIN entries. This is because (1) we would end up with a ton of + * RTEs with unhelpful names like unnamed_join_17; (2) not all joins have + * RTEs; and (3) we intend to refer to joins by their constituent members + * rather than by reference to the join RTE. 
+ * + * In general, we construct identifiers of the following form: + * + * alias_name#occurrence_number/child_table_name@subquery_name + * + * However, occurrence_number is omitted when it is the first occurrence + * within the same subquery, child_table_name is omitted for relations that + * are not child tables, and subquery_name is omitted for the topmost + * query level. Whenever an item is omitted, the preceding punctuation mark + * is also omitted. Identifier-style escaping is applied to alias_name and + * subquery_name. Whenever we include child_table_name, we always use a + * schema-qualified name, but users writing their own plan advice are not + * required to do so. Identifier-style escaping is applied to the schema and to the + * relation names separately. + * + * The upshot of all of these rules is that in simple cases, the relation + * identifier is textually identical to the alias name, making life easier + * for users. However, even in complex cases, every relation identifier + * for a given query will be unique (or at least we hope so: if not, this + * code is buggy and the identifier format might need to be rethought). + * + * A key goal of this system is that we want to be able to reconstruct the + * same identifiers during a future planning cycle for the same query, so + * that if a certain behavior is specified for a certain identifier, we can + * properly identify the RTI for which that behavior is mandated. In order + * for this to work, subquery names must be unique and known before the + * subquery is planned, and the remainder of the identifier must not depend + * on any part of the query outside of the current subquery level. In + * particular, occurrence_number must be calculated relative to the range + * table for the relevant subquery, not the final flattened range table. 
+ * + * Copyright (c) 2016-2024, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_identifier.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "pgpa_identifier.h" + +#include "parser/parsetree.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" + +static Index *pgpa_create_top_rti_map(Index rtable_length, List *rtable, + List *appinfos); +static int pgpa_occurrence_number(List *rtable, Index *top_rti_map, + SubPlanRTInfo *rtinfo, Index rti); + +/* + * Create a range table identifier from scratch. + * + * This function leaves the caller to do all the heavy lifting, so it's + * generally better to use one of the functions below instead. + * + * See the file header comments for more details on the format of an + * identifier. + */ +const char * +pgpa_identifier_string(const pgpa_identifier *rid) +{ + const char *result; + + Assert(rid->alias_name != NULL); + result = quote_identifier(rid->alias_name); + + Assert(rid->occurrence >= 0); + if (rid->occurrence > 1) + result = psprintf("%s#%d", result, rid->occurrence); + + if (rid->partrel != NULL) + { + if (rid->partnsp == NULL) + result = psprintf("%s/%s", result, + quote_identifier(rid->partrel)); + else + result = psprintf("%s/%s.%s", result, + quote_identifier(rid->partnsp), + quote_identifier(rid->partrel)); + } + + if (rid->plan_name != NULL) + result = psprintf("%s@%s", result, quote_identifier(rid->plan_name)); + + return result; +} + +/* + * Compute a relation identifier for a particular RTI. + * + * The caller provides root and rti, and gets the necessary details back via + * the remaining parameters. 
+ */ +void +pgpa_compute_identifier_by_rti(PlannerInfo *root, Index rti, + pgpa_identifier *rid) +{ + Index top_rti = rti; + int occurrence = 1; + RangeTblEntry *rte; + RangeTblEntry *top_rte; + char *partnsp = NULL; + char *partrel = NULL; + + /* + * If this is a child RTE, find the topmost parent that is still of type + * RTE_RELATION. We do this because we identify children of partitioned + * tables by the name of the child table, but subqueries can also have + * child rels and we don't care about those here. + */ + for (;;) + { + AppendRelInfo *appinfo; + RangeTblEntry *parent_rte; + + /* append_rel_array can be NULL if there are no children */ + if (root->append_rel_array == NULL || + (appinfo = root->append_rel_array[top_rti]) == NULL) + break; + + parent_rte = planner_rt_fetch(appinfo->parent_relid, root); + if (parent_rte->rtekind != RTE_RELATION) + break; + + top_rti = appinfo->parent_relid; + } + + /* Get the range table entries for the RTI and top RTI. */ + rte = planner_rt_fetch(rti, root); + top_rte = planner_rt_fetch(top_rti, root); + Assert(rte->rtekind != RTE_JOIN); + Assert(top_rte->rtekind != RTE_JOIN); + + /* Work out the correct occurrence number. */ + for (Index prior_rti = 1; prior_rti < top_rti; ++prior_rti) + { + RangeTblEntry *prior_rte; + AppendRelInfo *appinfo; + + /* + * If this is a child rel of a parent that is a relation, skip it. + * + * Such range table entries are disambiguated by mentioning the schema + * and name of the table, not by counting them as separate occurrences + * of the same table. + * + * NB: append_rel_array can be NULL if there are no children + */ + if (root->append_rel_array != NULL && + (appinfo = root->append_rel_array[prior_rti]) != NULL) + { + RangeTblEntry *parent_rte; + + parent_rte = planner_rt_fetch(appinfo->parent_relid, root); + if (parent_rte->rtekind == RTE_RELATION) + continue; + } + + /* Skip NULL entries and joins. 
*/ + prior_rte = planner_rt_fetch(prior_rti, root); + if (prior_rte == NULL || prior_rte->rtekind == RTE_JOIN) + continue; + + /* Skip if the alias name differs. */ + if (strcmp(prior_rte->eref->aliasname, rte->eref->aliasname) != 0) + continue; + + /* Looks like a true duplicate. */ + ++occurrence; + } + + /* If this is a child table, get the schema and relation names. */ + if (rti != top_rti) + { + partnsp = get_namespace_name_or_temp(get_rel_namespace(rte->relid)); + partrel = get_rel_name(rte->relid); + } + + /* OK, we have all the answers we need. Return them to the caller. */ + rid->alias_name = top_rte->eref->aliasname; + rid->occurrence = occurrence; + rid->partnsp = partnsp; + rid->partrel = partrel; + rid->plan_name = root->plan_name; +} + +/* + * Compute a relation identifier for a set of RTIs, except for any RTE_JOIN + * RTIs that may be present. + * + * RTE_JOIN entries are excluded because they cannot be mentioned by plan + * advice. + * + * The caller is responsible for making sure that the tkeys array is large + * enough to store the results. + * + * The return value is the number of identifiers computed. + */ +int +pgpa_compute_identifiers_by_relids(PlannerInfo *root, Bitmapset *relids, + pgpa_identifier *rids) +{ + int count = 0; + int rti = -1; + + while ((rti = bms_next_member(relids, rti)) >= 0) + { + RangeTblEntry *rte = planner_rt_fetch(rti, root); + + if (rte->rtekind == RTE_JOIN) + continue; + pgpa_compute_identifier_by_rti(root, rti, &rids[count++]); + } + + Assert(count > 0); + return count; +} + +/* + * Create an array of range table identifiers for all the non-NULL, + * non-RTE_JOIN entries in the PlannedStmt's range table. 
+ */ +pgpa_identifier * +pgpa_create_identifiers_for_planned_stmt(PlannedStmt *pstmt) +{ + Index rtable_length = list_length(pstmt->rtable); + pgpa_identifier *result = palloc0_array(pgpa_identifier, rtable_length); + Index *top_rti_map; + int rtinfoindex = 0; + SubPlanRTInfo *rtinfo = NULL; + SubPlanRTInfo *nextrtinfo = NULL; + + /* + * Account for relations added by inheritance expansion of partitioned + * tables. + */ + top_rti_map = pgpa_create_top_rti_map(rtable_length, pstmt->rtable, + pstmt->appendRelations); + + /* + * When we begin iterating, we're processing the portion of the range + * table that originated from the top-level PlannerInfo, so subrtinfo is + * NULL. Later, subrtinfo will be the SubPlanRTInfo for the subquery whose + * portion of the range table we are processing. nextrtinfo is always the + * SubPlanRTInfo that follows the current one, if any, so when we're + * processing the top-level query's portion of the range table, the next + * SubPlanRTInfo is the very first one. + */ + if (pstmt->subrtinfos != NULL) + nextrtinfo = linitial(pstmt->subrtinfos); + + /* Main loop over the range table. */ + for (Index rti = 1; rti <= rtable_length; rti++) + { + const char *plan_name; + Index top_rti; + RangeTblEntry *rte; + RangeTblEntry *top_rte; + char *partnsp = NULL; + char *partrel = NULL; + int occurrence; + pgpa_identifier *rid; + + /* + * Advance to the next SubPlanRTInfo, if it's time to do that. + * + * This loop probably shouldn't ever iterate more than once, because + * that would imply that a subquery was planned but added nothing to + * the range table; but let's be defensive and assume it can happen. + */ + while (nextrtinfo != NULL && rti > nextrtinfo->rtoffset) + { + rtinfo = nextrtinfo; + if (++rtinfoindex >= list_length(pstmt->subrtinfos)) + nextrtinfo = NULL; + else + nextrtinfo = list_nth(pstmt->subrtinfos, rtinfoindex); + } + + /* Fetch the range table entry, if any. 
*/ + rte = rt_fetch(rti, pstmt->rtable); + + /* + * We can't and don't need to identify null entries, and we don't want + * to identify join entries. + */ + if (rte == NULL || rte->rtekind == RTE_JOIN) + continue; + + /* + * If this is not a relation added by partitioned table expansion, + * then the top RTI/RTE are just the same as this RTI/RTE. Otherwise, + * we need the information for the top RTI/RTE, and must also fetch + * the partition schema and name. + */ + top_rti = top_rti_map[rti - 1]; + if (rti == top_rti) + top_rte = rte; + else + { + top_rte = rt_fetch(top_rti, pstmt->rtable); + partnsp = + get_namespace_name_or_temp(get_rel_namespace(rte->relid)); + partrel = get_rel_name(rte->relid); + } + + /* Compute the correct occurrence number. */ + occurrence = pgpa_occurrence_number(pstmt->rtable, top_rti_map, + rtinfo, top_rti); + + /* Get the name of the current plan (NULL for toplevel query). */ + plan_name = rtinfo == NULL ? NULL : rtinfo->plan_name; + + /* Save all the details we've derived. */ + rid = &result[rti - 1]; + rid->alias_name = top_rte->eref->aliasname; + rid->occurrence = occurrence; + rid->partnsp = partnsp; + rid->partrel = partrel; + rid->plan_name = plan_name; + } + + return result; +} + +/* + * Search for a pgpa_identifier in the array of identifiers computed for the + * range table. If exactly one match is found, return the matching RTI; else + * return 0. + */ +Index +pgpa_compute_rti_from_identifier(int rtable_length, + pgpa_identifier *rt_identifiers, + pgpa_identifier *rid) +{ + Index result = 0; + + for (Index rti = 1; rti <= rtable_length; ++rti) + { + pgpa_identifier *rti_rid = &rt_identifiers[rti - 1]; + + /* If there's no identifier for this RTI, skip it. */ + if (rti_rid->alias_name == NULL) + continue; + + /* + * If it matches, return this RTI. As usual, an omitted partition + * schema matches anything, but partition and plan names must either + * match exactly or be omitted on both sides. 
*/ + if (strcmp(rid->alias_name, rti_rid->alias_name) == 0 && + rid->occurrence == rti_rid->occurrence && + (rid->partnsp == NULL || rti_rid->partnsp == NULL || + strcmp(rid->partnsp, rti_rid->partnsp) == 0) && + strings_equal_or_both_null(rid->partrel, rti_rid->partrel) && + strings_equal_or_both_null(rid->plan_name, rti_rid->plan_name)) + { + if (result != 0) + { + /* Multiple matches were found. */ + return 0; + } + result = rti; + } + } + + return result; +} + +/* + * Build a mapping from each RTI to the RTI whose alias_name will be used to + * construct the range table identifier. + * + * For child relations, this is the topmost parent that is still of type + * RTE_RELATION. For other relations, it's just the original RTI. + * + * Since we're eventually going to need this information for every RTI in + * the range table, it's best to compute all the answers in a single pass over + * the AppendRelInfo list. Otherwise, we might end up searching through that + * list repeatedly for entries of interest. + * + * Note that the returned array uses zero-based indexing, while RTIs use + * 1-based indexing, so subtract 1 from the RTI before looking it up in the + * array. + */ +static Index * +pgpa_create_top_rti_map(Index rtable_length, List *rtable, List *appinfos) +{ + Index *top_rti_map = palloc0_array(Index, rtable_length); + + /* Initially, make every RTI point to itself. */ + for (Index rti = 1; rti <= rtable_length; ++rti) + top_rti_map[rti - 1] = rti; + + /* Update the map for each AppendRelInfo object. */ + foreach_node(AppendRelInfo, appinfo, appinfos) + { + Index parent_rti = appinfo->parent_relid; + RangeTblEntry *parent_rte = rt_fetch(parent_rti, rtable); + + /* If the parent is not RTE_RELATION, ignore this entry. */ + if (parent_rte->rtekind != RTE_RELATION) + continue; + + /* + * Map the child to wherever we mapped the parent. Parents always + * precede their children in the AppendRelInfo list, so this should + * work out. 
*/ + top_rti_map[appinfo->child_relid - 1] = top_rti_map[parent_rti - 1]; + } + + return top_rti_map; +} + +/* + * Find the occurrence number of a certain relation within a certain subquery. + * + * The same alias name can occur multiple times within a subquery, but we want + * to disambiguate by giving different occurrences different integer indexes. + * However, child tables are disambiguated by including the table name rather + * than by incrementing the occurrence number; and joins are not named and so + * shouldn't increment the occurrence number either. + */ +static int +pgpa_occurrence_number(List *rtable, Index *top_rti_map, + SubPlanRTInfo *rtinfo, Index rti) +{ + Index rtoffset = (rtinfo == NULL) ? 0 : rtinfo->rtoffset; + int occurrence = 1; + RangeTblEntry *rte = rt_fetch(rti, rtable); + + for (Index prior_rti = rtoffset + 1; prior_rti < rti; ++prior_rti) + { + RangeTblEntry *prior_rte; + + /* + * If this is a child rel of a parent that is a relation, skip it. + * + * Such range table entries are disambiguated by mentioning the schema + * and name of the table, not by counting them as separate occurrences + * of the same table. + */ + if (top_rti_map[prior_rti - 1] != prior_rti) + continue; + + /* Skip joins. */ + prior_rte = rt_fetch(prior_rti, rtable); + if (prior_rte->rtekind == RTE_JOIN) + continue; + + /* Skip if the alias name differs. */ + if (strcmp(prior_rte->eref->aliasname, rte->eref->aliasname) != 0) + continue; + + /* Looks like a true duplicate. 
*/ + ++occurrence; + } + + return occurrence; +} diff --git a/contrib/pg_plan_advice/pgpa_identifier.h b/contrib/pg_plan_advice/pgpa_identifier.h new file mode 100644 index 00000000000..b000d2b7081 --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_identifier.h @@ -0,0 +1,52 @@ +/*------------------------------------------------------------------------- + * + * pgpa_identifier.h + * create appropriate identifiers for range table entries + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_identifier.h + * + *------------------------------------------------------------------------- + */ + +#ifndef PGPA_IDENTIFIER_H +#define PGPA_IDENTIFIER_H + +#include "nodes/pathnodes.h" +#include "nodes/plannodes.h" + +typedef struct pgpa_identifier +{ + const char *alias_name; + int occurrence; + const char *partnsp; + const char *partrel; + const char *plan_name; +} pgpa_identifier; + +/* Convenience function for comparing possibly-NULL strings. */ +static inline bool +strings_equal_or_both_null(const char *a, const char *b) +{ + if (a == b) + return true; + else if (a == NULL || b == NULL) + return false; + else + return strcmp(a, b) == 0; +} + +extern const char *pgpa_identifier_string(const pgpa_identifier *rid); +extern void pgpa_compute_identifier_by_rti(PlannerInfo *root, Index rti, + pgpa_identifier *rid); +extern int pgpa_compute_identifiers_by_relids(PlannerInfo *root, + Bitmapset *relids, + pgpa_identifier *rids); +extern pgpa_identifier *pgpa_create_identifiers_for_planned_stmt(PlannedStmt *pstmt); + +extern Index pgpa_compute_rti_from_identifier(int rtable_length, + pgpa_identifier *rt_identifiers, + pgpa_identifier *rid); + +#endif diff --git a/contrib/pg_plan_advice/pgpa_join.c b/contrib/pg_plan_advice/pgpa_join.c new file mode 100644 index 00000000000..ec8e1a666ec --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_join.c @@ -0,0 +1,629 @@ +/*------------------------------------------------------------------------- + * 
+ * pgpa_join.c + * analysis of joins in Plan trees + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_join.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "pgpa_join.h" +#include "pgpa_scan.h" +#include "pgpa_walker.h" + +#include "nodes/pathnodes.h" +#include "nodes/print.h" +#include "parser/parsetree.h" + +/* + * Temporary object used when unrolling a join tree. + */ +struct pgpa_join_unroller +{ + unsigned nallocated; + unsigned nused; + Plan *outer_subplan; + ElidedNode *outer_elided_node; + bool outer_beneath_any_gather; + pgpa_join_strategy *strategy; + Plan **inner_subplans; + ElidedNode **inner_elided_nodes; + pgpa_join_unroller **inner_unrollers; + bool *inner_beneath_any_gather; +}; + +static pgpa_join_strategy pgpa_decompose_join(pgpa_plan_walker_context *walker, + Plan *plan, + Plan **realouter, + Plan **realinner, + ElidedNode **elidedrealouter, + ElidedNode **elidedrealinner, + bool *found_any_outer_gather, + bool *found_any_inner_gather); +static ElidedNode *pgpa_descend_node(PlannedStmt *pstmt, Plan **plan); +static ElidedNode *pgpa_descend_any_gather(PlannedStmt *pstmt, Plan **plan, + bool *found_any_gather); +static bool pgpa_descend_any_unique(PlannedStmt *pstmt, Plan **plan, + ElidedNode **elided_node); + +static bool is_result_node_with_child(Plan *plan); +static bool is_sorting_plan(Plan *plan); + +/* + * Create an initially-empty object for unrolling joins. + * + * This function creates a helper object that can later be used to create a + * pgpa_unrolled_join, after first calling pgpa_unroll_join one or more times. 
+ */ +pgpa_join_unroller * +pgpa_create_join_unroller(void) +{ + pgpa_join_unroller *join_unroller; + + join_unroller = palloc0_object(pgpa_join_unroller); + join_unroller->nallocated = 4; + join_unroller->strategy = + palloc_array(pgpa_join_strategy, join_unroller->nallocated); + join_unroller->inner_subplans = + palloc_array(Plan *, join_unroller->nallocated); + join_unroller->inner_elided_nodes = + palloc_array(ElidedNode *, join_unroller->nallocated); + join_unroller->inner_unrollers = + palloc_array(pgpa_join_unroller *, join_unroller->nallocated); + join_unroller->inner_beneath_any_gather = + palloc_array(bool, join_unroller->nallocated); + + return join_unroller; +} + +/* + * Unroll one level of an unrollable join tree. + * + * Our basic goal here is to unroll join trees as they occur in the Plan + * tree into a simpler and more regular structure that we can more easily + * use for further processing. Unrolling is outer-deep, so if the plan tree + * has Join1(Join2(A,B),Join3(C,D)), the same join unroller object should be + * used for Join1 and Join2, but a different one will be needed for Join3, + * since that involves a join within the *inner* side of another join. + * + * pgpa_plan_walker creates a "top level" join unroller object when it + * encounters a join in a portion of the plan tree in which no join unroller + * is already active. From there, this function is responsible for determining + * to what portion of the plan tree that join unroller applies, and for + * creating any subordinate join unroller objects that are needed as a result + * of non-outer-deep join trees. We do this by returning the join unroller + * objects that should be used for further traversal of the outer and inner + * subtrees of the current plan node via *outer_join_unroller and + * *inner_join_unroller, respectively. 
+ */ +void +pgpa_unroll_join(pgpa_plan_walker_context *walker, Plan *plan, + bool beneath_any_gather, + pgpa_join_unroller *join_unroller, + pgpa_join_unroller **outer_join_unroller, + pgpa_join_unroller **inner_join_unroller) +{ + pgpa_join_strategy strategy; + Plan *realinner, + *realouter; + ElidedNode *elidedinner, + *elidedouter; + int n; + bool found_any_outer_gather = false; + bool found_any_inner_gather = false; + + Assert(join_unroller != NULL); + + /* + * We need to pass the join_unroller object down through certain types of + * plan nodes -- anything that's considered part of the join strategy, and + * any other nodes that can occur in a join tree despite not being scans + * or joins. + * + * This includes: + * + * (1) Materialize, Memoize, and Hash nodes, which are part of the join + * strategy, + * + * (2) Gather and Gather Merge nodes, which can occur at any point in the + * join tree where the planner decided to initiate parallelism, + * + * (3) Sort and IncrementalSort nodes, which can occur beneath MergeJoin + * or GatherMerge, + * + * (4) Agg and Unique nodes, which can occur when we decide to make the + * nullable side of a semijoin unique and then join the result, and + * + * (5) Result nodes with children, which can be added either to project to + * enforce a one-time filter (but Result nodes without children are + * degenerate scans or joins). + */ + if (IsA(plan, Material) || IsA(plan, Memoize) || IsA(plan, Hash) + || IsA(plan, Gather) || IsA(plan, GatherMerge) + || is_sorting_plan(plan) || IsA(plan, Agg) || IsA(plan, Unique) + || is_result_node_with_child(plan)) + { + *outer_join_unroller = join_unroller; + return; + } + + /* + * Since we've already handled nodes that require pass-through treatment, + * this should be an unrollable join. + */ + strategy = pgpa_decompose_join(walker, plan, + &realouter, &realinner, + &elidedouter, &elidedinner, + &found_any_outer_gather, + &found_any_inner_gather); + + /* If our workspace is full, expand it. 
*/ + if (join_unroller->nused >= join_unroller->nallocated) + { + join_unroller->nallocated *= 2; + join_unroller->strategy = + repalloc_array(join_unroller->strategy, + pgpa_join_strategy, + join_unroller->nallocated); + join_unroller->inner_subplans = + repalloc_array(join_unroller->inner_subplans, + Plan *, + join_unroller->nallocated); + join_unroller->inner_elided_nodes = + repalloc_array(join_unroller->inner_elided_nodes, + ElidedNode *, + join_unroller->nallocated); + join_unroller->inner_beneath_any_gather = + repalloc_array(join_unroller->inner_beneath_any_gather, + bool, + join_unroller->nallocated); + join_unroller->inner_unrollers = + repalloc_array(join_unroller->inner_unrollers, + pgpa_join_unroller *, + join_unroller->nallocated); + } + + /* + * Since we're flattening outer-deep join trees, it follows that if the + * outer side is still an unrollable join, it should be unrolled into this + * same object. Otherwise, we've reached the limit of what we can unroll + * into this object and must remember the outer side as the final outer + * subplan. + */ + if (elidedouter == NULL && pgpa_is_join(realouter)) + *outer_join_unroller = join_unroller; + else + { + join_unroller->outer_subplan = realouter; + join_unroller->outer_elided_node = elidedouter; + join_unroller->outer_beneath_any_gather = + beneath_any_gather || found_any_outer_gather; + } + + /* + * Store the inner subplan. If it's an unrollable join, it needs to be + * flattened in turn, but into a new unroller object, not this one. 
+ */ + n = join_unroller->nused++; + join_unroller->strategy[n] = strategy; + join_unroller->inner_subplans[n] = realinner; + join_unroller->inner_elided_nodes[n] = elidedinner; + join_unroller->inner_beneath_any_gather[n] = + beneath_any_gather || found_any_inner_gather; + if (elidedinner == NULL && pgpa_is_join(realinner)) + *inner_join_unroller = pgpa_create_join_unroller(); + else + *inner_join_unroller = NULL; + join_unroller->inner_unrollers[n] = *inner_join_unroller; +} + +/* + * Use the data we've accumulated in a pgpa_join_unroller object to construct + * a pgpa_unrolled_join. + */ +pgpa_unrolled_join * +pgpa_build_unrolled_join(pgpa_plan_walker_context *walker, + pgpa_join_unroller *join_unroller) +{ + pgpa_unrolled_join *ujoin; + int i; + + /* + * We shouldn't have gone even so far as to create a join unroller unless + * we found at least one unrollable join. + */ + Assert(join_unroller->nused > 0); + + /* Allocate result structures. */ + ujoin = palloc0_object(pgpa_unrolled_join); + ujoin->ninner = join_unroller->nused; + ujoin->strategy = palloc0_array(pgpa_join_strategy, join_unroller->nused); + ujoin->inner = palloc0_array(pgpa_join_member, join_unroller->nused); + + /* Handle the outermost join. */ + ujoin->outer.plan = join_unroller->outer_subplan; + ujoin->outer.elided_node = join_unroller->outer_elided_node; + ujoin->outer.scan = + pgpa_build_scan(walker, ujoin->outer.plan, + ujoin->outer.elided_node, + join_unroller->outer_beneath_any_gather, + true); + + /* + * We want the joins from the deepest part of the plan tree to appear + * first in the result object, but the join unroller adds them in exactly + * the reverse of that order, so we need to flip the order of the arrays + * when constructing the final result. + */ + for (i = 0; i < join_unroller->nused; ++i) + { + int k = join_unroller->nused - i - 1; + + /* Copy strategy, Plan, and ElidedNode. 
*/ + ujoin->strategy[i] = join_unroller->strategy[k]; + ujoin->inner[i].plan = join_unroller->inner_subplans[k]; + ujoin->inner[i].elided_node = join_unroller->inner_elided_nodes[k]; + + /* + * Fill in remaining details, using either the nested join unroller, + * or by deriving them from the plan and elided nodes. + */ + if (join_unroller->inner_unrollers[k] != NULL) + ujoin->inner[i].unrolled_join = + pgpa_build_unrolled_join(walker, + join_unroller->inner_unrollers[k]); + else + ujoin->inner[i].scan = + pgpa_build_scan(walker, ujoin->inner[i].plan, + ujoin->inner[i].elided_node, + join_unroller->inner_beneath_any_gather[k], + true); + } + + return ujoin; +} + +/* + * Free memory allocated for pgpa_join_unroller. + */ +void +pgpa_destroy_join_unroller(pgpa_join_unroller *join_unroller) +{ + pfree(join_unroller->strategy); + pfree(join_unroller->inner_subplans); + pfree(join_unroller->inner_elided_nodes); + pfree(join_unroller->inner_unrollers); + pfree(join_unroller); +} + +/* + * Identify the join strategy used by a join and the "real" inner and outer + * plans. + * + * For example, a Hash Join always has a Hash node on the inner side, but + * for all intents and purposes the real inner input is the Hash node's child, + * not the Hash node itself. + * + * Likewise, a Merge Join may have a Sort node on the inner or outer side; if + * it does, the real input to the join is the Sort node's child, not the + * Sort node itself. + * + * In addition, with a Merge Join or a Nested Loop, the join planning code + * may add additional nodes such as Materialize or Memoize. We regard these + * as an aspect of the join strategy. As in the previous cases, the true input + * to the join is the underlying node. 
+ * + * However, if any involved child node previously had a now-elided node stacked + * on top, then we can't "look through" that node -- indeed, what's going to be + * relevant for our purposes is the ElidedNode on top of that plan node, rather + * than the plan node itself. + * + * If there are multiple elided nodes, we want the one that would have been + * uppermost in the plan tree prior to setrefs processing; we expect to find + * that one last in the list of elided nodes. + * + * On return *realouter and *realinner will have been set to the real inner + * and real outer plans that we identified, and *elidedrealouter and + * *elidedrealinner to the last of any corresponding elided nodes. + * Additionally, *found_any_outer_gather and *found_any_inner_gather will + * be set to true if we looked through a Gather or Gather Merge node on + * that side of the join, and false otherwise. + */ +static pgpa_join_strategy +pgpa_decompose_join(pgpa_plan_walker_context *walker, Plan *plan, + Plan **realouter, Plan **realinner, + ElidedNode **elidedrealouter, ElidedNode **elidedrealinner, + bool *found_any_outer_gather, bool *found_any_inner_gather) +{ + PlannedStmt *pstmt = walker->pstmt; + JoinType jointype = ((Join *) plan)->jointype; + Plan *outerplan = plan->lefttree; + Plan *innerplan = plan->righttree; + ElidedNode *elidedouter; + ElidedNode *elidedinner; + pgpa_join_strategy strategy; + bool uniqueouter; + bool uniqueinner; + + elidedouter = pgpa_last_elided_node(pstmt, outerplan); + elidedinner = pgpa_last_elided_node(pstmt, innerplan); + *found_any_outer_gather = false; + *found_any_inner_gather = false; + + switch (nodeTag(plan)) + { + case T_MergeJoin: + + /* + * The planner may have chosen to place a Material node on the + * inner side of the MergeJoin; if this is present, we record it + * as part of the join strategy. 
+ */ + if (elidedinner == NULL && IsA(innerplan, Material)) + { + elidedinner = pgpa_descend_node(pstmt, &innerplan); + strategy = JSTRAT_MERGE_JOIN_MATERIALIZE; + } + else + strategy = JSTRAT_MERGE_JOIN_PLAIN; + + /* + * For a MergeJoin, either the outer or the inner subplan, or + * both, may have needed to be sorted; we must disregard any Sort + * or IncrementalSort node to find the real inner or outer + * subplan. + */ + if (elidedouter == NULL && is_sorting_plan(outerplan)) + elidedouter = pgpa_descend_node(pstmt, &outerplan); + if (elidedinner == NULL && is_sorting_plan(innerplan)) + elidedinner = pgpa_descend_node(pstmt, &innerplan); + break; + + case T_NestLoop: + + /* + * The planner may have chosen to place a Material or Memoize node + * on the inner side of the NestLoop; if this is present, we + * record it as part of the join strategy. + */ + if (elidedinner == NULL && IsA(innerplan, Material)) + { + elidedinner = pgpa_descend_node(pstmt, &innerplan); + strategy = JSTRAT_NESTED_LOOP_MATERIALIZE; + } + else if (elidedinner == NULL && IsA(innerplan, Memoize)) + { + elidedinner = pgpa_descend_node(pstmt, &innerplan); + strategy = JSTRAT_NESTED_LOOP_MEMOIZE; + } + else + strategy = JSTRAT_NESTED_LOOP_PLAIN; + break; + + case T_HashJoin: + + /* + * The inner subplan of a HashJoin is always a Hash node; the real + * inner subplan is the Hash node's child. + */ + Assert(IsA(innerplan, Hash)); + Assert(elidedinner == NULL); + elidedinner = pgpa_descend_node(pstmt, &innerplan); + strategy = JSTRAT_HASH_JOIN; + break; + + default: + elog(ERROR, "unrecognized node type: %d", (int) nodeTag(plan)); + } + + /* + * The planner may have decided to implement a semijoin by first making + * the nullable side of the plan unique, and then performing a normal join + * against the result. Therefore, we might need to descend through a + * unique node on either side of the plan. 
+ */ + uniqueouter = pgpa_descend_any_unique(pstmt, &outerplan, &elidedouter); + uniqueinner = pgpa_descend_any_unique(pstmt, &innerplan, &elidedinner); + + /* + * The planner may have decided to parallelize part of the join tree, so + * we could find a Gather or Gather Merge node here. Note that, if + * present, this will appear below nodes we considered as part of the join + * strategy, but we could find another uniqueness-enforcing node below the + * Gather or Gather Merge, if present. + */ + if (elidedouter == NULL) + { + elidedouter = pgpa_descend_any_gather(pstmt, &outerplan, + found_any_outer_gather); + if (found_any_outer_gather && + pgpa_descend_any_unique(pstmt, &outerplan, &elidedouter)) + uniqueouter = true; + } + if (elidedinner == NULL) + { + elidedinner = pgpa_descend_any_gather(pstmt, &innerplan, + found_any_inner_gather); + if (found_any_inner_gather && + pgpa_descend_any_unique(pstmt, &innerplan, &elidedinner)) + uniqueinner = true; + } + + /* + * It's possible that Result node has been inserted either to project a + * target list or to implement a one-time filter. If so, we can descend + * throught it. Note that a result node without a child would be a + * degenerate scan or join, and not something we could descend through. + * + * XXX. I suspect it's possible for this to happen above the Gather or + * Gather Merge node, too, but apparently we have no test case for that + * scenario. + */ + if (elidedouter == NULL && is_result_node_with_child(outerplan)) + elidedouter = pgpa_descend_node(pstmt, &outerplan); + if (elidedinner == NULL && is_result_node_with_child(innerplan)) + elidedinner = pgpa_descend_node(pstmt, &innerplan); + + /* + * If this is a semijoin that was converted to an inner join by making one + * side or the other unique, make a note that the inner or outer subplan, + * as appropriate, should be treated as a query plan feature when the main + * tree traversal reaches it. 
+ * + * Conversely, if the planner could have made one side of the join unique + * and thereby converted it to an inner join, and chose not to do so, that + * is also worth noting. + * + * NB: This code could appear slightly higher up in in this function, but + * none of the nodes through which we just descended should have + * associated RTIs. + * + * NB: This seems like a somewhat hacky way of passing information up to + * the main tree walk, but I don't currently have a better idea. + */ + if (uniqueouter) + pgpa_add_future_feature(walker, PGPAQF_SEMIJOIN_UNIQUE, outerplan); + else if (jointype == JOIN_RIGHT_SEMI) + pgpa_add_future_feature(walker, PGPAQF_SEMIJOIN_NON_UNIQUE, outerplan); + if (uniqueinner) + pgpa_add_future_feature(walker, PGPAQF_SEMIJOIN_UNIQUE, innerplan); + else if (jointype == JOIN_SEMI) + pgpa_add_future_feature(walker, PGPAQF_SEMIJOIN_NON_UNIQUE, innerplan); + + /* Set output parameters. */ + *realouter = outerplan; + *realinner = innerplan; + *elidedrealouter = elidedouter; + *elidedrealinner = elidedinner; + return strategy; +} + +/* + * Descend through a Plan node in a join tree that the caller has determined + * to be irrelevant. + * + * Updates *plan, and returns the last of any elided nodes pertaining to the + * new plan node. + */ +static ElidedNode * +pgpa_descend_node(PlannedStmt *pstmt, Plan **plan) +{ + *plan = (*plan)->lefttree; + return pgpa_last_elided_node(pstmt, *plan); +} + +/* + * Descend through a Gather or Gather Merge node, if present, and any Sort + * or IncrementalSort node occurring under a Gather Merge. + * + * Caller should have verified that there is no ElidedNode pertaining to + * the initial value of *plan. + * + * Updates *plan, and returns the last of any elided nodes pertaining to the + * new plan node. Sets *found_any_gather = true if either Gather or + * Gather Merge was found, and otherwise leaves it unchanged. 
+ */ +static ElidedNode * +pgpa_descend_any_gather(PlannedStmt *pstmt, Plan **plan, + bool *found_any_gather) +{ + if (IsA(*plan, Gather)) + { + *found_any_gather = true; + return pgpa_descend_node(pstmt, plan); + } + + if (IsA(*plan, GatherMerge)) + { + ElidedNode *elided = pgpa_descend_node(pstmt, plan); + + if (elided == NULL && is_sorting_plan(*plan)) + elided = pgpa_descend_node(pstmt, plan); + + *found_any_gather = true; + return elided; + } + + return NULL; +} + +/* + * If *plan is an Agg or Unique node, we want to descend through it, unless + * it has a corresponding elided node. If its immediate child is a Sort or + * IncrementalSort, we also want to descend through that, unless it has a + * corresponding elided node. + * + * On entry, *elided_node must be the last of any elided nodes corresponding + * to *plan; on exit, this will still be true, but *plan may have been updated. + * + * The reason we don't want to descend through elided nodes is that a single + * join tree can't cross through any sort of elided node: subqueries are + * planned separately, and planning inside an Append or MergeAppend is + * separate from planning outside of it. + * + * The return value is true if we descend through a node that we believe is + * making one side of a semijoin unique, and otherwise false. + */ +static bool +pgpa_descend_any_unique(PlannedStmt *pstmt, Plan **plan, + ElidedNode **elided_node) +{ + bool descend = false; + bool sjunique = false; + + if (*elided_node != NULL) + return sjunique; + + if (IsA(*plan, Unique)) + { + descend = true; + sjunique = true; + } + else if (IsA(*plan, Agg)) + { + /* + * If this is a simple Agg node, then assume it's here to implement + * semijoin uniqueness. Otherwise, assume it's completing an eager + * aggregation or partitionwise aggregation operation that began at a + * higher level of the plan tree. + * + * XXX. 
I suspect this logic does not cover all cases: couldn't SJ + * uniqueness be implemented in two steps with an intermediate Gather? + */ + descend = true; + sjunique = (((Agg *) *plan)->aggsplit == AGGSPLIT_SIMPLE); + } + + if (descend) + { + *elided_node = pgpa_descend_node(pstmt, plan); + + if (*elided_node == NULL && is_sorting_plan(*plan)) + *elided_node = pgpa_descend_node(pstmt, plan); + } + + return sjunique; +} + +/* + * Is this a Result node that has a child? + */ +static bool +is_result_node_with_child(Plan *plan) +{ + return IsA(plan, Result) && plan->lefttree != NULL; +} + +/* + * Is this a Plan node whose purpose is put the data in a certain order? + */ +static bool +is_sorting_plan(Plan *plan) +{ + return IsA(plan, Sort) || IsA(plan, IncrementalSort); +} diff --git a/contrib/pg_plan_advice/pgpa_join.h b/contrib/pg_plan_advice/pgpa_join.h new file mode 100644 index 00000000000..4dc72986a70 --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_join.h @@ -0,0 +1,105 @@ +/*------------------------------------------------------------------------- + * + * pgpa_join.h + * analysis of joins in Plan trees + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_join.h + * + *------------------------------------------------------------------------- + */ +#ifndef PGPA_JOIN_H +#define PGPA_JOIN_H + +#include "nodes/plannodes.h" + +typedef struct pgpa_plan_walker_context pgpa_plan_walker_context; +typedef struct pgpa_join_unroller pgpa_join_unroller; +typedef struct pgpa_unrolled_join pgpa_unrolled_join; + +/* + * Although there are three main join strategies, we try to classify things + * more precisely here: merge joins have the option of using materialization + * on the inner side, and nested loops can use either materialization or + * memoization. 
+ */ +typedef enum +{ + JSTRAT_MERGE_JOIN_PLAIN = 0, + JSTRAT_MERGE_JOIN_MATERIALIZE, + JSTRAT_NESTED_LOOP_PLAIN, + JSTRAT_NESTED_LOOP_MATERIALIZE, + JSTRAT_NESTED_LOOP_MEMOIZE, + JSTRAT_HASH_JOIN + /* update NUM_PGPA_JOIN_STRATEGY if you add anything here */ +} pgpa_join_strategy; + +#define NUM_PGPA_JOIN_STRATEGY ((int) JSTRAT_HASH_JOIN + 1) + +/* + * In an outer-deep join tree, every member of an unrolled join will be a scan, + * but join trees with other shapes can contain unrolled joins. + * + * The plan node we store here will be the inner or outer child of the join + * node, as appropriate, except that we look through subnodes that we regard as + * part of the join method itself. For instance, for a Nested Loop that + * materializes the inner input, we'll store the child of the Materialize node, + * not the Materialize node itself. + * + * If setrefs processing elided one or more nodes from the plan tree, then + * we'll store details about the topmost of those in elided_node; otherwise, + * it will be NULL. + * + * Exactly one of scan and unrolled_join will be non-NULL. + */ +typedef struct +{ + Plan *plan; + ElidedNode *elided_node; + struct pgpa_scan *scan; + pgpa_unrolled_join *unrolled_join; +} pgpa_join_member; + +/* + * We convert outer-deep join trees to a flat structure; that is, ((A JOIN B) + * JOIN C) JOIN D gets converted to outer = A, inner = . When joins + * aren't outer-deep, substructure is required, e.g. (A JOIN B) JOIN (C JOIN D) + * is represented as outer = A, inner = , where X is a pgpa_unrolled_join + * covering C-D. + */ +struct pgpa_unrolled_join +{ + /* Outermost member; must not itself be an unrolled join. */ + pgpa_join_member outer; + + /* Number of inner members. Length of the strategy and inner arrays. */ + unsigned ninner; + + /* Array of strategies, one per non-outermost member. */ + pgpa_join_strategy *strategy; + + /* Array of members, excluding the outermost. Deepest first. 
*/ + pgpa_join_member *inner; +}; + +/* + * Does this plan node inherit from Join? + */ +static inline bool +pgpa_is_join(Plan *plan) +{ + return IsA(plan, NestLoop) || IsA(plan, MergeJoin) || IsA(plan, HashJoin); +} + +extern pgpa_join_unroller *pgpa_create_join_unroller(void); +extern void pgpa_unroll_join(pgpa_plan_walker_context *walker, + Plan *plan, bool beneath_any_gather, + pgpa_join_unroller *join_unroller, + pgpa_join_unroller **outer_join_unroller, + pgpa_join_unroller **inner_join_unroller); +extern pgpa_unrolled_join *pgpa_build_unrolled_join(pgpa_plan_walker_context *walker, + pgpa_join_unroller *join_unroller); +extern void pgpa_destroy_join_unroller(pgpa_join_unroller *join_unroller); + +#endif diff --git a/contrib/pg_plan_advice/pgpa_output.c b/contrib/pg_plan_advice/pgpa_output.c new file mode 100644 index 00000000000..67647acdf5a --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_output.c @@ -0,0 +1,571 @@ +/*------------------------------------------------------------------------- + * + * pgpa_output.c + * produce textual output from the results of a plan tree walk + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_output.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "pgpa_output.h" +#include "pgpa_scan.h" + +#include "nodes/parsenodes.h" +#include "parser/parsetree.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" + +/* + * Context object for textual advice generation. + * + * rt_identifiers is the caller-provided array of range table identifiers. + * See the comments at the top of pgpa_identifier.c for more details. + * + * buf is the caller-provided output buffer. + * + * wrap_column is the wrap column, so that we don't create output that is + * too wide. See pgpa_maybe_linebreak() and comments in pgpa_output_advice. 
+ */ +typedef struct pgpa_output_context +{ + const char **rid_strings; + StringInfo buf; + int wrap_column; +} pgpa_output_context; + +static void pgpa_output_unrolled_join(pgpa_output_context *context, + pgpa_unrolled_join *join); +static void pgpa_output_join_member(pgpa_output_context *context, + pgpa_join_member *member); +static void pgpa_output_scan_strategy(pgpa_output_context *context, + pgpa_scan_strategy strategy, + List *scans); +static void pgpa_output_relation_name(pgpa_output_context *context, Oid relid); +static void pgpa_output_query_feature(pgpa_output_context *context, + pgpa_qf_type type, + List *query_features); +static void pgpa_output_simple_strategy(pgpa_output_context *context, + char *strategy, + List *relid_sets); +static void pgpa_output_no_gather(pgpa_output_context *context, + Bitmapset *relids); +static void pgpa_output_relations(pgpa_output_context *context, StringInfo buf, + Bitmapset *relids); + +static char *pgpa_cstring_join_strategy(pgpa_join_strategy strategy); +static char *pgpa_cstring_scan_strategy(pgpa_scan_strategy strategy); +static char *pgpa_cstring_query_feature_type(pgpa_qf_type type); + +static void pgpa_maybe_linebreak(StringInfo buf, int wrap_column); + +/* + * Append query advice to the provided buffer. + * + * Before calling this function, 'walker' must be used to iterate over the + * main plan tree and all subplans from the PlannedStmt. + * + * 'rt_identifiers' is a table of unique identifiers, one for each RTI. + * See pgpa_create_identifiers_for_planned_stmt(). + * + * Results will be appended to 'buf'. + */ +void +pgpa_output_advice(StringInfo buf, pgpa_plan_walker_context *walker, + pgpa_identifier *rt_identifiers) +{ + Index rtable_length = list_length(walker->pstmt->rtable); + ListCell *lc; + pgpa_output_context context; + + /* Basic initialization. */ + memset(&context, 0, sizeof(pgpa_output_context)); + context.buf = buf; + + /* + * Convert identifiers to string form. 
Note that the loop variable here is + * not an RTI, because RTIs are 1-based. Some RTIs will have no + * identifier, either because the reloptkind is RTE_JOIN or because that + * portion of the query didn't make it into the final plan. + */ + context.rid_strings = palloc0_array(const char *, rtable_length); + for (int i = 0; i < rtable_length; ++i) + if (rt_identifiers[i].alias_name != NULL) + context.rid_strings[i] = pgpa_identifier_string(&rt_identifiers[i]); + + /* + * If the user chooses to use EXPLAIN (PLAN_ADVICE) in an 80-column window + * from a psql client with default settings, psql will add one space to + * the left of the output and EXPLAIN will add two more to the left of the + * advice. Thus, lines of more than 77 characters will wrap. We set the + * wrap limit to 76 here so that the output won't reach all the way to the + * very last column of the terminal. + * + * Of course, this is fairly arbitrary set of assumptions, and one could + * well make an argument for a different wrap limit, or for a configurable + * one. + */ + context.wrap_column = 76; + + /* + * Each piece of JOIN_ORDER() advice fully describes the join order for a + * a single unrolled join. Merging is not permitted, because that would + * change the meaning, e.g. SEQ_SCAN(a b c d) means simply that sequential + * scans should be used for all of those relations, and is thus equivalent + * to SEQ_SCAN(a b) SEQ_SCAN(c d), but JOIN_ORDER(a b c d) means that "a" + * is the driving table which is then joined to "b" then "c" then "d", + * which is totally different from JOIN_ORDER(a b) and JOIN_ORDER(c d). + */ + foreach(lc, walker->toplevel_unrolled_joins) + { + pgpa_unrolled_join *ujoin = lfirst(lc); + + if (buf->len > 0) + appendStringInfoChar(buf, '\n'); + appendStringInfo(context.buf, "JOIN_ORDER("); + pgpa_output_unrolled_join(&context, ujoin); + appendStringInfoChar(context.buf, ')'); + pgpa_maybe_linebreak(context.buf, context.wrap_column); + } + + /* Emit join strategy advice. 
*/ + for (int s = 0; s < NUM_PGPA_JOIN_STRATEGY; ++s) + { + char *strategy = pgpa_cstring_join_strategy(s); + + pgpa_output_simple_strategy(&context, + strategy, + walker->join_strategies[s]); + } + + /* + * Emit scan strategy advice (but not for ordinary scans, which are + * definitionally uninteresting). + */ + for (int c = 0; c < NUM_PGPA_SCAN_STRATEGY; ++c) + if (c != PGPA_SCAN_ORDINARY) + pgpa_output_scan_strategy(&context, c, walker->scans[c]); + + /* Emit query feature advice. */ + for (int t = 0; t < NUM_PGPA_QF_TYPES; ++t) + pgpa_output_query_feature(&context, t, walker->query_features[t]); + + /* Emit NO_GATHER advice. */ + pgpa_output_no_gather(&context, walker->no_gather_scans); +} + +/* + * Output the members of an unrolled join, first the outermost member, and + * then the inner members one by one, as part of JOIN_ORDER() advice. + */ +static void +pgpa_output_unrolled_join(pgpa_output_context *context, + pgpa_unrolled_join *join) +{ + pgpa_output_join_member(context, &join->outer); + + for (int k = 0; k < join->ninner; ++k) + { + pgpa_join_member *member = &join->inner[k]; + + pgpa_maybe_linebreak(context->buf, context->wrap_column); + appendStringInfoChar(context->buf, ' '); + pgpa_output_join_member(context, member); + } +} + +/* + * Output a single member of an unrolled join as part of JOIN_ORDER() advice. 
+ */ +static void +pgpa_output_join_member(pgpa_output_context *context, + pgpa_join_member *member) +{ + if (member->unrolled_join != NULL) + { + appendStringInfoChar(context->buf, '('); + pgpa_output_unrolled_join(context, member->unrolled_join); + appendStringInfoChar(context->buf, ')'); + } + else + { + pgpa_scan *scan = member->scan; + + Assert(scan != NULL); + if (bms_membership(scan->relids) == BMS_SINGLETON) + pgpa_output_relations(context, context->buf, scan->relids); + else + { + appendStringInfoChar(context->buf, '{'); + pgpa_output_relations(context, context->buf, scan->relids); + appendStringInfoChar(context->buf, '}'); + } + } +} + +/* + * Output advice for a List of pgpa_scan objects. + * + * All the scans must use the strategy specified by the "strategy" argument. + */ +static void +pgpa_output_scan_strategy(pgpa_output_context *context, + pgpa_scan_strategy strategy, + List *scans) +{ + bool first = true; + + if (scans == NIL) + return; + + if (context->buf->len > 0) + appendStringInfoChar(context->buf, '\n'); + appendStringInfo(context->buf, "%s(", + pgpa_cstring_scan_strategy(strategy)); + + foreach_ptr(pgpa_scan, scan, scans) + { + Plan *plan = scan->plan; + + if (first) + first = false; + else + { + pgpa_maybe_linebreak(context->buf, context->wrap_column); + appendStringInfoChar(context->buf, ' '); + } + + /* Output the relation identifiers. */ + if (bms_membership(scan->relids) == BMS_SINGLETON) + pgpa_output_relations(context, context->buf, scan->relids); + else + { + appendStringInfoChar(context->buf, '('); + pgpa_output_relations(context, context->buf, scan->relids); + appendStringInfoChar(context->buf, ')'); + } + + /* For index or index-only scans, output index information. 
*/ + if (strategy == PGPA_SCAN_INDEX) + { + Assert(IsA(plan, IndexScan)); + pgpa_maybe_linebreak(context->buf, context->wrap_column); + appendStringInfoChar(context->buf, ' '); + pgpa_output_relation_name(context, ((IndexScan *) plan)->indexid); + } + else if (strategy == PGPA_SCAN_INDEX_ONLY) + { + Assert(IsA(plan, IndexOnlyScan)); + pgpa_maybe_linebreak(context->buf, context->wrap_column); + appendStringInfoChar(context->buf, ' '); + pgpa_output_relation_name(context, + ((IndexOnlyScan *) plan)->indexid); + } + } + + appendStringInfoChar(context->buf, ')'); + pgpa_maybe_linebreak(context->buf, context->wrap_column); +} + +/* + * Output a schema-qualified relation name. + */ +static void +pgpa_output_relation_name(pgpa_output_context *context, Oid relid) +{ + Oid nspoid = get_rel_namespace(relid); + char *relnamespace = get_namespace_name_or_temp(nspoid); + char *relname = get_rel_name(relid); + + appendStringInfoString(context->buf, quote_identifier(relnamespace)); + appendStringInfoChar(context->buf, '.'); + appendStringInfoString(context->buf, quote_identifier(relname)); +} + +/* + * Output advice for a List of pgpa_query_feature objects. + * + * All features must be of the type specified by the "type" argument. 
+ */ +static void +pgpa_output_query_feature(pgpa_output_context *context, pgpa_qf_type type, + List *query_features) +{ + bool first = true; + + if (query_features == NIL) + return; + + if (context->buf->len > 0) + appendStringInfoChar(context->buf, '\n'); + appendStringInfo(context->buf, "%s(", + pgpa_cstring_query_feature_type(type)); + + foreach_ptr(pgpa_query_feature, qf, query_features) + { + if (first) + first = false; + else + { + pgpa_maybe_linebreak(context->buf, context->wrap_column); + appendStringInfoChar(context->buf, ' '); + } + + if (bms_membership(qf->relids) == BMS_SINGLETON) + pgpa_output_relations(context, context->buf, qf->relids); + else + { + appendStringInfoChar(context->buf, '('); + pgpa_output_relations(context, context->buf, qf->relids); + appendStringInfoChar(context->buf, ')'); + } + } + + appendStringInfoChar(context->buf, ')'); + pgpa_maybe_linebreak(context->buf, context->wrap_column); +} + +/* + * Output "simple" advice for a List of Bitmapset objects each of which + * contains one or more RTIs. + * + * By simple, we just mean that the advice emitted follows the most + * straightforward pattern: the strategy name, followed by a list of items + * separated by spaces and surrounded by parentheses. Individual items in + * the list are a single relation identifier for a Bitmapset that contains + * just one member, or a sub-list again separated by spaces and surrounded + * by parentheses for a Bitmapset with multiple members. Bitmapsets with + * no members probably shouldn't occur here, but if they do they'll be + * rendered as an empty sub-list. 
+ */ +static void +pgpa_output_simple_strategy(pgpa_output_context *context, char *strategy, + List *relid_sets) +{ + bool first = true; + + if (relid_sets == NIL) + return; + + if (context->buf->len > 0) + appendStringInfoChar(context->buf, '\n'); + appendStringInfo(context->buf, "%s(", strategy); + + foreach_node(Bitmapset, relids, relid_sets) + { + if (first) + first = false; + else + { + pgpa_maybe_linebreak(context->buf, context->wrap_column); + appendStringInfoChar(context->buf, ' '); + } + + if (bms_membership(relids) == BMS_SINGLETON) + pgpa_output_relations(context, context->buf, relids); + else + { + appendStringInfoChar(context->buf, '('); + pgpa_output_relations(context, context->buf, relids); + appendStringInfoChar(context->buf, ')'); + } + } + + appendStringInfoChar(context->buf, ')'); + pgpa_maybe_linebreak(context->buf, context->wrap_column); +} + +/* + * Output NO_GATHER advice for all relations not appearing beneath any + * Gather or Gather Merge node. + */ +static void +pgpa_output_no_gather(pgpa_output_context *context, Bitmapset *relids) +{ + if (relids == NULL) + return; + if (context->buf->len > 0) + appendStringInfoChar(context->buf, '\n'); + appendStringInfoString(context->buf, "NO_GATHER("); + pgpa_output_relations(context, context->buf, relids); + appendStringInfoChar(context->buf, ')'); +} + +/* + * Output the identifiers for each RTI in the provided set. + * + * Identifiers are separated by spaces, and a line break is possible after + * each one. 
+ */ +static void +pgpa_output_relations(pgpa_output_context *context, StringInfo buf, + Bitmapset *relids) +{ + int rti = -1; + bool first = true; + + while ((rti = bms_next_member(relids, rti)) >= 0) + { + const char *rid_string = context->rid_strings[rti - 1]; + + if (rid_string == NULL) + elog(ERROR, "no identifier for RTI %d", rti); + + if (first) + { + first = false; + appendStringInfoString(buf, rid_string); + } + else + { + pgpa_maybe_linebreak(buf, context->wrap_column); + appendStringInfo(buf, " %s", rid_string); + } + } +} + +/* + * Get a C string that corresponds to the specified join strategy. + */ +static char * +pgpa_cstring_join_strategy(pgpa_join_strategy strategy) +{ + switch (strategy) + { + case JSTRAT_MERGE_JOIN_PLAIN: + return "MERGE_JOIN_PLAIN"; + case JSTRAT_MERGE_JOIN_MATERIALIZE: + return "MERGE_JOIN_MATERIALIZE"; + case JSTRAT_NESTED_LOOP_PLAIN: + return "NESTED_LOOP_PLAIN"; + case JSTRAT_NESTED_LOOP_MATERIALIZE: + return "NESTED_LOOP_MATERIALIZE"; + case JSTRAT_NESTED_LOOP_MEMOIZE: + return "NESTED_LOOP_MEMOIZE"; + case JSTRAT_HASH_JOIN: + return "HASH_JOIN"; + } + + pg_unreachable(); + return NULL; +} + +/* + * Get a C string that corresponds to the specified scan strategy. + */ +static char * +pgpa_cstring_scan_strategy(pgpa_scan_strategy strategy) +{ + switch (strategy) + { + case PGPA_SCAN_ORDINARY: + return "ORDINARY_SCAN"; + case PGPA_SCAN_SEQ: + return "SEQ_SCAN"; + case PGPA_SCAN_BITMAP_HEAP: + return "BITMAP_HEAP_SCAN"; + case PGPA_SCAN_FOREIGN: + return "FOREIGN_JOIN"; + case PGPA_SCAN_INDEX: + return "INDEX_SCAN"; + case PGPA_SCAN_INDEX_ONLY: + return "INDEX_ONLY_SCAN"; + case PGPA_SCAN_PARTITIONWISE: + return "PARTITIONWISE"; + case PGPA_SCAN_TID: + return "TID_SCAN"; + } + + pg_unreachable(); + return NULL; +} + +/* + * Get a C string that corresponds to the specified scan strategy. 
+ */ +static char * +pgpa_cstring_query_feature_type(pgpa_qf_type type) +{ + switch (type) + { + case PGPAQF_GATHER: + return "GATHER"; + case PGPAQF_GATHER_MERGE: + return "GATHER_MERGE"; + case PGPAQF_SEMIJOIN_NON_UNIQUE: + return "SEMIJOIN_NON_UNIQUE"; + case PGPAQF_SEMIJOIN_UNIQUE: + return "SEMIJOIN_UNIQUE"; + } + + + pg_unreachable(); + return NULL; +} + +/* + * Insert a line break into the StringInfoData, if needed. + * + * If wrap_column is zero or negative, this does nothing. Otherwise, we + * consider inserting a newline. We only insert a newline if the length of + * the last line in the buffer exceeds wrap_column, and not if we'd be + * inserting a newline at or before the beginning of the current line. + * + * The position at which the newline is inserted is simply wherever the + * buffer ended the last time this function was called. In other words, + * the caller is expected to call this function every time we reach a good + * place for a line break. + */ +static void +pgpa_maybe_linebreak(StringInfo buf, int wrap_column) +{ + char *trailing_nl; + int line_start; + int save_cursor; + + /* If line wrapping is disabled, exit quickly. */ + if (wrap_column <= 0) + return; + + /* + * Set line_start to the byte offset within buf->data of the first + * character of the current line, where the current line means the last + * one in the buffer. Note that line_start could be the offset of the + * trailing '\0' if the last character in the buffer is a line break. + */ + trailing_nl = strrchr(buf->data, '\n'); + if (trailing_nl == NULL) + line_start = 0; + else + line_start = (trailing_nl - buf->data) + 1; + + /* + * Remember that the current end of the buffer is a potential location to + * insert a line break on a future call to this function. + */ + save_cursor = buf->cursor; + buf->cursor = buf->len; + + /* If we haven't passed the wrap column, we don't need a newline. 
*/ + if (buf->len - line_start <= wrap_column) + return; + + /* + * It only makes sense to insert a newline at a position later than the + * beginning of the current line. + */ + if (buf->cursor <= line_start) + return; + + /* Insert a newline at the previous cursor location. */ + enlargeStringInfo(buf, 1); + memmove(&buf->data[save_cursor] + 1, &buf->data[save_cursor], + buf->len - save_cursor); + ++buf->cursor; + buf->data[++buf->len] = '\0'; + buf->data[save_cursor] = '\n'; +} diff --git a/contrib/pg_plan_advice/pgpa_output.h b/contrib/pg_plan_advice/pgpa_output.h new file mode 100644 index 00000000000..47496d76f52 --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_output.h @@ -0,0 +1,22 @@ +/*------------------------------------------------------------------------- + * + * pgpa_output.h + * produce textual output from the results of a plan tree walk + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_output.c + * + *------------------------------------------------------------------------- + */ +#ifndef PGPA_OUTPUT_H +#define PGPA_OUTPUT_H + +#include "pgpa_identifier.h" +#include "pgpa_walker.h" + +extern void pgpa_output_advice(StringInfo buf, + pgpa_plan_walker_context *walker, + pgpa_identifier *rt_identifiers); + +#endif diff --git a/contrib/pg_plan_advice/pgpa_parser.y b/contrib/pg_plan_advice/pgpa_parser.y new file mode 100644 index 00000000000..4c3a3ed6db9 --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_parser.y @@ -0,0 +1,301 @@ +%{ +/* + * Parser for plan advice + * + * Copyright (c) 2000-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_parser.y + */ + +#include "postgres.h" + +#include +#include + +#include "fmgr.h" +#include "nodes/miscnodes.h" +#include "utils/builtins.h" +#include "utils/float.h" + +#include "pgpa_ast.h" +#include "pgpa_parser.h" + +/* + * Bison doesn't allocate anything that needs to live across parser calls, + * so we can easily have it use palloc 
instead of malloc. This prevents + * memory leaks if we error out during parsing. + */ +#define YYMALLOC palloc +#define YYFREE pfree +%} + +/* BISON Declarations */ +%parse-param {List **result} +%parse-param {char **parse_error_msg_p} +%parse-param {yyscan_t yyscanner} +%lex-param {List **result} +%lex-param {char **parse_error_msg_p} +%lex-param {yyscan_t yyscanner} +%pure-parser +%expect 0 +%name-prefix="pgpa_yy" + +%union +{ + char *str; + int integer; + List *list; + pgpa_advice_item *item; + pgpa_advice_target *target; + pgpa_index_target *itarget; +} +%token TOK_IDENT TOK_TAG_JOIN_ORDER TOK_TAG_INDEX +%token TOK_TAG_SIMPLE TOK_TAG_GENERIC +%token TOK_INTEGER + +%type opt_ri_occurrence +%type advice_item +%type advice_item_list generic_target_list +%type index_target_list join_order_target_list +%type opt_partition simple_target_list +%type identifier opt_plan_name +%type generic_sublist join_order_sublist +%type relation_identifier +%type index_name + +%start parse_toplevel + +/* Grammar follows */ +%% + +parse_toplevel: advice_item_list + { + (void) yynerrs; /* suppress compiler warning */ + *result = $1; + } + ; + +advice_item_list: advice_item_list advice_item + { $$ = lappend($1, $2); } + | + { $$ = NIL; } + ; + +advice_item: TOK_TAG_JOIN_ORDER '(' join_order_target_list ')' + { + $$ = palloc0_object(pgpa_advice_item); + $$->tag = PGPA_TAG_JOIN_ORDER; + $$->targets = $3; + if ($3 == NIL) + pgpa_yyerror(result, parse_error_msg_p, yyscanner, + "JOIN_ORDER must have at least one target"); + } + | TOK_TAG_INDEX '(' index_target_list ')' + { + $$ = palloc0_object(pgpa_advice_item); + if (strcmp($1, "index_only_scan") == 0) + $$->tag = PGPA_TAG_INDEX_ONLY_SCAN; + else if (strcmp($1, "index_scan") == 0) + $$->tag = PGPA_TAG_INDEX_SCAN; + else + elog(ERROR, "tag parsing failed: %s", $1); + $$->targets = $3; + } + | TOK_TAG_SIMPLE '(' simple_target_list ')' + { + $$ = palloc0_object(pgpa_advice_item); + if (strcmp($1, "bitmap_heap_scan") == 0) + $$->tag = 
PGPA_TAG_BITMAP_HEAP_SCAN; + else if (strcmp($1, "no_gather") == 0) + $$->tag = PGPA_TAG_NO_GATHER; + else if (strcmp($1, "seq_scan") == 0) + $$->tag = PGPA_TAG_SEQ_SCAN; + else if (strcmp($1, "tid_scan") == 0) + $$->tag = PGPA_TAG_TID_SCAN; + else + elog(ERROR, "tag parsing failed: %s", $1); + $$->targets = $3; + } + | TOK_TAG_GENERIC '(' generic_target_list ')' + { + bool fail; + + $$ = palloc0_object(pgpa_advice_item); + $$->tag = pgpa_parse_advice_tag($1, &fail); + if (fail) + { + pgpa_yyerror(result, parse_error_msg_p, yyscanner, + "unrecognized advice tag"); + } + + if ($$->tag == PGPA_TAG_FOREIGN_JOIN) + { + foreach_ptr(pgpa_advice_target, target, $3) + { + if (target->ttype == PGPA_TARGET_IDENTIFIER || + list_length(target->children) == 1) + pgpa_yyerror(result, parse_error_msg_p, yyscanner, + "FOREIGN_JOIN targets must contain more than one relation identifier"); + } + } + + $$->targets = $3; + } + ; + +relation_identifier: identifier opt_ri_occurrence opt_partition opt_plan_name + { + $$ = palloc0_object(pgpa_advice_target); + $$->ttype = PGPA_TARGET_IDENTIFIER; + $$->rid.alias_name = $1; + $$->rid.occurrence = $2; + if (list_length($3) == 2) + { + $$->rid.partnsp = linitial($3); + $$->rid.partrel = lsecond($3); + } + else if ($3 != NIL) + $$->rid.partrel = linitial($3); + $$->rid.plan_name = $4; + } + ; + +index_name: identifier + { + $$ = palloc0_object(pgpa_index_target); + $$->indname = $1; + } + | identifier '.' identifier + { + $$ = palloc0_object(pgpa_index_target); + $$->indnamespace = $1; + $$->indname = $3; + } + ; + +opt_ri_occurrence: + '#' TOK_INTEGER + { + if ($2 <= 0) + pgpa_yyerror(result, parse_error_msg_p, yyscanner, + "only positive occurrence numbers are permitted"); + $$ = $2; + } + | + { + /* The default occurrence number is 1. 
*/ + $$ = 1; + } + ; + +identifier: TOK_IDENT + | TOK_TAG_JOIN_ORDER + | TOK_TAG_INDEX + | TOK_TAG_SIMPLE + | TOK_TAG_GENERIC + ; + +/* + * When generating advice, we always schema-qualify the partition name, but + * when parsing advice, we accept a specification that lacks one. + */ +opt_partition: + '/' TOK_IDENT '.' TOK_IDENT + { $$ = list_make2($2, $4); } + | '/' TOK_IDENT + { $$ = list_make1($2); } + | + { $$ = NIL; } + ; + +opt_plan_name: + '@' TOK_IDENT + { $$ = $2; } + | + { $$ = NULL; } + ; + +generic_target_list: generic_target_list relation_identifier + { $$ = lappend($1, $2); } + | generic_target_list generic_sublist + { $$ = lappend($1, $2); } + | + { $$ = NIL; } + ; + +generic_sublist: '(' simple_target_list ')' + { + $$ = palloc0_object(pgpa_advice_target); + $$->ttype = PGPA_TARGET_ORDERED_LIST; + $$->children = $2; + } + ; + +index_target_list: + index_target_list relation_identifier index_name + { + $2->itarget = $3; + $$ = lappend($1, $2); + } + | + { $$ = NIL; } + ; + +join_order_target_list: join_order_target_list relation_identifier + { $$ = lappend($1, $2); } + | join_order_target_list join_order_sublist + { $$ = lappend($1, $2); } + | + { $$ = NIL; } + ; + +join_order_sublist: + '(' join_order_target_list ')' + { + $$ = palloc0_object(pgpa_advice_target); + $$->ttype = PGPA_TARGET_ORDERED_LIST; + $$->children = $2; + } + | '{' simple_target_list '}' + { + $$ = palloc0_object(pgpa_advice_target); + $$->ttype = PGPA_TARGET_UNORDERED_LIST; + $$->children = $2; + } + ; + +simple_target_list: simple_target_list relation_identifier + { $$ = lappend($1, $2); } + | + { $$ = NIL; } + ; + +%% + +/* + * Parse an advice_string and return the resulting list of pgpa_advice_item + * objects. If a parse error occurs, instead return NULL. + * + * If the return value is NULL, *error_p will be set to the error message; + * otherwise, *error_p will be set to NULL. 
+ */ +List * +pgpa_parse(const char *advice_string, char **error_p) +{ + yyscan_t scanner; + List *result; + char *error = NULL; + + pgpa_scanner_init(advice_string, &scanner); + pgpa_yyparse(&result, &error, scanner); + pgpa_scanner_finish(scanner); + + if (error != NULL) + { + *error_p = error; + return NULL; + } + + *error_p = NULL; + return result; +} diff --git a/contrib/pg_plan_advice/pgpa_planner.c b/contrib/pg_plan_advice/pgpa_planner.c new file mode 100644 index 00000000000..1a14ff9fd4b --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_planner.c @@ -0,0 +1,2140 @@ +/*------------------------------------------------------------------------- + * + * pgpa_planner.c + * planner hooks + * + * Copyright (c) 2016-2024, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_planner.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pg_plan_advice.h" +#include "pgpa_collector.h" +#include "pgpa_identifier.h" +#include "pgpa_output.h" +#include "pgpa_planner.h" +#include "pgpa_trove.h" +#include "pgpa_walker.h" + +#include "commands/defrem.h" +#include "common/hashfn_unstable.h" +#include "nodes/makefuncs.h" +#include "optimizer/extendplan.h" +#include "optimizer/pathnode.h" +#include "optimizer/paths.h" +#include "optimizer/plancat.h" +#include "optimizer/planner.h" +#include "parser/parsetree.h" +#include "utils/lsyscache.h" + +#ifdef USE_ASSERT_CHECKING + +/* + * When assertions are enabled, we try generating relation identifiers during + * planning, saving them in a hash table, and then cross-checking them against + * the ones generated after planning is complete. 
+ */ +typedef struct pgpa_ri_checker_key +{ + char *plan_name; + Index rti; +} pgpa_ri_checker_key; + +typedef struct pgpa_ri_checker +{ + pgpa_ri_checker_key key; + uint32 status; + const char *rid_string; +} pgpa_ri_checker; + +static uint32 pgpa_ri_checker_hash_key(pgpa_ri_checker_key key); + +static inline bool +pgpa_ri_checker_compare_key(pgpa_ri_checker_key a, pgpa_ri_checker_key b) +{ + if (a.rti != b.rti) + return false; + if (a.plan_name == NULL) + return (b.plan_name == NULL); + if (b.plan_name == NULL) + return false; + return strcmp(a.plan_name, b.plan_name) == 0; +} + +#define SH_PREFIX pgpa_ri_check +#define SH_ELEMENT_TYPE pgpa_ri_checker +#define SH_KEY_TYPE pgpa_ri_checker_key +#define SH_KEY key +#define SH_HASH_KEY(tb, key) pgpa_ri_checker_hash_key(key) +#define SH_EQUAL(tb, a, b) pgpa_ri_checker_compare_key(a, b) +#define SH_SCOPE static inline +#define SH_DECLARE +#define SH_DEFINE +#include "lib/simplehash.h" + +#endif + +typedef enum pgpa_jo_outcome +{ + PGPA_JO_PERMITTED, /* permit this join order */ + PGPA_JO_DENIED, /* deny this join order */ + PGPA_JO_INDIFFERENT /* do neither */ +} pgpa_jo_outcome; + +typedef struct pgpa_planner_state +{ + ExplainState *explain_state; + bool generate_advice_feedback; + bool generate_advice_string; + pgpa_trove *trove; + MemoryContext trove_cxt; + List *sj_unique_rels; + +#ifdef USE_ASSERT_CHECKING + pgpa_ri_check_hash *ri_check_hash; +#endif +} pgpa_planner_state; + +typedef struct pgpa_join_state +{ + /* Most-recently-considered outer rel. */ + RelOptInfo *outerrel; + + /* Most-recently-considered inner rel. */ + RelOptInfo *innerrel; + + /* + * Array of relation identifiers for all members of this joinrel, with + * outerrel idenifiers before innerrel identifiers. + */ + pgpa_identifier *rids; + + /* Number of outer rel identifiers. */ + int outer_count; + + /* Number of inner rel identifiers. */ + int inner_count; + + /* + * Trove lookup results. 
+ * + * join_entries and rel_entries are arrays of entries, and join_indexes + * and rel_indexes are the integer offsets within those arrays of entries + * potentially relevant to us. The "join" fields correspond to a lookup + * using PGPA_TROVE_LOOKUP_JOIN and the "rel" fields to a lookup using + * PGPA_TROVE_LOOKUP_REL. + */ + pgpa_trove_entry *join_entries; + Bitmapset *join_indexes; + pgpa_trove_entry *rel_entries; + Bitmapset *rel_indexes; +} pgpa_join_state; + +/* Saved hook values */ +static get_relation_info_hook_type prev_get_relation_info = NULL; +static join_path_setup_hook_type prev_join_path_setup = NULL; +static joinrel_setup_hook_type prev_joinrel_setup = NULL; +static planner_setup_hook_type prev_planner_setup = NULL; +static planner_shutdown_hook_type prev_planner_shutdown = NULL; + +/* Other global variabes */ +static int planner_extension_id = -1; + +/* Function prototypes. */ +static void pgpa_planner_setup(PlannerGlobal *glob, Query *parse, + const char *query_string, + int cursorOptions, + double *tuple_fraction, + ExplainState *es); +static void pgpa_planner_shutdown(PlannerGlobal *glob, Query *parse, + const char *query_string, PlannedStmt *pstmt); +static void pgpa_get_relation_info(PlannerInfo *root, + Oid relationObjectId, + bool inhparent, + RelOptInfo *rel); +static void pgpa_joinrel_setup(PlannerInfo *root, + RelOptInfo *joinrel, + RelOptInfo *outerrel, + RelOptInfo *innerrel, + SpecialJoinInfo *sjinfo, + List *restrictlist); +static void pgpa_join_path_setup(PlannerInfo *root, + RelOptInfo *joinrel, + RelOptInfo *outerrel, + RelOptInfo *innerrel, + JoinType jointype, + JoinPathExtraData *extra); +static pgpa_join_state *pgpa_get_join_state(PlannerInfo *root, + RelOptInfo *joinrel, + RelOptInfo *outerrel, + RelOptInfo *innerrel); +static void pgpa_planner_apply_joinrel_advice(uint64 *pgs_mask_p, + char *plan_name, + pgpa_join_state *pjs); +static void pgpa_planner_apply_join_path_advice(JoinType jointype, + uint64 *pgs_mask_p, + char 
*plan_name, + pgpa_join_state *pjs); +static void pgpa_planner_apply_scan_advice(RelOptInfo *rel, + pgpa_trove_entry *scan_entries, + Bitmapset *scan_indexes, + pgpa_trove_entry *rel_entries, + Bitmapset *rel_indexes); +static uint64 pgpa_join_strategy_mask_from_advice_tag(pgpa_advice_tag_type tag); +static pgpa_jo_outcome pgpa_join_order_permits_join(int outer_count, + int inner_count, + pgpa_identifier *rids, + pgpa_trove_entry *entry); +static bool pgpa_join_method_permits_join(int outer_count, int inner_count, + pgpa_identifier *rids, + pgpa_trove_entry *entry, + bool *restrict_method); +static bool pgpa_opaque_join_permits_join(int outer_count, int inner_count, + pgpa_identifier *rids, + pgpa_trove_entry *entry, + bool *restrict_method); +static bool pgpa_semijoin_permits_join(int outer_count, int inner_count, + pgpa_identifier *rids, + pgpa_trove_entry *entry, + bool outer_side_nullable, + bool *restrict_method); + +static List *pgpa_planner_append_feedback(List *list, pgpa_trove *trove, + pgpa_trove_lookup_type type, + pgpa_identifier *rt_identifiers, + pgpa_plan_walker_context *walker); +static void pgpa_planner_feedback_warning(List *feedback); + +static inline void pgpa_ri_checker_save(pgpa_planner_state *pps, + PlannerInfo *root, + RelOptInfo *rel); +static void pgpa_ri_checker_validate(pgpa_planner_state *pps, + PlannedStmt *pstmt); + +static char *pgpa_bms_to_cstring(Bitmapset *bms); +static const char *pgpa_jointype_to_cstring(JoinType jointype); + +/* + * Install planner-related hooks. 
+ */ +void +pgpa_planner_install_hooks(void) +{ + planner_extension_id = GetPlannerExtensionId("pg_plan_advice"); + prev_planner_setup = planner_setup_hook; + planner_setup_hook = pgpa_planner_setup; + prev_planner_shutdown = planner_shutdown_hook; + planner_shutdown_hook = pgpa_planner_shutdown; + prev_get_relation_info = get_relation_info_hook; + get_relation_info_hook = pgpa_get_relation_info; + prev_joinrel_setup = joinrel_setup_hook; + joinrel_setup_hook = pgpa_joinrel_setup; + prev_join_path_setup = join_path_setup_hook; + join_path_setup_hook = pgpa_join_path_setup; +} + +/* + * Carry out whatever setup work we need to do before planning. + */ +static void +pgpa_planner_setup(PlannerGlobal *glob, Query *parse, const char *query_string, + int cursorOptions, double *tuple_fraction, + ExplainState *es) +{ + pgpa_trove *trove = NULL; + pgpa_planner_state *pps; + char *supplied_advice; + bool generate_advice_feedback = false; + bool generate_advice_string = false; + bool needs_pps = false; + + /* + * Decide whether we need to generate an advice string. We must do this if + * the user has told us to do it categorically, or if at least one + * collector is enabled, or if the user has requested it using the EXPLAIN + * (PLAN_ADVICE) option. + */ + generate_advice_string = (pg_plan_advice_always_store_advice_details || + pg_plan_advice_local_collector || + pg_plan_advice_shared_collector || + pg_plan_advice_should_explain(es)); + if (generate_advice_string) + needs_pps = true; + + /* + * If any advice was provided, build a trove of advice for use during + * planning. 
 */ + supplied_advice = pg_plan_advice_get_supplied_query_advice(glob, parse, + query_string, + cursorOptions, + es); + if (supplied_advice != NULL && supplied_advice[0] != '\0') + { + List *advice_items; + char *error; + + /* + * If the supplied advice string comes from pg_plan_advice.advice, + * parsing shouldn't fail here, because we must have previously parsed + * successfully in pg_plan_advice_advice_check_hook. However, it might + * also come from a hook registered via pg_plan_advice_add_advisor, + * and we can't be sure whether that's valid. (Plus, having an error + * check here seems like a good idea anyway, just for safety.) + */ + advice_items = pgpa_parse(supplied_advice, &error); + if (error) + ereport(WARNING, + errmsg("could not parse supplied advice: %s", error)); + + /* + * It's possible that the advice string was non-empty but contained no + * actual advice, e.g. it was all whitespace. + */ + if (advice_items != NIL) + { + trove = pgpa_build_trove(advice_items); + needs_pps = true; + + /* + * If we know that we're running under EXPLAIN, or if the user has + * told us to always do the work, generate advice feedback. + */ + if (es != NULL || pg_plan_advice_feedback_warnings || + pg_plan_advice_always_store_advice_details) + generate_advice_feedback = true; + } + } + +#ifdef USE_ASSERT_CHECKING + + /* + * If asserts are enabled, always build a private state object for + * cross-checks. + */ + needs_pps = true; +#endif + + /* + * We only create and initialize a private state object if it's needed for + * some purpose. That could be (1) recording that we will need to generate + * an advice string, (2) storing a trove of supplied advice, or (3) + * facilitating debugging cross-checks when asserts are enabled. 
+ */ + if (needs_pps) + { + pps = palloc0_object(pgpa_planner_state); + pps->explain_state = es; + pps->generate_advice_feedback = generate_advice_feedback; + pps->generate_advice_string = generate_advice_string; + pps->trove = trove; +#ifdef USE_ASSERT_CHECKING + pps->ri_check_hash = + pgpa_ri_check_create(CurrentMemoryContext, 1024, NULL); +#endif + SetPlannerGlobalExtensionState(glob, planner_extension_id, pps); + } +} + +/* + * Carry out whatever work we want to do after planning is complete. + */ +static void +pgpa_planner_shutdown(PlannerGlobal *glob, Query *parse, + const char *query_string, PlannedStmt *pstmt) +{ + pgpa_planner_state *pps; + pgpa_trove *trove = NULL; + pgpa_plan_walker_context walker = {0}; /* placate compiler */ + bool generate_advice_feedback = false; + bool generate_advice_string = false; + List *pgpa_items = NIL; + pgpa_identifier *rt_identifiers = NULL; + + /* Fetch our private state, set up by pgpa_planner_setup(). */ + pps = GetPlannerGlobalExtensionState(glob, planner_extension_id); + if (pps != NULL) + { + trove = pps->trove; + generate_advice_feedback = pps->generate_advice_feedback; + generate_advice_string = pps->generate_advice_string; + } + + /* + * If we're trying to generate an advice string or if we're trying to + * provide advice feedback, then we will need to create range table + * identifiers. + */ + if (generate_advice_string || generate_advice_feedback) + { + pgpa_plan_walker(&walker, pstmt, pps->sj_unique_rels); + rt_identifiers = pgpa_create_identifiers_for_planned_stmt(pstmt); + } + + /* Generate the advice string, if we need to do so. */ + if (generate_advice_string) + { + char *advice_string; + StringInfoData buf; + + /* Generate a textual advice string. */ + initStringInfo(&buf); + pgpa_output_advice(&buf, &walker, rt_identifiers); + advice_string = buf.data; + + /* If the advice string is empty, don't bother collecting it. 
*/ + if (advice_string[0] != '\0') + pgpa_collect_advice(pstmt->queryId, query_string, advice_string); + + /* Save the advice string in the final plan. */ + pgpa_items = lappend(pgpa_items, + makeDefElem("advice_string", + (Node *) makeString(advice_string), + -1)); + + /* + * If we're trying to provide advice feedback, then we will need to + * analyze how successful the advice was. + */ + if (generate_advice_feedback) + { + List *feedback = NIL; + + /* + * Inject a Node-tree representation of all the trove-entry flags into + * the PlannedStmt. + */ + feedback = pgpa_planner_append_feedback(feedback, + trove, + PGPA_TROVE_LOOKUP_SCAN, + rt_identifiers, &walker); + feedback = pgpa_planner_append_feedback(feedback, + trove, + PGPA_TROVE_LOOKUP_JOIN, + rt_identifiers, &walker); + feedback = pgpa_planner_append_feedback(feedback, + trove, + PGPA_TROVE_LOOKUP_REL, + rt_identifiers, &walker); + + pgpa_items = lappend(pgpa_items, makeDefElem("feedback", + (Node *) feedback, -1)); + + /* If we were asked to generate feedback warnings, do so. */ + if (pg_plan_advice_feedback_warnings) + pgpa_planner_feedback_warning(feedback); + } + + /* Push whatever data we're saving into the PlannedStmt. */ + if (pgpa_items != NIL) + pstmt->extension_state = + lappend(pstmt->extension_state, + makeDefElem("pg_plan_advice", (Node *) pgpa_items, -1)); + + /* + * If assertions are enabled, cross-check the generated range table + * identifiers. + */ + if (pps != NULL) + pgpa_ri_checker_validate(pps, pstmt); +} + +/* + * Hook function for get_relation_info(). + * + * We can apply scan advice at this point, and we also use this as an + * opportunity to do range-table identifier cross-checking in assert-enabled + * builds. + */ +static void +pgpa_get_relation_info(PlannerInfo *root, Oid relationObjectId, + bool inhparent, RelOptInfo *rel) +{ + pgpa_planner_state *pps; + + /* Fetch our private state, set up by pgpa_planner_setup(). 
*/ + pps = GetPlannerGlobalExtensionState(root->glob, planner_extension_id); + + /* Save details needed for range table identifier cross-checking. */ + if (pps != NULL) + pgpa_ri_checker_save(pps, root, rel); + + /* If query advice was provided, search for relevant entries. */ + if (pps != NULL && pps->trove != NULL) + { + pgpa_identifier rid; + pgpa_trove_result tresult_scan; + pgpa_trove_result tresult_rel; + + /* Search for scan advice and general rel advice. */ + pgpa_compute_identifier_by_rti(root, rel->relid, &rid); + pgpa_trove_lookup(pps->trove, PGPA_TROVE_LOOKUP_SCAN, 1, &rid, + &tresult_scan); + pgpa_trove_lookup(pps->trove, PGPA_TROVE_LOOKUP_REL, 1, &rid, + &tresult_rel); + + /* If relevant entries were found, apply them. */ + if (tresult_scan.indexes != NULL || tresult_rel.indexes != NULL) + { + uint64 original_mask = rel->pgs_mask; + + pgpa_planner_apply_scan_advice(rel, + tresult_scan.entries, + tresult_scan.indexes, + tresult_rel.entries, + tresult_rel.indexes); + + /* Emit debugging message, if enabled. */ + if (pg_plan_advice_trace_mask && original_mask != rel->pgs_mask) + ereport(WARNING, + (errmsg("strategy mask for RTI %u changed from 0x%" PRIx64 " to 0x%" PRIx64, + rel->relid, original_mask, rel->pgs_mask))); + } + } + + /* Pass call to previous hook. */ + if (prev_get_relation_info) + (*prev_get_relation_info) (root, relationObjectId, inhparent, rel); +} + +/* + * Enforce any provided advice that is relevant to any method of implementing + * this join. + * + * Although we're passed the outerrel and innerrel here, those are just + * whatever values happened to prompt the creation of this joinrel; they + * shouldn't really influence our choice of what advice to apply. 
+ */ +static void +pgpa_joinrel_setup(PlannerInfo *root, RelOptInfo *joinrel, + RelOptInfo *outerrel, RelOptInfo *innerrel, + SpecialJoinInfo *sjinfo, List *restrictlist) +{ + pgpa_join_state *pjs; + + Assert(bms_membership(joinrel->relids) == BMS_MULTIPLE); + + /* Get our private state information for this join. */ + pjs = pgpa_get_join_state(root, joinrel, outerrel, innerrel); + + /* If there is relevant advice, call a helper function to apply it. */ + if (pjs != NULL) + { + uint64 original_mask = joinrel->pgs_mask; + + pgpa_planner_apply_joinrel_advice(&joinrel->pgs_mask, + root->plan_name, + pjs); + + /* Emit debugging message, if enabled. */ + if (pg_plan_advice_trace_mask && original_mask != joinrel->pgs_mask) + ereport(WARNING, + (errmsg("strategy mask for join on RTIs %s changed from 0x%" PRIx64 " to 0x%" PRIx64, + pgpa_bms_to_cstring(joinrel->relids), + original_mask, + joinrel->pgs_mask))); + } + + /* Pass call to previous hook. */ + if (prev_joinrel_setup) + (*prev_joinrel_setup) (root, joinrel, outerrel, innerrel, + sjinfo, restrictlist); +} + +/* + * Enforce any provided advice that is relevant to this particular method of + * implementing this particular join. + */ +static void +pgpa_join_path_setup(PlannerInfo *root, RelOptInfo *joinrel, + RelOptInfo *outerrel, RelOptInfo *innerrel, + JoinType jointype, JoinPathExtraData *extra) +{ + pgpa_join_state *pjs; + + Assert(bms_membership(joinrel->relids) == BMS_MULTIPLE); + + /* + * If we're considering implementing a semijoin by making one side unique, + * make a note of it in the pgpa_planner_state. See comments for + * pgpa_sj_unique_rel for why we do this. + */ + if (jointype == JOIN_UNIQUE_OUTER || jointype == JOIN_UNIQUE_INNER) + { + pgpa_planner_state *pps; + RelOptInfo *uniquerel; + + uniquerel = jointype == JOIN_UNIQUE_OUTER ? 
outerrel : innerrel; + pps = GetPlannerGlobalExtensionState(root->glob, planner_extension_id); + if (pps != NULL && + (pps->generate_advice_string || pps->generate_advice_feedback)) + { + bool found = false; + + /* Avoid adding duplicates. */ + foreach_ptr(pgpa_sj_unique_rel, ur, pps->sj_unique_rels) + { + /* + * We should always use the same pointer for the same plan + * name, so we need not use strcmp() here. + */ + if (root->plan_name == ur->plan_name && + bms_equal(uniquerel->relids, ur->relids)) + { + found = true; + break; + } + } + + /* If not a duplicate, append to the list. */ + if (!found) + { + pgpa_sj_unique_rel *ur = palloc_object(pgpa_sj_unique_rel); + + ur->plan_name = root->plan_name; + ur->relids = uniquerel->relids; + pps->sj_unique_rels = lappend(pps->sj_unique_rels, ur); + } + } + } + + /* Get our private state information for this join. */ + pjs = pgpa_get_join_state(root, joinrel, outerrel, innerrel); + + /* If there is relevant advice, call a helper function to apply it. */ + if (pjs != NULL) + { + uint64 original_mask = extra->pgs_mask; + + pgpa_planner_apply_join_path_advice(jointype, + &extra->pgs_mask, + root->plan_name, + pjs); + + /* Emit debugging message, if enabled. */ + if (pg_plan_advice_trace_mask && original_mask != extra->pgs_mask) + ereport(WARNING, + (errmsg("strategy mask for %s join on %s with outer %s and inner %s changed from 0x%" PRIx64 " to 0x%" PRIx64, + pgpa_jointype_to_cstring(jointype), + pgpa_bms_to_cstring(joinrel->relids), + pgpa_bms_to_cstring(outerrel->relids), + pgpa_bms_to_cstring(innerrel->relids), + original_mask, + extra->pgs_mask))); + } + + /* Pass call to previous hook. */ + if (prev_join_path_setup) + (*prev_join_path_setup) (root, joinrel, outerrel, innerrel, + jointype, extra); +} + +/* + * Search for advice pertaining to a proposed join. 
+ */ +static pgpa_join_state * +pgpa_get_join_state(PlannerInfo *root, RelOptInfo *joinrel, + RelOptInfo *outerrel, RelOptInfo *innerrel) +{ + pgpa_planner_state *pps; + pgpa_join_state *pjs; + bool new_pjs = false; + + /* Fetch our private state, set up by pgpa_planner_setup(). */ + pps = GetPlannerGlobalExtensionState(root->glob, planner_extension_id); + if (pps == NULL || pps->trove == NULL) + { + /* No advice applies to this query, hence none to this joinrel. */ + return NULL; + } + + /* + * See whether we've previously associated a pgpa_join_state with this + * joinrel. If we have not, we need to try to construct one. If we have, + * then there are two cases: (a) if innerrel and outerrel are unchanged, + * we can simply use it, and (b) if they have changed, we need to rejigger + * the array of identifiers but can still skip the trove lookup. + */ + pjs = GetRelOptInfoExtensionState(joinrel, planner_extension_id); + if (pjs != NULL) + { + if (pjs->join_indexes == NULL && pjs->rel_indexes == NULL) + { + /* + * If there's no potentially relevant advice, then the presence of + * this pgpa_join_state acts like a negative cache entry: it tells + * us not to bother searching the trove for advice, because we + * will not find any. + */ + return NULL; + } + + if (pjs->outerrel == outerrel && pjs->innerrel == innerrel) + { + /* No updates required, so just return. */ + /* XXX. Does this need to do something different under GEQO? */ + return pjs; + } + } + + /* + * If there's no pgpa_join_state yet, we need to allocate one. Trove keys + * will not get built for RTE_JOIN RTEs, so the array may end up being + * larger than needed. It's not worth trying to compute a perfectly + * accurate count here. 
+ */ + if (pjs == NULL) + { + int pessimistic_count = bms_num_members(joinrel->relids); + + pjs = palloc0_object(pgpa_join_state); + pjs->rids = palloc_array(pgpa_identifier, pessimistic_count); + new_pjs = true; + } + + /* + * Either we just allocated a new pgpa_join_state, or the existing one + * needs reconfiguring for a new innerrel and outerrel. The required array + * size can't change, so we can overwrite the existing one. + */ + pjs->outerrel = outerrel; + pjs->innerrel = innerrel; + pjs->outer_count = + pgpa_compute_identifiers_by_relids(root, outerrel->relids, pjs->rids); + pjs->inner_count = + pgpa_compute_identifiers_by_relids(root, innerrel->relids, + pjs->rids + pjs->outer_count); + + /* + * If we allocated a new pgpa_join_state, search our trove of advice for + * relevant entries. The trove lookup will return the same results for + * every outerrel/innerrel combination, so we don't need to repeat that + * work every time. + */ + if (new_pjs) + { + pgpa_trove_result tresult; + + /* Find join entries. */ + pgpa_trove_lookup(pps->trove, PGPA_TROVE_LOOKUP_JOIN, + pjs->outer_count + pjs->inner_count, + pjs->rids, &tresult); + pjs->join_entries = tresult.entries; + pjs->join_indexes = tresult.indexes; + + /* Find rel entries. */ + pgpa_trove_lookup(pps->trove, PGPA_TROVE_LOOKUP_REL, + pjs->outer_count + pjs->inner_count, + pjs->rids, &tresult); + pjs->rel_entries = tresult.entries; + pjs->rel_indexes = tresult.indexes; + + /* Now that the new pgpa_join_state is fully valid, save a pointer. */ + SetRelOptInfoExtensionState(joinrel, planner_extension_id, pjs); + + /* + * If there was no relevant advice found, just return NULL. This + * pgpa_join_state will stick around as a sort of negative cache + * entry, so that future calls for this same joinrel quickly return + * NULL. 
+ */ + if (pjs->join_indexes == NULL && pjs->rel_indexes == NULL) + return NULL; + } + + return pjs; +} + +/* + * Enforce overall restrictions on a join relation that apply uniformly + * regardless of the choice of inner and outer rel. + */ +static void +pgpa_planner_apply_joinrel_advice(uint64 *pgs_mask_p, char *plan_name, + pgpa_join_state *pjs) +{ + int i = -1; + int flags; + bool gather_conflict = false; + uint64 gather_mask = 0; + Bitmapset *gather_partial_match = NULL; + Bitmapset *gather_full_match = NULL; + bool partitionwise_conflict = false; + int partitionwise_outcome = 0; + Bitmapset *partitionwise_partial_match = NULL; + Bitmapset *partitionwise_full_match = NULL; + + /* Iterate over all possibly-relevant advice. */ + while ((i = bms_next_member(pjs->rel_indexes, i)) >= 0) + { + pgpa_trove_entry *entry = &pjs->rel_entries[i]; + pgpa_itm_type itm; + bool full_match = false; + uint64 my_gather_mask = 0; + int my_partitionwise_outcome = 0; /* >0 yes, <0 no */ + + /* + * For GATHER and GATHER_MERGE, if the specified relations exactly + * match this joinrel, do whatever the advice says; otherwise, don't + * allow Gather or Gather Merge at this level. For NO_GATHER, there + * must be a single target relation which must be included in this + * joinrel, so just don't allow Gather or Gather Merge here, full + * stop. 
+ */ + if (entry->tag == PGPA_TAG_NO_GATHER) + { + my_gather_mask = PGS_CONSIDER_NONPARTIAL; + full_match = true; + } + else + { + int total_count; + + total_count = pjs->outer_count + pjs->inner_count; + itm = pgpa_identifiers_match_target(total_count, pjs->rids, + entry->target); + Assert(itm != PGPA_ITM_DISJOINT); + + if (itm == PGPA_ITM_EQUAL) + { + full_match = true; + if (entry->tag == PGPA_TAG_PARTITIONWISE) + my_partitionwise_outcome = 1; + else if (entry->tag == PGPA_TAG_GATHER) + my_gather_mask = PGS_GATHER; + else if (entry->tag == PGPA_TAG_GATHER_MERGE) + my_gather_mask = PGS_GATHER_MERGE; + else + elog(ERROR, "unexpected advice tag: %d", + (int) entry->tag); + } + else + { + if (entry->tag == PGPA_TAG_PARTITIONWISE) + { + my_partitionwise_outcome = -1; + my_gather_mask = PGS_CONSIDER_NONPARTIAL; + } + else if (entry->tag == PGPA_TAG_GATHER || + entry->tag == PGPA_TAG_GATHER_MERGE) + { + my_partitionwise_outcome = -1; + my_gather_mask = PGS_CONSIDER_NONPARTIAL; + } + else + elog(ERROR, "unexpected advice tag: %d", + (int) entry->tag); + } + } + + /* + * If we set my_gather_mask up above, then we (1) make a note if the + * advice conflicted, (2) remember the mask value, and (3) remember + * whether this was a full or partial match. + */ + if (my_gather_mask != 0) + { + if (gather_mask != 0 && gather_mask != my_gather_mask) + gather_conflict = true; + gather_mask = my_gather_mask; + if (full_match) + gather_full_match = bms_add_member(gather_full_match, i); + else + gather_partial_match = bms_add_member(gather_partial_match, i); + } + + /* + * Likewise, if we set my_partitionwise_outcome up above, then we (1) + * make a note if the advice conflicted, (2) remember what the desired + * outcome was, and (3) remember whether this was a full or partial + * match. 
 */ + if (my_partitionwise_outcome != 0) + { + if (partitionwise_outcome != 0 && + partitionwise_outcome != my_partitionwise_outcome) + partitionwise_conflict = true; + partitionwise_outcome = my_partitionwise_outcome; + if (full_match) + partitionwise_full_match = + bms_add_member(partitionwise_full_match, i); + else + partitionwise_partial_match = + bms_add_member(partitionwise_partial_match, i); + } + } + + /* + * Mark every Gather-related piece of advice as partially matched, and if + * the set of targets exactly matched this relation, fully matched. If + * there was a conflict, mark them all as conflicting. + */ + flags = PGPA_TE_MATCH_PARTIAL; + if (gather_conflict) + flags |= PGPA_TE_CONFLICTING; + pgpa_trove_set_flags(pjs->rel_entries, gather_partial_match, flags); + flags |= PGPA_TE_MATCH_FULL; + pgpa_trove_set_flags(pjs->rel_entries, gather_full_match, flags); + + /* Likewise for partitionwise advice. */ + flags = PGPA_TE_MATCH_PARTIAL; + if (partitionwise_conflict) + flags |= PGPA_TE_CONFLICTING; + pgpa_trove_set_flags(pjs->rel_entries, partitionwise_partial_match, flags); + flags |= PGPA_TE_MATCH_FULL; + pgpa_trove_set_flags(pjs->rel_entries, partitionwise_full_match, flags); + + /* + * Enforce restrictions on the Gather/Gather Merge. Only clear bits here, + * so that we still respect the enable_* GUCs. Do nothing if the advice + * conflicts. + */ + if (gather_mask != 0 && !gather_conflict) + { + uint64 all_gather_mask; + + all_gather_mask = + PGS_GATHER | PGS_GATHER_MERGE | PGS_CONSIDER_NONPARTIAL; + *pgs_mask_p &= ~(all_gather_mask & ~gather_mask); + } + + /* + * As above, but for partitionwise advice. + * + * To induce a partitionwise join, we disable all the ordinary means of + * performing a join, so that an Append or MergeAppend path will hopefully + * be chosen. + * + * To prevent one, we just disable Append and MergeAppend. 
Note that we + * must not unset PGS_CONSIDER_PARTITIONWISE even when we don't want a + * partitionwise join here, because we might want one at a higher level + * that is constructed using paths from this level. + */ + if (partitionwise_outcome != 0 && !partitionwise_conflict) + { + if (partitionwise_outcome > 0) + *pgs_mask_p = (*pgs_mask_p & ~PGS_JOIN_ANY); + else + *pgs_mask_p &= ~(PGS_APPEND | PGS_MERGE_APPEND); + } +} + +/* + * Enforce restrictions on the join order or join method. + */ +static void +pgpa_planner_apply_join_path_advice(JoinType jointype, uint64 *pgs_mask_p, + char *plan_name, + pgpa_join_state *pjs) +{ + int i = -1; + Bitmapset *jo_permit_indexes = NULL; + Bitmapset *jo_deny_indexes = NULL; + Bitmapset *jo_deny_rel_indexes = NULL; + Bitmapset *jm_indexes = NULL; + bool jm_conflict = false; + uint32 join_mask = 0; + Bitmapset *sj_permit_indexes = NULL; + Bitmapset *sj_deny_indexes = NULL; + + /* + * Reconsider PARTITIONWISE(...) advice. + * + * We already thought about this for the joinrel as a whole, but in some + * cases, partitionwise advice can also constrain the join order. For + * instance, if the advice says PARTITIONWISE((t1 t2)), we shouldn't build + * join paths for any joinrel that includes t1 or t2 unless it also + * includes the other. In general, the partitionwise operation must have + * already been completed within one side of the current join or the + * other, else the join order is impermissible. + * + * NB: It might seem tempting to try to deal with PARTITIONWISE advice + * entirely in this function, but that doesn't work. Here, we can only + * affect the pgs_mask within a particular JoinPathExtraData, that is, for + * a particular choice of innerrel and outerrel. Partitionwise paths are + * not built that way, so we must set pgs_mask for the RelOptInfo, which + * is best done in pgpa_planner_apply_joinrel_advice. 
+ */ + while ((i = bms_next_member(pjs->rel_indexes, i)) >= 0) + { + pgpa_trove_entry *entry = &pjs->rel_entries[i]; + pgpa_itm_type inner_itm; + pgpa_itm_type outer_itm; + + if (entry->tag != PGPA_TAG_PARTITIONWISE) + continue; + + outer_itm = pgpa_identifiers_match_target(pjs->outer_count, + pjs->rids, entry->target); + if (outer_itm == PGPA_ITM_EQUAL || + outer_itm == PGPA_ITM_TARGETS_ARE_SUBSET) + continue; + + inner_itm = pgpa_identifiers_match_target(pjs->inner_count, + pjs->rids + pjs->outer_count, + entry->target); + if (inner_itm == PGPA_ITM_EQUAL || + inner_itm == PGPA_ITM_TARGETS_ARE_SUBSET) + continue; + + jo_deny_rel_indexes = bms_add_member(jo_deny_rel_indexes, i); + } + + /* Iterate over advice that pertains to the join order and method. */ + i = -1; + while ((i = bms_next_member(pjs->join_indexes, i)) >= 0) + { + pgpa_trove_entry *entry = &pjs->join_entries[i]; + uint32 my_join_mask; + + /* Handle join order advice. */ + if (entry->tag == PGPA_TAG_JOIN_ORDER) + { + pgpa_jo_outcome jo_outcome; + + jo_outcome = pgpa_join_order_permits_join(pjs->outer_count, + pjs->inner_count, + pjs->rids, + entry); + if (jo_outcome == PGPA_JO_PERMITTED) + jo_permit_indexes = bms_add_member(jo_permit_indexes, i); + else if (jo_outcome == PGPA_JO_DENIED) + jo_deny_indexes = bms_add_member(jo_deny_indexes, i); + continue; + } + + /* Handle join method advice. 
*/ + my_join_mask = pgpa_join_strategy_mask_from_advice_tag(entry->tag); + if (my_join_mask != 0) + { + bool permit; + bool restrict_method; + + if (entry->tag == PGPA_TAG_FOREIGN_JOIN) + permit = pgpa_opaque_join_permits_join(pjs->outer_count, + pjs->inner_count, + pjs->rids, + entry, + &restrict_method); + else + permit = pgpa_join_method_permits_join(pjs->outer_count, + pjs->inner_count, + pjs->rids, + entry, + &restrict_method); + if (!permit) + jo_deny_indexes = bms_add_member(jo_deny_indexes, i); + else if (restrict_method) + { + jm_indexes = bms_add_member(jm_indexes, i); + if (join_mask != 0 && join_mask != my_join_mask) + jm_conflict = true; + join_mask = my_join_mask; + } + continue; + } + + /* Handle semijoin uniqueness advice. */ + if (entry->tag == PGPA_TAG_SEMIJOIN_UNIQUE || + entry->tag == PGPA_TAG_SEMIJOIN_NON_UNIQUE) + { + bool outer_side_nullable; + bool restrict_method; + + /* Planner has nullable side of the semijoin on the outer side? */ + outer_side_nullable = (jointype == JOIN_UNIQUE_OUTER || + jointype == JOIN_RIGHT_SEMI); + + if (!pgpa_semijoin_permits_join(pjs->outer_count, + pjs->inner_count, + pjs->rids, + entry, + outer_side_nullable, + &restrict_method)) + jo_deny_indexes = bms_add_member(jo_deny_indexes, i); + else if (restrict_method) + { + bool advice_unique; + bool jt_unique; + bool jt_non_unique; + + /* Advice wants to unique-ify and use a regular join? */ + advice_unique = (entry->tag == PGPA_TAG_SEMIJOIN_UNIQUE); + + /* Planner is trying to unique-ify and use a regular join? */ + jt_unique = (jointype == JOIN_UNIQUE_INNER || + jointype == JOIN_UNIQUE_OUTER); + + /* Planner is trying a semi-join, without unique-ifying? */ + jt_non_unique = (jointype == JOIN_SEMI || + jointype == JOIN_RIGHT_SEMI); + + if (!jt_unique && !jt_non_unique) + { + /* + * This doesn't seem to be a semijoin to which SJ_UNIQUE + * or SJ_NON_UNIQUE can be applied. 
+ */ + entry->flags |= PGPA_TE_INAPPLICABLE; + } + else if (advice_unique != jt_unique) + sj_deny_indexes = bms_add_member(sj_deny_indexes, i); + else + sj_permit_indexes = bms_add_member(sj_permit_indexes, i); + } + continue; + } + } + + /* + * If the advice indicates both that this join order is permissible and + * also that it isn't, then mark advice related to the join order as + * conflicting. + */ + if (jo_permit_indexes != NULL && + (jo_deny_indexes != NULL || jo_deny_rel_indexes != NULL)) + { + pgpa_trove_set_flags(pjs->join_entries, jo_permit_indexes, + PGPA_TE_CONFLICTING); + pgpa_trove_set_flags(pjs->join_entries, jo_deny_indexes, + PGPA_TE_CONFLICTING); + pgpa_trove_set_flags(pjs->rel_entries, jo_deny_rel_indexes, + PGPA_TE_CONFLICTING); + } + + /* + * If more than one join method specification is relevant here and they + * differ, mark them all as conflicting. + */ + if (jm_conflict) + pgpa_trove_set_flags(pjs->join_entries, jm_indexes, + PGPA_TE_CONFLICTING); + + /* If semijoin advice says both yes and no, mark it all as conflicting. */ + if (sj_permit_indexes != NULL && sj_deny_indexes != NULL) + { + pgpa_trove_set_flags(pjs->join_entries, sj_permit_indexes, + PGPA_TE_CONFLICTING); + pgpa_trove_set_flags(pjs->join_entries, sj_deny_indexes, + PGPA_TE_CONFLICTING); + } + + /* + * Enforce restrictions on the join order and join method, and any + * semijoin-related restrictions. Only clear bits here, so that we still + * respect the enable_* GUCs. Do nothing in cases where the advice on a + * single topic conflicts. + */ + if ((jo_deny_indexes != NULL || jo_deny_rel_indexes != NULL) && + jo_permit_indexes == NULL) + *pgs_mask_p &= ~PGS_JOIN_ANY; + if (join_mask != 0 && !jm_conflict) + *pgs_mask_p &= ~(PGS_JOIN_ANY & ~join_mask); + if (sj_deny_indexes != NULL && sj_permit_indexes == NULL) + *pgs_mask_p &= ~PGS_JOIN_ANY; +} + +/* + * Translate an advice tag into a path generation strategy mask. 
+ * + * This function can be called with tag types that don't represent join + * strategies. In such cases, we just return 0, which can't be confused with + * a valid mask. + */ +static uint64 +pgpa_join_strategy_mask_from_advice_tag(pgpa_advice_tag_type tag) +{ + switch (tag) + { + case PGPA_TAG_FOREIGN_JOIN: + return PGS_FOREIGNJOIN; + case PGPA_TAG_MERGE_JOIN_PLAIN: + return PGS_MERGEJOIN_PLAIN; + case PGPA_TAG_MERGE_JOIN_MATERIALIZE: + return PGS_MERGEJOIN_MATERIALIZE; + case PGPA_TAG_NESTED_LOOP_PLAIN: + return PGS_NESTLOOP_PLAIN; + case PGPA_TAG_NESTED_LOOP_MATERIALIZE: + return PGS_NESTLOOP_MATERIALIZE; + case PGPA_TAG_NESTED_LOOP_MEMOIZE: + return PGS_NESTLOOP_MEMOIZE; + case PGPA_TAG_HASH_JOIN: + return PGS_HASHJOIN; + default: + return 0; + } +} + +/* + * Does a certain item of join order advice permit a certain join? + * + * Returns PGPA_JO_DENIED if the advice is incompatible with the proposed + * join order. + * + * Returns PGPA_JO_PERMITTED if the advice specifies exactly the proposed + * join order. This implies that a partitionwise join should not be + * performed at this level; rather, one of the traditional join methods + * should be used. + * + * Returns PGPA_JO_INDIFFERENT if the advice does not care what happens. + * We use this for unordered JOIN_ORDER sublists, which are compatible with + * partitionwise join but do not mandate it. + */ +static pgpa_jo_outcome +pgpa_join_order_permits_join(int outer_count, int inner_count, + pgpa_identifier *rids, + pgpa_trove_entry *entry) +{ + bool loop = true; + bool sublist = false; + int length; + int outer_length; + pgpa_advice_target *target = entry->target; + pgpa_advice_target *prefix_target; + + /* We definitely have at least a partial match for this trove entry. */ + entry->flags |= PGPA_TE_MATCH_PARTIAL; + + /* + * Find the innermost sublist that contains all keys; if no sublist does, + * then continue processing with the toplevel list. 
+ * + * For example, if the advice says JOIN_ORDER(t1 t2 (t3 t4 t5)), then we + * should evaluate joins that only involve t3, t4, and/or t5 against the + * (t3 t4 t5) sublist, and others against the full list. + * + * Note that (1) outermost sublist is always ordered and (2) whenever we + * zoom into an unordered sublist, we instantly return + * PGPA_JO_INDIFFERENT. + */ + while (loop) + { + Assert(target->ttype == PGPA_TARGET_ORDERED_LIST); + + loop = false; + foreach_ptr(pgpa_advice_target, child_target, target->children) + { + pgpa_itm_type itm; + + if (child_target->ttype == PGPA_TARGET_IDENTIFIER) + continue; + + itm = pgpa_identifiers_match_target(outer_count + inner_count, + rids, child_target); + if (itm == PGPA_ITM_EQUAL || itm == PGPA_ITM_KEYS_ARE_SUBSET) + { + if (child_target->ttype == PGPA_TARGET_ORDERED_LIST) + { + target = child_target; + sublist = true; + loop = true; + break; + } + else + { + Assert(child_target->ttype == PGPA_TARGET_UNORDERED_LIST); + return PGPA_JO_INDIFFERENT; + } + } + } + } + + /* + * Try to find a prefix of the selected join order list that is exactly + * equal to the outer side of the proposed join. + */ + length = list_length(target->children); + prefix_target = palloc0_object(pgpa_advice_target); + prefix_target->ttype = PGPA_TARGET_ORDERED_LIST; + for (outer_length = 1; outer_length <= length; ++outer_length) + { + pgpa_itm_type itm; + + /* Avoid leaking memory in every loop iteration. */ + if (prefix_target->children != NULL) + list_free(prefix_target->children); + prefix_target->children = list_copy_head(target->children, + outer_length); + + /* Search, hoping to find an exact match. */ + itm = pgpa_identifiers_match_target(outer_count, rids, prefix_target); + if (itm == PGPA_ITM_EQUAL) + break; + + /* + * If the prefix of the join order list that we're considering + * includes some but not all of the outer rels, we can make the prefix + * longer to find an exact match. 
But if the advice hasn't mentioned + * everything that's part of our outer rel yet, but has mentioned + * things that are not, then this join doesn't match the join order + * list. + */ + if (itm != PGPA_ITM_TARGETS_ARE_SUBSET) + return PGPA_JO_DENIED; + } + + /* + * If the previous loop stopped before the prefix_target included the + * entire join order list, then the next member of the join order list + * must exactly match the inner side of the join. + * + * Example: Given JOIN_ORDER(t1 t2 (t3 t4 t5)), if the outer side of the + * current join includes only t1, then the inner side must be exactly t2; + * if the outer side includes both t1 and t2, then the inner side must + * include exactly t3, t4, and t5. + */ + if (outer_length < length) + { + pgpa_advice_target *inner_target; + pgpa_itm_type itm; + + inner_target = list_nth(target->children, outer_length); + + itm = pgpa_identifiers_match_target(inner_count, rids + outer_count, + inner_target); + + /* + * Before returning, consider whether we need to mark this entry as + * fully matched. If we're considering the full list rather than a + * sublist, and if we found every item but one on the outer side of + * the join and the last item on the inner side of the join, then the + * answer is yes. + */ + if (!sublist && outer_length + 1 == length && itm == PGPA_ITM_EQUAL) + entry->flags |= PGPA_TE_MATCH_FULL; + + return (itm == PGPA_ITM_EQUAL) ? PGPA_JO_PERMITTED : PGPA_JO_DENIED; + } + + /* + * If we get here, then the outer side of the join includes the entirety + * of the join order list. In this case, we behave differently depending + * on whether we're looking at the top-level join order list or sublist. + * At the top-level, we treat the specified list as mandating that the + * actual join order has the given list as a prefix, but a sublist + * requires an exact match.
+ * + * Example: Given JOIN_ORDER(t1 t2 (t3 t4 t5)), we must start by joining + * all five of those relations and in that sequence, but once that is + * done, it's OK to join any other rels that are part of the join problem. + * This allows a user to specify the driving table and perhaps the first + * few things to which it should be joined while leaving the rest of the + * join order up to the optimizer. But it seems like it would be surprising, + * given that specification, if the user could add t6 to the (t3 t4 t5) + * sub-join, so we don't allow that. If we did want to allow it, the logic + * earlier in this function would require substantial adjustment: we could + * allow the t3-t4-t5-t6 join to be built here, but the next step of + * joining t1-t2 to the result would still be rejected. + */ + if (!sublist) + entry->flags |= PGPA_TE_MATCH_FULL; + return sublist ? PGPA_JO_DENIED : PGPA_JO_PERMITTED; +} + +/* + * Does a certain item of join method advice permit a certain join? + * + * Advice such as HASH_JOIN((x y)) means that there should be a hash join with + * exactly x and y on the inner side. Obviously, this means that if we are + * considering a join with exactly x and y on the inner side, we should enforce + * the use of a hash join. However, it also means that we must reject some + * incompatible join orders entirely. For example, a join with exactly x + * and y on the outer side shouldn't be allowed, because such paths might win + * over the advice-driven path on cost. + * + * To accommodate these requirements, this function returns true if the join + * should be allowed and false if it should not. Furthermore, *restrict_method + * is set to true if the join method should be enforced and false if not.
+ */ +static bool +pgpa_join_method_permits_join(int outer_count, int inner_count, + pgpa_identifier *rids, + pgpa_trove_entry *entry, + bool *restrict_method) +{ + pgpa_advice_target *target = entry->target; + pgpa_itm_type inner_itm; + pgpa_itm_type outer_itm; + pgpa_itm_type join_itm; + + /* We definitely have at least a partial match for this trove entry. */ + entry->flags |= PGPA_TE_MATCH_PARTIAL; + + *restrict_method = false; + + /* + * If our inner rel mentions exactly the same relations as the advice + * target, allow the join and enforce the join method restriction. + * + * If our inner rel mentions a superset of the target relations, allow the + * join. The join we care about has already taken place, and this advice + * imposes no further restrictions. + */ + inner_itm = pgpa_identifiers_match_target(inner_count, + rids + outer_count, + target); + if (inner_itm == PGPA_ITM_EQUAL) + { + entry->flags |= PGPA_TE_MATCH_FULL; + *restrict_method = true; + return true; + } + else if (inner_itm == PGPA_ITM_TARGETS_ARE_SUBSET) + return true; + + /* + * If our outer rel mentions a superset of the relations in the advice + * target, no restrictions apply. The join we care about has already taken + * place, and this advice imposes no further restrictions. + * + * On the other hand, if our outer rel mentions exactly the relations + * mentioned in the advice target, the planner is trying to reverse the + * sides of the join as compared with our desired outcome. Reject that. + */ + outer_itm = pgpa_identifiers_match_target(outer_count, + rids, target); + if (outer_itm == PGPA_ITM_TARGETS_ARE_SUBSET) + return true; + else if (outer_itm == PGPA_ITM_EQUAL) + return false; + + /* + * If the advice target mentions only a single relation, the test below + * cannot ever pass, so save some work by exiting now.
+ */ + if (target->ttype == PGPA_TARGET_IDENTIFIER) + return false; + + /* + * If everything in the joinrel appears in the advice target, we're below + * the level of the join we want to control. + * + * For example, HASH_JOIN((x y)) doesn't restrict how x and y can be + * joined. + * + * This lookup shouldn't return PGPA_ITM_DISJOINT, because any such advice + * should not have been returned from the trove in the first place. + */ + join_itm = pgpa_identifiers_match_target(outer_count + inner_count, + rids, target); + Assert(join_itm != PGPA_ITM_DISJOINT); + if (join_itm == PGPA_ITM_KEYS_ARE_SUBSET || + join_itm == PGPA_ITM_EQUAL) + return true; + + /* + * We've already permitted all allowable cases, so reject this. + * + * If we reach this point, then the advice overlaps with this join but + * isn't entirely contained within either side, and there's also at least + * one relation present in the join that isn't mentioned by the advice. + * + * For instance, in the HASH_JOIN((x y)) example, we would reach here if x + * were on one side of the join, y on the other, and at least one of the + * two sides also included some other relation, say t. In that case, + * accepting this join would allow the (x y t) joinrel to contain + * non-disabled paths that do not put (x y) on the inner side of a hash + * join; we could instead end up with something like (x JOIN t) JOIN y. + */ + return false; +} + +/* + * Does advice concerning an opaque join permit a certain join? + * + * By an opaque join, we mean one where the exact mechanism by which the + * join is performed is not visible to PostgreSQL. Currently this is the + * case only for foreign joins: FOREIGN_JOIN((x y z)) means that x, y, and + * z are joined on the remote side, but we know nothing about the join order + * or join methods used over there. 
+ * + * The logic here needs to differ from pgpa_join_method_permits_join because, + * for other join types, the advice target is the set of inner rels; here, it + * includes both inner and outer rels. + */ +static bool +pgpa_opaque_join_permits_join(int outer_count, int inner_count, + pgpa_identifier *rids, + pgpa_trove_entry *entry, + bool *restrict_method) +{ + pgpa_advice_target *target = entry->target; + pgpa_itm_type join_itm; + + /* We definitely have at least a partial match for this trove entry. */ + entry->flags |= PGPA_TE_MATCH_PARTIAL; + + *restrict_method = false; + + join_itm = pgpa_identifiers_match_target(outer_count + inner_count, + rids, target); + if (join_itm == PGPA_ITM_EQUAL) + { + /* + * We have an exact match, and should therefore allow the join and + * enforce the use of the relevant opaque join method. + */ + entry->flags |= PGPA_TE_MATCH_FULL; + *restrict_method = true; + return true; + } + + if (join_itm == PGPA_ITM_KEYS_ARE_SUBSET || + join_itm == PGPA_ITM_TARGETS_ARE_SUBSET) + { + /* + * If join_itm == PGPA_ITM_TARGETS_ARE_SUBSET, then the join we care + * about has already taken place and no further restrictions apply. + * + * If join_itm == PGPA_ITM_KEYS_ARE_SUBSET, we're still building up to + * the join we care about and have not introduced any extraneous + * relations not named in the advice. Note that ForeignScan paths for + * joins are built up from ForeignScan paths from underlying joins and + * scans, so we must not disable this join when considering a subset + * of the relations we ultimately want. + */ + return true; + } + + /* + * The advice overlaps the join, but at least one relation is present in + * the join that isn't mentioned by the advice. We want to disable such + * paths so that we actually push down the join as intended. + */ + return false; +} + +/* + * Does advice concerning a semijoin permit a certain join? 
+ * + * Unlike join method advice, which lists the rels on the inner side of the + * join, semijoin uniqueness advice lists the rels on the nullable side of the + * join. Those can be the same, if the join type is JOIN_UNIQUE_INNER or + * JOIN_SEMI, or they can be different, in case of JOIN_UNIQUE_OUTER or + * JOIN_RIGHT_SEMI. + * + * We don't know here whether the caller specified SEMIJOIN_UNIQUE or + * SEMIJOIN_NON_UNIQUE. The caller should check the join type against the + * advice type if and only if we set *restrict_method to true. + */ +static bool +pgpa_semijoin_permits_join(int outer_count, int inner_count, + pgpa_identifier *rids, + pgpa_trove_entry *entry, + bool outer_is_nullable, + bool *restrict_method) +{ + pgpa_advice_target *target = entry->target; + pgpa_itm_type join_itm; + pgpa_itm_type inner_itm; + pgpa_itm_type outer_itm; + + *restrict_method = false; + + /* We definitely have at least a partial match for this trove entry. */ + entry->flags |= PGPA_TE_MATCH_PARTIAL; + + /* + * If outer rel is the nullable side and contains exactly the same + * relations as the advice target, then the join order is allowable, but + * the caller must check whether the advice tag (either SEMIJOIN_UNIQUE or + * SEMIJOIN_NON_UNIQUE) matches the join type. + * + * If the outer rel is a superset of the target relations, the join we + * care about has already taken place, so we should impose no further + * restrictions. + */ + outer_itm = pgpa_identifiers_match_target(outer_count, + rids, target); + if (outer_itm == PGPA_ITM_EQUAL) + { + entry->flags |= PGPA_TE_MATCH_FULL; + if (outer_is_nullable) + { + *restrict_method = true; + return true; + } + } + else if (outer_itm == PGPA_ITM_TARGETS_ARE_SUBSET) + return true; + + /* As above, but for the inner rel.
*/ + inner_itm = pgpa_identifiers_match_target(inner_count, + rids + outer_count, + target); + if (inner_itm == PGPA_ITM_EQUAL) + { + entry->flags |= PGPA_TE_MATCH_FULL; + if (!outer_is_nullable) + { + *restrict_method = true; + return true; + } + } + else if (inner_itm == PGPA_ITM_TARGETS_ARE_SUBSET) + return true; + + /* + * If everything in the joinrel appears in the advice target, we're below + * the level of the join we want to control. + */ + join_itm = pgpa_identifiers_match_target(outer_count + inner_count, + rids, target); + Assert(join_itm != PGPA_ITM_DISJOINT); + if (join_itm == PGPA_ITM_KEYS_ARE_SUBSET || + join_itm == PGPA_ITM_EQUAL) + return true; + + /* + * We've tested for all allowable possibilities, and so must reject this + * join order. This can happen in two ways. + * + * First, we might be considering a semijoin that overlaps incompletely + * with one or both sides of the join. For example, if the user has + * specified SEMIJOIN_UNIQUE((t1 t2)) or SEMIJOIN_NON_UNIQUE((t1 t2)), we + * should reject a proposed t2-t3 join, since that could not result in a + * final plan compatible with the advice. + * + * Second, we might be considering a semijoin where the advice target + * perfectly matches one side of the join, but it's the wrong one. For + * example, in the example above, we might see a 3-way join between t1, + * t2, and t3, with (t1 t2) on the non-nullable side. That, too, would be + * incompatible with the advice. + */ + return false; +} + +/* + * Apply scan advice to a RelOptInfo.
+ */ +static void +pgpa_planner_apply_scan_advice(RelOptInfo *rel, + pgpa_trove_entry *scan_entries, + Bitmapset *scan_indexes, + pgpa_trove_entry *rel_entries, + Bitmapset *rel_indexes) +{ + bool gather_conflict = false; + Bitmapset *gather_partial_match = NULL; + Bitmapset *gather_full_match = NULL; + int i = -1; + pgpa_trove_entry *scan_entry = NULL; + int flags; + bool scan_type_conflict = false; + Bitmapset *scan_type_indexes = NULL; + Bitmapset *scan_type_rel_indexes = NULL; + uint64 gather_mask = 0; + uint64 scan_type = 0; + + /* Scrutinize available scan advice. */ + while ((i = bms_next_member(scan_indexes, i)) >= 0) + { + pgpa_trove_entry *my_entry = &scan_entries[i]; + uint64 my_scan_type = 0; + + /* Translate our advice tags to a scan strategy advice value. */ + if (my_entry->tag == PGPA_TAG_BITMAP_HEAP_SCAN) + { + /* + * Clearly PGS_CONSIDER_INDEXONLY can suppress Bitmap Heap Scans, + * so don't clear it when such a scan is requested. This happens + * because build_index_scan() thinks that the possibility of an + * index-only scan is a sufficient reason to consider using an + * otherwise-useless index, and get_index_paths() thinks that the + * same paths that are useful for index or index-only scans should + * also be considered for bitmap scans. Perhaps that logic should + * be tightened up, but until then we need to include + * PGS_CONSIDER_INDEXONLY in my_scan_type here. + */ + my_scan_type = PGS_BITMAPSCAN | PGS_CONSIDER_INDEXONLY; + } + else if (my_entry->tag == PGPA_TAG_INDEX_ONLY_SCAN) + my_scan_type = PGS_INDEXONLYSCAN | PGS_CONSIDER_INDEXONLY; + else if (my_entry->tag == PGPA_TAG_INDEX_SCAN) + my_scan_type = PGS_INDEXSCAN; + else if (my_entry->tag == PGPA_TAG_SEQ_SCAN) + my_scan_type = PGS_SEQSCAN; + else if (my_entry->tag == PGPA_TAG_TID_SCAN) + my_scan_type = PGS_TIDSCAN; + + /* + * If this is understandable scan advice, hang on to the entry, the + * inferred scan type, and the index at which we found it.
+ * + * Also make a note if we see conflicting scan type advice. Note that + * we regard two index specifications as conflicting unless they match + * exactly. In theory, perhaps we could regard INDEX_SCAN(a c) and + * INDEX_SCAN(a b.c) as non-conflicting if it happens that the only + * index named c is in schema b, but it doesn't seem worth the code. + */ + if (my_scan_type != 0) + { + if (scan_type != 0 && scan_type != my_scan_type) + scan_type_conflict = true; + if (!scan_type_conflict && scan_entry != NULL && + my_entry->target->itarget != NULL && + scan_entry->target->itarget != NULL && + !pgpa_index_targets_equal(scan_entry->target->itarget, + my_entry->target->itarget)) + scan_type_conflict = true; + scan_entry = my_entry; + scan_type = my_scan_type; + scan_type_indexes = bms_add_member(scan_type_indexes, i); + } + } + + /* Scrutinize available gather-related and partitionwise advice. */ + i = -1; + while ((i = bms_next_member(rel_indexes, i)) >= 0) + { + pgpa_trove_entry *my_entry = &rel_entries[i]; + uint64 my_gather_mask = 0; + bool just_one_rel; + + just_one_rel = my_entry->target->ttype == PGPA_TARGET_IDENTIFIER + || list_length(my_entry->target->children) == 1; + + /* + * PARTITIONWISE behaves like a scan type, except that if there's more + * than one relation targeted, it has no effect at this level. 
+ */ + if (my_entry->tag == PGPA_TAG_PARTITIONWISE) + { + if (just_one_rel) + { + const uint64 my_scan_type = PGS_APPEND | PGS_MERGE_APPEND; + + if (scan_type != 0 && scan_type != my_scan_type) + scan_type_conflict = true; + scan_entry = my_entry; + scan_type = my_scan_type; + scan_type_rel_indexes = + bms_add_member(scan_type_rel_indexes, i); + } + continue; + } + + /* + * GATHER and GATHER_MERGE applied to a single rel mean that we should + * use the corresponding strategy here, while applying either to more + * than one rel means we should not use those strategies here, but + * rather at the level of the joinrel that corresponds to what was + * specified. NO_GATHER can only be applied to single rels. + * + * Note that setting PGS_CONSIDER_NONPARTIAL in my_gather_mask is + * equivalent to allowing the non-use of either form of Gather here. + */ + if (my_entry->tag == PGPA_TAG_GATHER || + my_entry->tag == PGPA_TAG_GATHER_MERGE) + { + if (!just_one_rel) + my_gather_mask = PGS_CONSIDER_NONPARTIAL; + else if (my_entry->tag == PGPA_TAG_GATHER) + my_gather_mask = PGS_GATHER; + else + my_gather_mask = PGS_GATHER_MERGE; + } + else if (my_entry->tag == PGPA_TAG_NO_GATHER) + { + Assert(just_one_rel); + my_gather_mask = PGS_CONSIDER_NONPARTIAL; + } + + /* + * If we set my_gather_mask up above, then we (1) make a note if the + * advice conflicted, (2) remember the mask value, and (3) remember + * whether this was a full or partial match. + */ + if (my_gather_mask != 0) + { + if (gather_mask != 0 && gather_mask != my_gather_mask) + gather_conflict = true; + gather_mask = my_gather_mask; + if (just_one_rel) + gather_full_match = bms_add_member(gather_full_match, i); + else + gather_partial_match = bms_add_member(gather_partial_match, i); + } + } + + /* Enforce choice of index.
*/ + if (scan_entry != NULL && !scan_type_conflict && + (scan_entry->tag == PGPA_TAG_INDEX_SCAN || + scan_entry->tag == PGPA_TAG_INDEX_ONLY_SCAN)) + { + pgpa_index_target *itarget = scan_entry->target->itarget; + IndexOptInfo *matched_index = NULL; + + foreach_node(IndexOptInfo, index, rel->indexlist) + { + char *relname = get_rel_name(index->indexoid); + Oid nspoid = get_rel_namespace(index->indexoid); + char *relnamespace = get_namespace_name_or_temp(nspoid); + + if (strcmp(itarget->indname, relname) == 0 && + (itarget->indnamespace == NULL || + strcmp(itarget->indnamespace, relnamespace) == 0)) + { + matched_index = index; + break; + } + } + + if (matched_index == NULL) + { + /* Don't force the scan type if the index doesn't exist. */ + scan_type = 0; + + /* Mark advice as inapplicable. */ + pgpa_trove_set_flags(scan_entries, scan_type_indexes, + PGPA_TE_INAPPLICABLE); + } + else + { + /* Disable every other index. */ + foreach_node(IndexOptInfo, index, rel->indexlist) + { + if (index != matched_index) + index->disabled = true; + } + } + } + + /* + * Mark all the scan method entries as fully matched; and if they specify + * different things, mark them all as conflicting. + */ + flags = PGPA_TE_MATCH_PARTIAL | PGPA_TE_MATCH_FULL; + if (scan_type_conflict) + flags |= PGPA_TE_CONFLICTING; + pgpa_trove_set_flags(scan_entries, scan_type_indexes, flags); + pgpa_trove_set_flags(rel_entries, scan_type_rel_indexes, flags); + + /* + * Mark every Gather-related piece of advice as partially matched. Mark + * the ones that included this relation as a target by itself as fully + * matched. If there was a conflict, mark them all as conflicting. + */ + flags = PGPA_TE_MATCH_PARTIAL; + if (gather_conflict) + flags |= PGPA_TE_CONFLICTING; + pgpa_trove_set_flags(rel_entries, gather_partial_match, flags); + flags |= PGPA_TE_MATCH_FULL; + pgpa_trove_set_flags(rel_entries, gather_full_match, flags); + + /* + * Enforce restrictions on the scan type and use of Gather/Gather Merge. 
+ * Only clear bits here, so that we still respect the enable_* GUCs. Do + * nothing in cases where the advice on a single topic conflicts. + */ + if (scan_type != 0 && !scan_type_conflict) + { + uint64 all_scan_mask; + + all_scan_mask = PGS_SCAN_ANY | PGS_APPEND | PGS_MERGE_APPEND | + PGS_CONSIDER_INDEXONLY; + rel->pgs_mask &= ~(all_scan_mask & ~scan_type); + } + if (gather_mask != 0 && !gather_conflict) + { + uint64 all_gather_mask; + + all_gather_mask = + PGS_GATHER | PGS_GATHER_MERGE | PGS_CONSIDER_NONPARTIAL; + rel->pgs_mask &= ~(all_gather_mask & ~gather_mask); + } +} + +/* + * Add feedback entries for one trove slice to the provided list and + * return the resulting list. + * + * Feedback entries are generated from the trove entry's flags. It's assumed + * that the caller has already set all relevant flags with the exception of + * PGPA_TE_FAILED. We set that flag here if appropriate. + */ +static List * +pgpa_planner_append_feedback(List *list, pgpa_trove *trove, + pgpa_trove_lookup_type type, + pgpa_identifier *rt_identifiers, + pgpa_plan_walker_context *walker) +{ + pgpa_trove_entry *entries; + int nentries; + StringInfoData buf; + + initStringInfo(&buf); + pgpa_trove_lookup_all(trove, type, &entries, &nentries); + for (int i = 0; i < nentries; ++i) + { + pgpa_trove_entry *entry = &entries[i]; + DefElem *item; + + /* + * If this entry was fully matched, check whether generating advice + * from this plan would produce such an entry. If not, label the entry + * as failed. + */ + if ((entry->flags & PGPA_TE_MATCH_FULL) != 0 && + !pgpa_walker_would_advise(walker, rt_identifiers, + entry->tag, entry->target)) + entry->flags |= PGPA_TE_FAILED; + + item = makeDefElem(pgpa_cstring_trove_entry(entry), + (Node *) makeInteger(entry->flags), -1); + list = lappend(list, item); + } + + return list; +} + +/* + * Emit a WARNING to tell the user about a problem with the supplied plan advice.
+ */ +static void +pgpa_planner_feedback_warning(List *feedback) +{ + StringInfoData detailbuf; + StringInfoData flagbuf; + + /* Quick exit if there's no feedback. */ + if (feedback == NIL) + return; + + /* Initialize buffers. */ + initStringInfo(&detailbuf); + initStringInfo(&flagbuf); + + /* Main loop. */ + foreach_node(DefElem, item, feedback) + { + int flags = defGetInt32(item); + + /* + * Don't emit anything if it was fully matched with no problems found. + * + * NB: Feedback should never be marked fully matched without also + * being marked partially matched. + */ + if (flags == (PGPA_TE_MATCH_PARTIAL | PGPA_TE_MATCH_FULL)) + continue; + + /* + * Terminate each detail line except the last with a newline. This is + * also a convenient place to reset flagbuf. + */ + if (detailbuf.len > 0) + { + appendStringInfoChar(&detailbuf, '\n'); + resetStringInfo(&flagbuf); + } + + /* Generate output. */ + pgpa_trove_append_flags(&flagbuf, flags); + appendStringInfo(&detailbuf, _("advice %s feedback is \"%s\""), + item->defname, flagbuf.data); + } + + /* Emit the warning, if any problems were found. */ + if (detailbuf.len > 0) + ereport(WARNING, + errmsg("supplied plan advice was not enforced"), + errdetail("%s", detailbuf.data)); +} + +#ifdef USE_ASSERT_CHECKING + +/* + * Fast hash function for a key consisting of an RTI and plan name. + */ +static uint32 +pgpa_ri_checker_hash_key(pgpa_ri_checker_key key) +{ + fasthash_state hs; + int sp_len; + + fasthash_init(&hs, 0); + + hs.accum = key.rti; + fasthash_combine(&hs); + + /* plan_name can be NULL */ + if (key.plan_name == NULL) + sp_len = 0; + else + sp_len = fasthash_accum_cstring(&hs, key.plan_name); + + /* hashfn_unstable.h recommends using string length as tweak */ + return fasthash_final32(&hs, sp_len); +} + +#endif + +/* + * Save the range table identifier for one relation for future cross-checking. 
+ */ +static void +pgpa_ri_checker_save(pgpa_planner_state *pps, PlannerInfo *root, + RelOptInfo *rel) +{ +#ifdef USE_ASSERT_CHECKING + pgpa_ri_checker_key key; + pgpa_ri_checker *check; + pgpa_identifier rid; + const char *rid_string; + bool found; + + key.rti = bms_singleton_member(rel->relids); + key.plan_name = root->plan_name; + pgpa_compute_identifier_by_rti(root, key.rti, &rid); + rid_string = pgpa_identifier_string(&rid); + check = pgpa_ri_check_insert(pps->ri_check_hash, key, &found); + Assert(!found || strcmp(check->rid_string, rid_string) == 0); + check->rid_string = rid_string; +#endif +} + +/* + * Validate that the range table identifiers we were able to generate during + * planning match the ones we generated from the final plan. + */ +static void +pgpa_ri_checker_validate(pgpa_planner_state *pps, PlannedStmt *pstmt) +{ +#ifdef USE_ASSERT_CHECKING + pgpa_identifier *rt_identifiers; + pgpa_ri_check_iterator it; + pgpa_ri_checker *check; + + /* Create identifiers from the planned statement. */ + rt_identifiers = pgpa_create_identifiers_for_planned_stmt(pstmt); + + /* Iterate over identifiers created during planning, so we can compare. */ + pgpa_ri_check_start_iterate(pps->ri_check_hash, &it); + while ((check = pgpa_ri_check_iterate(pps->ri_check_hash, &it)) != NULL) + { + int rtoffset = 0; + const char *rid_string; + Index flat_rti; + + /* + * If there's no plan name associated with this entry, then the + * rtoffset is 0. Otherwise, we can search the SubPlanRTInfo list to + * find the rtoffset. + */ + if (check->key.plan_name != NULL) + { + foreach_node(SubPlanRTInfo, rtinfo, pstmt->subrtinfos) + { + /* + * If rtinfo->dummy is set, then the subquery's range table + * will only have been partially copied to the final range + * table. Specifically, only RTE_RELATION entries and + * RTE_SUBQUERY entries that were once RTE_RELATION entries + * will be copied, as per add_rtes_to_flat_rtable. 
Therefore, + * there's no fixed rtoffset that we can apply to the RTIs + * used during planning to locate the corresponding relations + * in the final rtable. + * + * With more complex logic, we could work around that problem + * by remembering the whole contents of the subquery's rtable + * during planning, determining which of those would have been + * copied to the final rtable, and matching them up. But it + * doesn't seem like a worthwhile endeavor for right now, + * because RTIs from such subqueries won't appear in the plan + * tree itself, just in the range table. Hence, we can neither + * generate nor accept advice for them. + */ + if (strcmp(check->key.plan_name, rtinfo->plan_name) == 0 + && !rtinfo->dummy) + { + rtoffset = rtinfo->rtoffset; + Assert(rtoffset > 0); + break; + } + } + + /* + * It's not an error if we don't find the plan name: that just + * means that we planned a subplan by this name but it ended up + * being a dummy subplan and so wasn't included in the final plan + * tree. + */ + if (rtoffset == 0) + continue; + } + + /* + * check->key.rti is the RTI that we saw prior to range-table + * flattening, so we must add the appropriate RT offset to get the + * final RTI. + */ + flat_rti = check->key.rti + rtoffset; + Assert(flat_rti <= list_length(pstmt->rtable)); + + /* Assert that the string we compute now matches the previous one. */ + rid_string = pgpa_identifier_string(&rt_identifiers[flat_rti - 1]); + Assert(strcmp(rid_string, check->rid_string) == 0); + } +#endif +} + +/* + * Convert a bitmapset to a C string of comma-separated integers. + */ +static char * +pgpa_bms_to_cstring(Bitmapset *bms) +{ + StringInfoData buf; + int x = -1; + + if (bms_is_empty(bms)) + return "none"; + + initStringInfo(&buf); + while ((x = bms_next_member(bms, x)) >= 0) + { + if (buf.len > 0) + appendStringInfo(&buf, ", %d", x); + else + appendStringInfo(&buf, "%d", x); + } + + return buf.data; +} + +/* + * Convert a JoinType to a C string. 
+ */ +static const char * +pgpa_jointype_to_cstring(JoinType jointype) +{ + switch (jointype) + { + case JOIN_INNER: + return "inner"; + case JOIN_LEFT: + return "left"; + case JOIN_FULL: + return "full"; + case JOIN_RIGHT: + return "right"; + case JOIN_SEMI: + return "semi"; + case JOIN_ANTI: + return "anti"; + case JOIN_RIGHT_SEMI: + return "right semi"; + case JOIN_RIGHT_ANTI: + return "right anti"; + case JOIN_UNIQUE_OUTER: + return "unique outer"; + case JOIN_UNIQUE_INNER: + return "unique inner"; + } + return "???"; +} diff --git a/contrib/pg_plan_advice/pgpa_planner.h b/contrib/pg_plan_advice/pgpa_planner.h new file mode 100644 index 00000000000..7d40b910b00 --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_planner.h @@ -0,0 +1,17 @@ +/*------------------------------------------------------------------------- + * + * pgpa_planner.h + * planner hooks + * + * Copyright (c) 2016-2024, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_planner.h + * + *------------------------------------------------------------------------- + */ +#ifndef PGPA_PLANNER_H +#define PGPA_PLANNER_H + +extern void pgpa_planner_install_hooks(void); + +#endif diff --git a/contrib/pg_plan_advice/pgpa_scan.c b/contrib/pg_plan_advice/pgpa_scan.c new file mode 100644 index 00000000000..75d1a3efa36 --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_scan.c @@ -0,0 +1,288 @@ +/*------------------------------------------------------------------------- + * + * pgpa_scan.c + * analysis of scans in Plan trees + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_scan.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pgpa_scan.h" +#include "pgpa_walker.h" + +#include "nodes/parsenodes.h" +#include "parser/parsetree.h" + +static pgpa_scan *pgpa_make_scan(pgpa_plan_walker_context *walker, Plan *plan, + pgpa_scan_strategy strategy, + Bitmapset *relids); + + 
+static RTEKind unique_nonjoin_rtekind(Bitmapset *relids, List *rtable); + +/* + * Build a pgpa_scan object for a Plan node and update the plan walker + * context as appopriate. If this is an Append or MergeAppend scan, also + * build pgpa_scan for any scans that were consolidated into this one by + * Append/MergeAppend pull-up. + * + * If there is at least one ElidedNode for this plan node, pass the uppermost + * one as elided_node, else pass NULL. + * + * Set the 'beneath_any_gather' node if we are underneath a Gather or + * Gather Merge node (except for a single-copy Gather node, for which + * GATHER or GATHER_MERGE advice should not be emitted). + * + * Set the 'within_join_problem' flag if we're inside of a join problem and + * not otherwise. + */ +pgpa_scan * +pgpa_build_scan(pgpa_plan_walker_context *walker, Plan *plan, + ElidedNode *elided_node, + bool beneath_any_gather, bool within_join_problem) +{ + pgpa_scan_strategy strategy = PGPA_SCAN_ORDINARY; + Bitmapset *relids = NULL; + int rti = -1; + List *child_append_relid_sets = NIL; + + if (elided_node != NULL) + { + NodeTag elided_type = elided_node->elided_type; + + /* + * If setrefs processing elided an Append or MergeAppend node that had + * only one surviving child, it might be a partitionwise operation, + * but then this is either a setop over subqueries, or a partitionwise + * operation (which might be a scan or a join in reality, but here we + * don't care about the distinction and consider it simply a scan). + * + * A setop over subqueries, or a trivial SubQueryScan that was elided, + * is an "ordinary" scan i.e. one for which we need to generate advice + * because the planner has not made any meaningful choice. 
+	 */
+		relids = elided_node->relids;
+		if ((elided_type == T_Append || elided_type == T_MergeAppend) &&
+			unique_nonjoin_rtekind(relids,
+								   walker->pstmt->rtable) == RTE_RELATION)
+			strategy = PGPA_SCAN_PARTITIONWISE;
+		else
+			strategy = PGPA_SCAN_ORDINARY;
+
+		/* Join RTIs can be present, but advice never refers to them. */
+		relids = pgpa_filter_out_join_relids(relids, walker->pstmt->rtable);
+	}
+	else if ((rti = pgpa_scanrelid(plan)) != 0)
+	{
+		relids = bms_make_singleton(rti);
+
+		switch (nodeTag(plan))
+		{
+			case T_SeqScan:
+				strategy = PGPA_SCAN_SEQ;
+				break;
+			case T_BitmapHeapScan:
+				strategy = PGPA_SCAN_BITMAP_HEAP;
+				break;
+			case T_IndexScan:
+				strategy = PGPA_SCAN_INDEX;
+				break;
+			case T_IndexOnlyScan:
+				strategy = PGPA_SCAN_INDEX_ONLY;
+				break;
+			case T_TidScan:
+			case T_TidRangeScan:
+				strategy = PGPA_SCAN_TID;
+				break;
+			default:
+
+				/*
+				 * This case includes a ForeignScan targeting a single
+				 * relation; no other strategy is possible in that case, but
+				 * see below, where things are different in multi-relation
+				 * cases.
+				 */
+				strategy = PGPA_SCAN_ORDINARY;
+				break;
+		}
+	}
+	else if ((relids = pgpa_relids(plan)) != NULL)
+	{
+		switch (nodeTag(plan))
+		{
+			case T_ForeignScan:
+
+				/*
+				 * If multiple relations are being targeted by a single
+				 * foreign scan, then the foreign join has been pushed to the
+				 * remote side, and we want that to be reflected in the
+				 * generated advice.
+				 */
+				strategy = PGPA_SCAN_FOREIGN;
+				break;
+			case T_Append:
+
+				/*
+				 * Append nodes can represent partitionwise scans of a
+				 * relation, but when they implement a set operation, they are
+				 * just ordinary scans.
+				 */
+				if (unique_nonjoin_rtekind(relids, walker->pstmt->rtable)
+					== RTE_RELATION)
+					strategy = PGPA_SCAN_PARTITIONWISE;
+				else
+					strategy = PGPA_SCAN_ORDINARY;
+
+				/* Be sure to account for pulled-up scans. */
+				child_append_relid_sets =
+					((Append *) plan)->child_append_relid_sets;
+				break;
+			case T_MergeAppend:
+				/* Same logic here as for Append, above.
*/ + if (unique_nonjoin_rtekind(relids, walker->pstmt->rtable) + == RTE_RELATION) + strategy = PGPA_SCAN_PARTITIONWISE; + else + strategy = PGPA_SCAN_ORDINARY; + + /* Be sure to account for pulled-up scans. */ + child_append_relid_sets = + ((MergeAppend *) plan)->child_append_relid_sets; + break; + default: + strategy = PGPA_SCAN_ORDINARY; + break; + } + + + /* Join RTIs can be present, but advice never refers to them. */ + relids = pgpa_filter_out_join_relids(relids, walker->pstmt->rtable); + } + + /* + * If this is an Append or MergeAppend node into which subordinate Append + * or MergeAppend paths were merged, each of those merged paths is + * effectively another scan for which we need to account. + */ + foreach_node(Bitmapset, child_relids, child_append_relid_sets) + { + Bitmapset *child_nonjoin_relids; + + child_nonjoin_relids = + pgpa_filter_out_join_relids(child_relids, + walker->pstmt->rtable); + (void) pgpa_make_scan(walker, plan, strategy, + child_nonjoin_relids); + } + + /* + * If this plan node has no associated RTIs, it's not a scan. When the + * 'within_join_problem' flag is set, that's unexpected, so throw an + * error, else return quietly. + */ + if (relids == NULL) + { + if (within_join_problem) + elog(ERROR, "plan node has no RTIs: %d", (int) nodeTag(plan)); + return NULL; + } + + /* + * Add the appropriate set of RTIs to walker->no_gather_scans. + * + * Add nothing if we're beneath a Gather or Gather Merge node, since + * NO_GATHER advice is clearly inappropriate in that situation. + * + * Add nothing if this is an Append or MergeAppend node, we'll emit + * NO_GATHER() for the underlying scan, which is good enough. + * + * Add nothing if this is an elided node. If it's an elided Append or + * MergeAppend node, the same argument applies as for a non-elided Append + * or MergeAppend. 
An elided SubqueryScan is likely to have underlying + * tables as well, but even if it doesn't, emitting NO_GATHER() for a + * non-RTE_RELATION won't work anyway, since get_relation_info() isn't + * called in such cases. + * + * In fact, we need to filter out any non-RTE_RELATION RTIs for exactly + * this reason, and avoid adding them to the no_gather_scans set. + */ + if (!beneath_any_gather && elided_node == NULL && + !IsA(plan, Append) && !IsA(plan, MergeAppend)) + { + int no_gather_rti = -1; + + while ((no_gather_rti = bms_next_member(relids, no_gather_rti)) >= 0) + { + RangeTblEntry *rte; + + rte = rt_fetch(no_gather_rti, walker->pstmt->rtable); + if (rte->rtekind == RTE_RELATION) + walker->no_gather_scans = + bms_add_member(walker->no_gather_scans, no_gather_rti); + } + } + + /* Caller tells us whether NO_GATHER() advice for this scan is needed. */ + return pgpa_make_scan(walker, plan, strategy, relids); +} + +/* + * Create a single pgpa_scan object and update the pgpa_plan_walker_context. + */ +static pgpa_scan * +pgpa_make_scan(pgpa_plan_walker_context *walker, Plan *plan, + pgpa_scan_strategy strategy, Bitmapset *relids) +{ + pgpa_scan *scan; + + /* Create the scan object. */ + scan = palloc(sizeof(pgpa_scan)); + scan->plan = plan; + scan->strategy = strategy; + scan->relids = relids; + + /* Add it to the appropriate list. */ + walker->scans[scan->strategy] = lappend(walker->scans[scan->strategy], + scan); + + return scan; +} + +/* + * Determine the unique rtekind of a set of relids. + */ +static RTEKind +unique_nonjoin_rtekind(Bitmapset *relids, List *rtable) +{ + int rti = -1; + bool first = true; + RTEKind rtekind; + + Assert(relids != NULL); + + while ((rti = bms_next_member(relids, rti)) >= 0) + { + RangeTblEntry *rte = rt_fetch(rti, rtable); + + if (rte->rtekind == RTE_JOIN) + continue; + + if (first) + { + rtekind = rte->rtekind; + first = false; + } + else if (rtekind != rte->rtekind) + elog(ERROR, "rtekind mismatch: %d vs. 
%d", + rtekind, rte->rtekind); + } + + if (first) + elog(ERROR, "no non-RTE_JOIN RTEs found"); + + return rtekind; +} diff --git a/contrib/pg_plan_advice/pgpa_scan.h b/contrib/pg_plan_advice/pgpa_scan.h new file mode 100644 index 00000000000..3bb8726ff1e --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_scan.h @@ -0,0 +1,85 @@ +/*------------------------------------------------------------------------- + * + * pgpa_scan.h + * analysis of scans in Plan trees + * + * For purposes of this module, a "scan" includes (1) single plan nodes that + * scan multiple RTIs, such as a degenerate Result node that replaces what + * would otherwise have been a join, and (2) Append and MergeAppend nodes + * implementing a partitionwise scan or a partitionwise join. Said + * differently, scans are the leaves of the join tree for a single join + * problem. + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_scan.h + * + *------------------------------------------------------------------------- + */ +#ifndef PGPA_SCAN_H +#define PGPA_SCAN_H + +#include "nodes/plannodes.h" + +typedef struct pgpa_plan_walker_context pgpa_plan_walker_context; + +/* + * Scan strategies. + * + * PGPA_SCAN_ORDINARY is any scan strategy that isn't interesting to us + * because there is no meaningful planner decision involved. For example, + * the only way to scan a subquery is a SubqueryScan, and the only way to + * scan a VALUES construct is a ValuesScan. We need not care exactly which + * type of planner node was used in such cases, because the same thing will + * happen when replanning. + * + * PGPA_SCAN_ORDINARY also includes Result nodes that correspond to scans + * or even joins that are proved empty. We don't know whether or not the scan + * or join will still be provably empty at replanning time, but if it is, + * then no scan-type advice is needed, and if it's not, we can't recommend + * a scan type based on the current plan. 
+ * + * PGPA_SCAN_PARTITIONWISE also lumps together scans and joins: this can + * be either a partitionwise scan of a partitioned table or a partitionwise + * join between several partitioned tables. Note that all decisions about + * whether or not to use partitionwise join are meaningful: no matter what + * we decided this time, we could do more or fewer things partitionwise the + * next time. + * + * PGPA_SCAN_FOREIGN is only used when there's more than one relation involved; + * a single-table foreign scan is classified as ordinary, since there is no + * decision to make in that case. + * + * Other scan strategies map one-to-one to plan nodes. + */ +typedef enum +{ + PGPA_SCAN_ORDINARY = 0, + PGPA_SCAN_SEQ, + PGPA_SCAN_BITMAP_HEAP, + PGPA_SCAN_FOREIGN, + PGPA_SCAN_INDEX, + PGPA_SCAN_INDEX_ONLY, + PGPA_SCAN_PARTITIONWISE, + PGPA_SCAN_TID + /* update NUM_PGPA_SCAN_STRATEGY if you add anything here */ +} pgpa_scan_strategy; + +#define NUM_PGPA_SCAN_STRATEGY ((int) PGPA_SCAN_TID + 1) + +/* + * All of the details we need regarding a scan. + */ +typedef struct pgpa_scan +{ + Plan *plan; + pgpa_scan_strategy strategy; + Bitmapset *relids; +} pgpa_scan; + +extern pgpa_scan *pgpa_build_scan(pgpa_plan_walker_context *walker, Plan *plan, + ElidedNode *elided_node, + bool beneath_any_gather, + bool within_join_problem); + +#endif diff --git a/contrib/pg_plan_advice/pgpa_scanner.l b/contrib/pg_plan_advice/pgpa_scanner.l new file mode 100644 index 00000000000..a887735f314 --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_scanner.l @@ -0,0 +1,297 @@ +%top{ +/* + * Scanner for plan advice + * + * Copyright (c) 2000-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_scanner.l + */ +#include "postgres.h" + +#include "common/string.h" +#include "nodes/miscnodes.h" +#include "parser/scansup.h" + +#include "pgpa_ast.h" +#include "pgpa_parser.h" + +/* + * Extra data that we pass around when during scanning. 
+ * + * 'litbuf' is used to implement the exclusive state, which handles + * double-quoted identifiers. + */ +typedef struct pgpa_yy_extra_type +{ + StringInfoData litbuf; +} pgpa_yy_extra_type; + +} + +%{ +/* LCOV_EXCL_START */ + +#define YY_DECL \ + extern int pgpa_yylex(union YYSTYPE *yylval_param, List **result, \ + char **parse_error_msg_p, yyscan_t yyscanner) + +/* No reason to constrain amount of data slurped */ +#define YY_READ_BUF_SIZE 16777216 + +/* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */ +#undef fprintf +#define fprintf(file, fmt, msg) fprintf_to_ereport(fmt, msg) + +static void +fprintf_to_ereport(const char *fmt, const char *msg) +{ + ereport(ERROR, (errmsg_internal("%s", msg))); +} +%} + +%option reentrant +%option bison-bridge +%option 8bit +%option never-interactive +%option nodefault +%option noinput +%option nounput +%option noyywrap +%option noyyalloc +%option noyyrealloc +%option noyyfree +%option warn +%option prefix="pgpa_yy" +%option extra-type="pgpa_yy_extra_type *" + +/* + * What follows is a severely stripped-down version of the core scanner. We + * only care about recognizing identifiers with or without identifier quoting + * (i.e. double-quoting), decimal integers, and a small handful of other + * things. Keep these rules in sync with src/backend/parser/scan.l. As in that + * file, we use an exclusive state called 'xc' for C-style comments, and an + * exclusive state called 'xd' for double-quoted identifiers. 
+ */ +%x xc +%x xd + +ident_start [A-Za-z\200-\377_] +ident_cont [A-Za-z\200-\377_0-9\$] + +identifier {ident_start}{ident_cont}* + +decdigit [0-9] +decinteger {decdigit}(_?{decdigit})* + +space [ \t\n\r\f\v] +whitespace {space}+ + +dquote \" +xdstart {dquote} +xdstop {dquote} +xddouble {dquote}{dquote} +xdinside [^"]+ + +xcstart \/\* +xcstop \*+\/ +xcinside [^*/]+ + +%% + +{whitespace} { /* ignore */ } + +{identifier} { + char *str; + bool fail; + pgpa_advice_tag_type tag; + + /* + * Unlike the core scanner, we don't truncate identifiers + * here. There is no obvious reason to do so. + */ + str = downcase_identifier(yytext, yyleng, false, false); + yylval->str = str; + + /* + * If it's not a tag, just return TOK_IDENT; else, return + * a token type based on how further parsing should + * proceed. + */ + tag = pgpa_parse_advice_tag(str, &fail); + if (fail) + return TOK_IDENT; + else if (tag == PGPA_TAG_JOIN_ORDER) + return TOK_TAG_JOIN_ORDER; + else if (tag == PGPA_TAG_INDEX_SCAN || + tag == PGPA_TAG_INDEX_ONLY_SCAN) + return TOK_TAG_INDEX; + else if (tag == PGPA_TAG_SEQ_SCAN || + tag == PGPA_TAG_TID_SCAN || + tag == PGPA_TAG_BITMAP_HEAP_SCAN || + tag == PGPA_TAG_NO_GATHER) + return TOK_TAG_SIMPLE; + else + return TOK_TAG_GENERIC; + } + +{decinteger} { + char *endptr; + + errno = 0; + yylval->integer = strtoint(yytext, &endptr, 10); + if (*endptr != '\0' || errno == ERANGE) + pgpa_yyerror(result, parse_error_msg_p, yyscanner, + "integer out of range"); + return TOK_INTEGER; + } + +{xcstart} { + BEGIN(xc); + } + +{xdstart} { + BEGIN(xd); + resetStringInfo(&yyextra->litbuf); + } + +. { return yytext[0]; } + +{xcstop} { + BEGIN(INITIAL); + } + +{xcinside} { + /* discard multiple characters without slash or asterisk */ + } + +. { + /* + * Discard any single character. flex prefers longer + * matches, so this rule will never be picked when we could + * have matched xcstop. 
+ * + * NB: At present, we don't bother to support nested + * C-style comments here, but this logic could be extended + * if that restriction poses a problem. + */ + } + +<> { + BEGIN(INITIAL); + pgpa_yyerror(result, parse_error_msg_p, yyscanner, + "unterminated comment"); + } + +{xdstop} { + BEGIN(INITIAL); + if (yyextra->litbuf.len == 0) + pgpa_yyerror(result, parse_error_msg_p, yyscanner, + "zero-length delimited identifier"); + yylval->str = pstrdup(yyextra->litbuf.data); + return TOK_IDENT; + } + +{xddouble} { + appendStringInfoChar(&yyextra->litbuf, '"'); + } + +{xdinside} { + appendBinaryStringInfo(&yyextra->litbuf, yytext, yyleng); + } + +<> { + BEGIN(INITIAL); + pgpa_yyerror(result, parse_error_msg_p, yyscanner, + "unterminated quoted identifier"); + } + +%% + +/* LCOV_EXCL_STOP */ + +/* + * Handler for errors while scanning or parsing advice. + * + * bison passes the error message to us via 'message', and the context is + * available via the 'yytext' macro. We assemble those values into a final + * error text and then arrange to pass it back to the caller of pgpa_yyparse() + * by storing it into *parse_error_msg_p. + */ +void +pgpa_yyerror(List **result, char **parse_error_msg_p, yyscan_t yyscanner, + const char *message) +{ + struct yyguts_t *yyg = (struct yyguts_t *) yyscanner; /* needed for yytext + * macro */ + + + /* report only the first error in a parse operation */ + if (*parse_error_msg_p) + return; + + if (yytext[0]) + *parse_error_msg_p = psprintf("%s at or near \"%s\"", message, yytext); + else + *parse_error_msg_p = psprintf("%s at end of input", message); +} + +/* + * Initialize the advice scanner. + * + * This should be called before parsing begins. 
+ */ +void +pgpa_scanner_init(const char *str, yyscan_t *yyscannerp) +{ + yyscan_t yyscanner; + pgpa_yy_extra_type *yyext = palloc0_object(pgpa_yy_extra_type); + + if (yylex_init(yyscannerp) != 0) + elog(ERROR, "yylex_init() failed: %m"); + + yyscanner = *yyscannerp; + + initStringInfo(&yyext->litbuf); + pgpa_yyset_extra(yyext, yyscanner); + + yy_scan_string(str, yyscanner); +} + + +/* + * Shut down the advice scanner. + * + * This should be called after parsing is complete. + */ +void +pgpa_scanner_finish(yyscan_t yyscanner) +{ + yylex_destroy(yyscanner); +} + +/* + * Interface functions to make flex use palloc() instead of malloc(). + * It'd be better to make these static, but flex insists otherwise. + */ + +void * +yyalloc(yy_size_t size, yyscan_t yyscanner) +{ + return palloc(size); +} + +void * +yyrealloc(void *ptr, yy_size_t size, yyscan_t yyscanner) +{ + if (ptr) + return repalloc(ptr, size); + else + return palloc(size); +} + +void +yyfree(void *ptr, yyscan_t yyscanner) +{ + if (ptr) + pfree(ptr); +} diff --git a/contrib/pg_plan_advice/pgpa_trove.c b/contrib/pg_plan_advice/pgpa_trove.c new file mode 100644 index 00000000000..e924959c010 --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_trove.c @@ -0,0 +1,516 @@ +/*------------------------------------------------------------------------- + * + * pgpa_trove.c + * All of the advice given for a particular query, appropriately + * organized for convenient access. + * + * This name comes from the English expression "trove of advice", which + * means a collection of wisdom. This slightly unusual term is chosen to + * avoid naming confusion; for example, "collection of advice" would + * invite confusion with pgpa_collector.c. Note that, while we don't know + * whether the provided advice is actually wise, it's not our job to + * question the user's choices. 
+ * + * The goal of this module is to make it easy to locate the specific + * bits of advice that pertain to any given part of a query, or to + * determine that there are none. + * + * Copyright (c) 2016-2024, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_trove.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pgpa_trove.h" + +#include "common/hashfn_unstable.h" + +/* + * An advice trove is organized into a series of "slices", each of which + * contains information about one topic e.g. scan methods. Each slice consists + * of an array of trove entries plus a hash table that we can use to determine + * which ones are relevant to a particular part of the query. + */ +typedef struct pgpa_trove_slice +{ + unsigned nallocated; + unsigned nused; + pgpa_trove_entry *entries; + struct pgpa_trove_entry_hash *hash; +} pgpa_trove_slice; + +/* + * Scan advice is stored into 'scan'; join advice is stored into 'join'; and + * advice that can apply to both cases is stored into 'rel'. This lets callers + * ask just for what's relevant. These slices correspond to the possible values + * of pgpa_trove_lookup_type. + */ +struct pgpa_trove +{ + pgpa_trove_slice join; + pgpa_trove_slice rel; + pgpa_trove_slice scan; +}; + +/* + * We're going to build a hash table to allow clients of this module to find + * relevant advice for a given part of the query quickly. However, we're going + * to use only three of the five key fields as hash keys. There are two reasons + * for this. + * + * First, it's allowable to set partition_schema to NULL to match a partition + * with the correct name in any schema. + * + * Second, we expect the "occurrence" and "partition_schema" portions of the + * relation identifiers to be mostly uninteresting. Most of the time, the + * occurrence field will be 1 and the partition_schema values will all be the + * same. 
Even when there is some variation, the absolute number of entries + * that have the same values for all three of these key fields should be + * quite small. + */ +typedef struct +{ + const char *alias_name; + const char *partition_name; + const char *plan_name; +} pgpa_trove_entry_key; + +typedef struct +{ + pgpa_trove_entry_key key; + int status; + Bitmapset *indexes; +} pgpa_trove_entry_element; + +static uint32 pgpa_trove_entry_hash_key(pgpa_trove_entry_key key); + +static inline bool +pgpa_trove_entry_compare_key(pgpa_trove_entry_key a, pgpa_trove_entry_key b) +{ + if (strcmp(a.alias_name, b.alias_name) != 0) + return false; + + if (!strings_equal_or_both_null(a.partition_name, b.partition_name)) + return false; + + if (!strings_equal_or_both_null(a.plan_name, b.plan_name)) + return false; + + return true; +} + +#define SH_PREFIX pgpa_trove_entry +#define SH_ELEMENT_TYPE pgpa_trove_entry_element +#define SH_KEY_TYPE pgpa_trove_entry_key +#define SH_KEY key +#define SH_HASH_KEY(tb, key) pgpa_trove_entry_hash_key(key) +#define SH_EQUAL(tb, a, b) pgpa_trove_entry_compare_key(a, b) +#define SH_SCOPE static inline +#define SH_DECLARE +#define SH_DEFINE +#include "lib/simplehash.h" + +static void pgpa_init_trove_slice(pgpa_trove_slice *tslice); +static void pgpa_trove_add_to_slice(pgpa_trove_slice *tslice, + pgpa_advice_tag_type tag, + pgpa_advice_target *target); +static void pgpa_trove_add_to_hash(pgpa_trove_entry_hash *hash, + pgpa_advice_target *target, + int index); +static Bitmapset *pgpa_trove_slice_lookup(pgpa_trove_slice *tslice, + pgpa_identifier *rid); + +/* + * Build a trove of advice from a list of advice items. + * + * Caller can obtain a list of advice items to pass to this function by + * calling pgpa_parse(). 
+ */ +pgpa_trove * +pgpa_build_trove(List *advice_items) +{ + pgpa_trove *trove = palloc_object(pgpa_trove); + + pgpa_init_trove_slice(&trove->join); + pgpa_init_trove_slice(&trove->rel); + pgpa_init_trove_slice(&trove->scan); + + foreach_ptr(pgpa_advice_item, item, advice_items) + { + switch (item->tag) + { + case PGPA_TAG_JOIN_ORDER: + { + pgpa_advice_target *target; + + /* + * For most advice types, each element in the top-level + * list is a separate target, but it's most convenient to + * regard the entirety of a JOIN_ORDER specification as a + * single target. Since it wasn't represented that way + * during parsing, build a surrogate object now. + */ + target = palloc0_object(pgpa_advice_target); + target->ttype = PGPA_TARGET_ORDERED_LIST; + target->children = item->targets; + + pgpa_trove_add_to_slice(&trove->join, + item->tag, target); + } + break; + + case PGPA_TAG_BITMAP_HEAP_SCAN: + case PGPA_TAG_INDEX_ONLY_SCAN: + case PGPA_TAG_INDEX_SCAN: + case PGPA_TAG_SEQ_SCAN: + case PGPA_TAG_TID_SCAN: + + /* + * Scan advice. + */ + foreach_ptr(pgpa_advice_target, target, item->targets) + { + /* + * For now, all of our scan types target single relations, + * but in the future this might not be true, e.g. a custom + * scan could replace a join. + */ + Assert(target->ttype == PGPA_TARGET_IDENTIFIER); + pgpa_trove_add_to_slice(&trove->scan, + item->tag, target); + } + break; + + case PGPA_TAG_FOREIGN_JOIN: + case PGPA_TAG_HASH_JOIN: + case PGPA_TAG_MERGE_JOIN_MATERIALIZE: + case PGPA_TAG_MERGE_JOIN_PLAIN: + case PGPA_TAG_NESTED_LOOP_MATERIALIZE: + case PGPA_TAG_NESTED_LOOP_MEMOIZE: + case PGPA_TAG_NESTED_LOOP_PLAIN: + case PGPA_TAG_SEMIJOIN_NON_UNIQUE: + case PGPA_TAG_SEMIJOIN_UNIQUE: + + /* + * Join strategy advice. 
+ */ + foreach_ptr(pgpa_advice_target, target, item->targets) + { + pgpa_trove_add_to_slice(&trove->join, + item->tag, target); + } + break; + + case PGPA_TAG_PARTITIONWISE: + case PGPA_TAG_GATHER: + case PGPA_TAG_GATHER_MERGE: + case PGPA_TAG_NO_GATHER: + + /* + * Advice about a RelOptInfo relevant to both scans and joins. + */ + foreach_ptr(pgpa_advice_target, target, item->targets) + { + pgpa_trove_add_to_slice(&trove->rel, + item->tag, target); + } + break; + } + } + + return trove; +} + +/* + * Search a trove of advice for relevant entries. + * + * All parameters are input parameters except for *result, which is an output + * parameter used to return results to the caller. + */ +void +pgpa_trove_lookup(pgpa_trove *trove, pgpa_trove_lookup_type type, + int nrids, pgpa_identifier *rids, pgpa_trove_result *result) +{ + pgpa_trove_slice *tslice; + Bitmapset *indexes; + + Assert(nrids > 0); + + if (type == PGPA_TROVE_LOOKUP_SCAN) + tslice = &trove->scan; + else if (type == PGPA_TROVE_LOOKUP_JOIN) + tslice = &trove->join; + else + tslice = &trove->rel; + + indexes = pgpa_trove_slice_lookup(tslice, &rids[0]); + for (int i = 1; i < nrids; ++i) + { + Bitmapset *other_indexes; + + /* + * If the caller is asking about two relations that aren't part of the + * same subquery, they've messed up. + */ + Assert(strings_equal_or_both_null(rids[0].plan_name, + rids[i].plan_name)); + + other_indexes = pgpa_trove_slice_lookup(tslice, &rids[i]); + indexes = bms_union(indexes, other_indexes); + } + + result->entries = tslice->entries; + result->indexes = indexes; +} + +/* + * Return all entries in a trove slice to the caller. + * + * The first two arguments are input arguments, and the remainder are output + * arguments. 
+ */ +void +pgpa_trove_lookup_all(pgpa_trove *trove, pgpa_trove_lookup_type type, + pgpa_trove_entry **entries, int *nentries) +{ + pgpa_trove_slice *tslice; + + if (type == PGPA_TROVE_LOOKUP_SCAN) + tslice = &trove->scan; + else if (type == PGPA_TROVE_LOOKUP_JOIN) + tslice = &trove->join; + else + tslice = &trove->rel; + + *entries = tslice->entries; + *nentries = tslice->nused; +} + +/* + * Convert a trove entry to an item of plan advice that would produce it. + */ +char * +pgpa_cstring_trove_entry(pgpa_trove_entry *entry) +{ + StringInfoData buf; + + initStringInfo(&buf); + appendStringInfo(&buf, "%s", pgpa_cstring_advice_tag(entry->tag)); + + /* JOIN_ORDER tags are transformed by pgpa_build_trove; undo that here */ + if (entry->tag != PGPA_TAG_JOIN_ORDER) + appendStringInfoChar(&buf, '('); + else + Assert(entry->target->ttype == PGPA_TARGET_ORDERED_LIST); + + pgpa_format_advice_target(&buf, entry->target); + + if (entry->target->itarget != NULL) + { + appendStringInfoChar(&buf, ' '); + pgpa_format_index_target(&buf, entry->target->itarget); + } + + if (entry->tag != PGPA_TAG_JOIN_ORDER) + appendStringInfoChar(&buf, ')'); + + return buf.data; +} + +/* + * Set PGPA_TE_* flags on a set of trove entries. + */ +void +pgpa_trove_set_flags(pgpa_trove_entry *entries, Bitmapset *indexes, int flags) +{ + int i = -1; + + while ((i = bms_next_member(indexes, i)) >= 0) + { + pgpa_trove_entry *entry = &entries[i]; + + entry->flags |= flags; + } +} + +/* + * Append a string representation of the specified PGPA_TE_* flags to the + * given StringInfo. 
+ */ +void +pgpa_trove_append_flags(StringInfo buf, int flags) +{ + if ((flags & PGPA_TE_MATCH_FULL) != 0) + { + Assert((flags & PGPA_TE_MATCH_PARTIAL) != 0); + appendStringInfo(buf, "matched"); + } + else if ((flags & PGPA_TE_MATCH_PARTIAL) != 0) + appendStringInfo(buf, "partially matched"); + else + appendStringInfo(buf, "not matched"); + if ((flags & PGPA_TE_INAPPLICABLE) != 0) + appendStringInfo(buf, ", inapplicable"); + if ((flags & PGPA_TE_CONFLICTING) != 0) + appendStringInfo(buf, ", conflicting"); + if ((flags & PGPA_TE_FAILED) != 0) + appendStringInfo(buf, ", failed"); +} + +/* + * Add a new advice target to an existing pgpa_trove_slice object. + */ +static void +pgpa_trove_add_to_slice(pgpa_trove_slice *tslice, + pgpa_advice_tag_type tag, + pgpa_advice_target *target) +{ + pgpa_trove_entry *entry; + + if (tslice->nused >= tslice->nallocated) + { + int new_allocated; + + new_allocated = tslice->nallocated * 2; + tslice->entries = repalloc_array(tslice->entries, pgpa_trove_entry, + new_allocated); + tslice->nallocated = new_allocated; + } + + entry = &tslice->entries[tslice->nused]; + entry->tag = tag; + entry->target = target; + entry->flags = 0; + + pgpa_trove_add_to_hash(tslice->hash, target, tslice->nused); + + tslice->nused++; +} + +/* + * Update the hash table for a newly-added advice target. + */ +static void +pgpa_trove_add_to_hash(pgpa_trove_entry_hash *hash, pgpa_advice_target *target, + int index) +{ + pgpa_trove_entry_key key; + pgpa_trove_entry_element *element; + bool found; + + /* For non-identifiers, add entries for all descendents. */ + if (target->ttype != PGPA_TARGET_IDENTIFIER) + { + foreach_ptr(pgpa_advice_target, child_target, target->children) + { + pgpa_trove_add_to_hash(hash, child_target, index); + } + return; + } + + /* Sanity checks. */ + Assert(target->rid.occurrence > 0); + Assert(target->rid.alias_name != NULL); + + /* Add an entry for this relation identifier. 
*/ + key.alias_name = target->rid.alias_name; + key.partition_name = target->rid.partrel; + key.plan_name = target->rid.plan_name; + element = pgpa_trove_entry_insert(hash, key, &found); + if (!found) + element->indexes = NULL; + element->indexes = bms_add_member(element->indexes, index); +} + +/* + * Create and initialize a new pgpa_trove_slice object. + */ +static void +pgpa_init_trove_slice(pgpa_trove_slice *tslice) +{ + /* + * In an ideal world, we'll make tslice->nallocated big enough that the + * array and hash table will be large enough to contain the number of + * advice items in this trove slice, but a generous default value is not + * good for performance, because pgpa_init_trove_slice() has to zero an + * amount of memory proportional to tslice->nallocated. Hence, we keep the + * starting value quite small, on the theory that advice strings will + * often be relatively short. + */ + tslice->nallocated = 16; + tslice->nused = 0; + tslice->entries = palloc_array(pgpa_trove_entry, tslice->nallocated); + tslice->hash = pgpa_trove_entry_create(CurrentMemoryContext, + tslice->nallocated, NULL); +} + +/* + * Fast hash function for a key consisting of alias_name, partition_name, + * and plan_name. + */ +static uint32 +pgpa_trove_entry_hash_key(pgpa_trove_entry_key key) +{ + fasthash_state hs; + int sp_len; + + fasthash_init(&hs, 0); + + /* alias_name may not be NULL */ + sp_len = fasthash_accum_cstring(&hs, key.alias_name); + + /* partition_name and plan_name, however, can be NULL */ + if (key.partition_name != NULL) + sp_len += fasthash_accum_cstring(&hs, key.partition_name); + if (key.plan_name != NULL) + sp_len += fasthash_accum_cstring(&hs, key.plan_name); + + /* + * hashfn_unstable.h recommends using string length as tweak. It's not + * clear to me what to do if there are multiple strings, so for now I'm + * just using the total of all of the lengths. + */ + return fasthash_final32(&hs, sp_len); +} + +/* + * Look for matching entries. 
+ */ +static Bitmapset * +pgpa_trove_slice_lookup(pgpa_trove_slice *tslice, pgpa_identifier *rid) +{ + pgpa_trove_entry_key key; + pgpa_trove_entry_element *element; + Bitmapset *result = NULL; + + Assert(rid->occurrence >= 1); + + key.alias_name = rid->alias_name; + key.partition_name = rid->partrel; + key.plan_name = rid->plan_name; + + element = pgpa_trove_entry_lookup(tslice->hash, key); + + if (element != NULL) + { + int i = -1; + + while ((i = bms_next_member(element->indexes, i)) >= 0) + { + pgpa_trove_entry *entry = &tslice->entries[i]; + + /* + * We know that this target or one of its descendents matches the + * identifier on the three key fields above, but we don't know + * which descendent or whether the occurence and schema also + * match. + */ + if (pgpa_identifier_matches_target(rid, entry->target)) + result = bms_add_member(result, i); + } + } + + return result; +} diff --git a/contrib/pg_plan_advice/pgpa_trove.h b/contrib/pg_plan_advice/pgpa_trove.h new file mode 100644 index 00000000000..a1b75af724a --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_trove.h @@ -0,0 +1,114 @@ +/*------------------------------------------------------------------------- + * + * pgpa_trove.h + * All of the advice given for a particular query, appropriately + * organized for convenient access. + * + * Copyright (c) 2016-2024, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_trove.h + * + *------------------------------------------------------------------------- + */ +#ifndef PGPA_TROVE_H +#define PGPA_TROVE_H + +#include "pgpa_ast.h" + +#include "nodes/bitmapset.h" + +typedef struct pgpa_trove pgpa_trove; + +/* + * Flags that can be set on a pgpa_trove_entry to indicate what happened when + * trying to plan using advice. + * + * PGPA_TE_MATCH_PARTIAL means that we found some part of the query that at + * least partially matched the target; e.g. given JOIN_ORDER(a b), this would + * be set if we ever saw any joinrel including either "a" or "b". 
+ * + * PGPA_TE_MATCH_FULL means that we found an exact match for the target; e.g. + * given JOIN_ORDER(a b), this would be set if we saw a joinrel containing + * exactly "a" and "b" and nothing else. + * + * PGPA_TE_INAPPLICABLE means that the advice doesn't properly apply to the + * target; e.g. INDEX_SCAN(foo bar_idx) would be so marked if bar_idx does not + * exist on foo. The fact that this bit has been set does not mean that the + * advice had no effect. + * + * PGPA_TE_CONFLICTING means that a conflict was detected between what this + * advice wants and what some other plan advice wants; e.g. JOIN_ORDER(a b) + * would conflict with HASH_JOIN(a), because the former requires "a" to be the + * outer table while the latter requires it to be the inner table. + * + * PGPA_TE_FAILED means that the resulting plan did not conform to the advice. + */ +#define PGPA_TE_MATCH_PARTIAL 0x0001 +#define PGPA_TE_MATCH_FULL 0x0002 +#define PGPA_TE_INAPPLICABLE 0x0004 +#define PGPA_TE_CONFLICTING 0x0008 +#define PGPA_TE_FAILED 0x0010 + +/* + * Each entry in a trove of advice represents the application of a tag to + * a single target. + */ +typedef struct pgpa_trove_entry +{ + pgpa_advice_tag_type tag; + pgpa_advice_target *target; + int flags; +} pgpa_trove_entry; + +/* + * What kind of information does the caller want to find in a trove? + * + * PGPA_TROVE_LOOKUP_SCAN means we're looking for scan advice. + * + * PGPA_TROVE_LOOKUP_JOIN means we're looking for join-related advice. + * This includes join order advice, join method advice, and semijoin-uniqueness + * advice. + * + * PGPA_TROVE_LOOKUP_REL means we're looking for general advice about this + * a RelOptInfo that may correspond to either a scan or a join. This includes + * gather-related advice and partitionwise advice. 
Note that partitionwise + * advice might seem like join advice, but that's not a helpful way of viewing + * the matter because (1) partitionwise advice is also relevant at the scan + * level and (2) other types of join advice affect only what to do from + * join_path_setup_hook, but partitionwise advice affects what to do in + * joinrel_setup_hook. + */ +typedef enum pgpa_trove_lookup_type +{ + PGPA_TROVE_LOOKUP_JOIN, + PGPA_TROVE_LOOKUP_REL, + PGPA_TROVE_LOOKUP_SCAN +} pgpa_trove_lookup_type; + +/* + * This struct is used to store the result of a trove lookup. For each member + * of "indexes", the entry at the corresponding offset within "entries" is one + * of the results. + */ +typedef struct pgpa_trove_result +{ + pgpa_trove_entry *entries; + Bitmapset *indexes; +} pgpa_trove_result; + +extern pgpa_trove *pgpa_build_trove(List *advice_items); +extern void pgpa_trove_lookup(pgpa_trove *trove, + pgpa_trove_lookup_type type, + int nrids, + pgpa_identifier *rids, + pgpa_trove_result *result); +extern void pgpa_trove_lookup_all(pgpa_trove *trove, + pgpa_trove_lookup_type type, + pgpa_trove_entry **entries, + int *nentries); +extern char *pgpa_cstring_trove_entry(pgpa_trove_entry *entry); +extern void pgpa_trove_set_flags(pgpa_trove_entry *entries, + Bitmapset *indexes, int flags); +extern void pgpa_trove_append_flags(StringInfo buf, int flags); + +#endif diff --git a/contrib/pg_plan_advice/pgpa_walker.c b/contrib/pg_plan_advice/pgpa_walker.c new file mode 100644 index 00000000000..210d30891b2 --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_walker.c @@ -0,0 +1,1006 @@ +/*------------------------------------------------------------------------- + * + * pgpa_walker.c + * Plan tree iteration + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_walker.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pgpa_join.h" +#include "pgpa_scan.h" +#include 
"pgpa_walker.h" + +#include "nodes/plannodes.h" +#include "parser/parsetree.h" +#include "utils/lsyscache.h" + +static void pgpa_walk_recursively(pgpa_plan_walker_context *walker, Plan *plan, + bool within_join_problem, + pgpa_join_unroller *join_unroller, + List *active_query_features, + bool beneath_any_gather); +static Bitmapset *pgpa_process_unrolled_join(pgpa_plan_walker_context *walker, + pgpa_unrolled_join *ujoin); + +static pgpa_query_feature *pgpa_add_feature(pgpa_plan_walker_context *walker, + pgpa_qf_type type, + Plan *plan); + +static void pgpa_qf_add_rti(List *active_query_features, Index rti); +static void pgpa_qf_add_rtis(List *active_query_features, Bitmapset *relids); +static void pgpa_qf_add_plan_rtis(List *active_query_features, Plan *plan, + List *rtable); + +static bool pgpa_walker_join_order_matches(pgpa_unrolled_join *ujoin, + Index rtable_length, + pgpa_identifier *rt_identifiers, + pgpa_advice_target *target, + bool toplevel); +static bool pgpa_walker_join_order_matches_member(pgpa_join_member *member, + Index rtable_length, + pgpa_identifier *rt_identifiers, + pgpa_advice_target *target); +static pgpa_scan *pgpa_walker_find_scan(pgpa_plan_walker_context *walker, + pgpa_scan_strategy strategy, + Bitmapset *relids); +static bool pgpa_walker_index_target_matches_plan(pgpa_index_target *itarget, + Plan *plan); +static bool pgpa_walker_contains_feature(pgpa_plan_walker_context *walker, + pgpa_qf_type type, + Bitmapset *relids); +static bool pgpa_walker_contains_join(pgpa_plan_walker_context *walker, + pgpa_join_strategy strategy, + Bitmapset *relids); +static bool pgpa_walker_contains_no_gather(pgpa_plan_walker_context *walker, + Bitmapset *relids); + +/* + * Top-level entrypoint for the plan tree walk. + * + * Populates walker based on a traversal of the Plan trees in pstmt. + * + * sj_unique_rels is a list of pgpa_sj_unique_rel objects, one for each + * relation we considered making unique as part of semijoin planning. 
+ */ +void +pgpa_plan_walker(pgpa_plan_walker_context *walker, PlannedStmt *pstmt, + List *sj_unique_rels) +{ + ListCell *lc; + List *sj_unique_rtis = NULL; + List *sj_nonunique_qfs = NULL; + + /* Initialization. */ + memset(walker, 0, sizeof(pgpa_plan_walker_context)); + walker->pstmt = pstmt; + + /* Walk the main plan tree. */ + pgpa_walk_recursively(walker, pstmt->planTree, 0, NULL, NIL, false); + + /* Main plan tree walk won't reach subplans, so walk those. */ + foreach(lc, pstmt->subplans) + { + Plan *plan = lfirst(lc); + + if (plan != NULL) + pgpa_walk_recursively(walker, plan, 0, NULL, NIL, false); + } + + /* Adjust RTIs from sj_unique_rels for the flattened range table. */ + foreach_ptr(pgpa_sj_unique_rel, ur, sj_unique_rels) + { + int rtindex = -1; + int rtoffset = 0; + bool dummy = false; + Bitmapset *relids = NULL; + + /* If this is a subplan, find the range table offset. */ + if (ur->plan_name != NULL) + { + foreach_node(SubPlanRTInfo, rtinfo, pstmt->subrtinfos) + { + if (strcmp(ur->plan_name, rtinfo->plan_name) == 0) + { + rtoffset = rtinfo->rtoffset; + dummy = rtinfo->dummy; + break; + } + } + + if (rtoffset == 0) + elog(ERROR, "no rtoffset for plan %s", ur->plan_name); + } + + /* If this entry pertains to a dummy subquery, ignore it. */ + if (dummy) + continue; + + /* Offset each entry from the original set. */ + while ((rtindex = bms_next_member(ur->relids, rtindex)) >= 0) + relids = bms_add_member(relids, rtindex + rtoffset); + + /* Store the resulting set. */ + sj_unique_rtis = lappend(sj_unique_rtis, relids); + } + + /* + * Remove any non-unique semjoin query features for which making the rel + * unique wasn't considered. 
+ */ + foreach_ptr(pgpa_query_feature, qf, + walker->query_features[PGPAQF_SEMIJOIN_NON_UNIQUE]) + { + if (list_member(sj_unique_rtis, qf->relids)) + sj_nonunique_qfs = lappend(sj_nonunique_qfs, qf); + } + walker->query_features[PGPAQF_SEMIJOIN_NON_UNIQUE] = sj_nonunique_qfs; + + /* + * If we find any cases where analysis of the Plan tree shows that the + * semijoin was made unique but this possibility was never observed to be + * considered during planning, then we have a bug somewhere. + */ + foreach_ptr(pgpa_query_feature, qf, + walker->query_features[PGPAQF_SEMIJOIN_UNIQUE]) + { + if (!list_member(sj_unique_rtis, qf->relids)) + { + StringInfoData buf; + + initStringInfo(&buf); + outBitmapset(&buf, qf->relids); + elog(ERROR, + "unique semijoin found for relids %s but not observed during planning", + buf.data); + } + } +} + +/* + * Main workhorse for the plan tree walk. + * + * If within_join_problem is true, we encountered a join at some higher level + * of the tree walk and haven't yet descended out of the portion of the plan + * tree that is part of that same join problem. We're no longer in the same + * join problem if (1) we cross into a different subquery or (2) we descend + * through an Append or MergeAppend node, below which any further joins would + * be partitionwise joins planned separately from the outer join problem. + * + * If join_unroller != NULL, the join unroller code expects us to find a join + * that should be unrolled into that object. This implies that we're within a + * join problem, but the reverse is not true: when we've traversed all the + * joins but are still looking for the scan that is the leaf of the join tree, + * join_unroller will be NULL but within_join_problem will be true. + * + * Each element of active_query_features corresponds to some item of advice + * that needs to enumerate all the relations it affects. We add RTIs we find + * during tree traversal to each of these query features. 
+ * + * If beneath_any_gather == true, some higher level of the tree traversal found + * a Gather or Gather Merge node. + */ +static void +pgpa_walk_recursively(pgpa_plan_walker_context *walker, Plan *plan, + bool within_join_problem, + pgpa_join_unroller *join_unroller, + List *active_query_features, + bool beneath_any_gather) +{ + pgpa_join_unroller *outer_join_unroller = NULL; + pgpa_join_unroller *inner_join_unroller = NULL; + bool join_unroller_toplevel = false; + List *pushdown_query_features = NIL; + ListCell *lc; + List *extraplans = NIL; + List *elided_nodes = NIL; + + Assert(within_join_problem || join_unroller == NULL); + + /* + * If this is a Gather or Gather Merge node, directly add it to the list + * of currently-active query features. (Exception: Disregard single_copy + * Gather nodes. These are created by debug_parallel_query, and having + * them affect the plan advice is counterproductive, as the result will be + * to advise the use of a real Gather node, rather than a single copy + * one.) + * + * Otherwise, check the future_query_features list to see whether this was + * previously identified as a plan node that needs to be treated as a + * query feature. + * + * Note that the caller also has a copy to active_query_features, so we + * can't destructively modify it without making a copy. 
+ */ + if (IsA(plan, Gather) && !((Gather *) plan)->single_copy) + { + active_query_features = + lappend(list_copy(active_query_features), + pgpa_add_feature(walker, PGPAQF_GATHER, plan)); + beneath_any_gather = true; + } + else if (IsA(plan, GatherMerge)) + { + active_query_features = + lappend(list_copy(active_query_features), + pgpa_add_feature(walker, PGPAQF_GATHER_MERGE, plan)); + beneath_any_gather = true; + } + else + { + foreach_ptr(pgpa_query_feature, qf, walker->future_query_features) + { + if (qf->plan == plan) + { + active_query_features = list_copy(active_query_features); + active_query_features = lappend(active_query_features, qf); + walker->future_query_features = + list_delete_ptr(walker->future_query_features, plan); + break; + } + } + } + + /* + * Find all elided nodes for this Plan node. + */ + foreach_node(ElidedNode, n, walker->pstmt->elidedNodes) + { + if (n->plan_node_id == plan->plan_node_id) + elided_nodes = lappend(elided_nodes, n); + } + + /* If we found any elided_nodes, handle them. */ + if (elided_nodes != NIL) + { + int num_elided_nodes = list_length(elided_nodes); + ElidedNode *last_elided_node; + + /* + * RTIs for the final -- and thus logically uppermost -- elided node + * should be collected for query features passed down by the caller. + * However, elided nodes act as barriers to query features, which + * means that (1) the remaining elided nodes, if any, should be + * ignored for purposes of query features and (2) the list of active + * query features should be reset to empty so that we do not add RTIs + * from the plan node that is logically beneath the elided node to the + * query features passed down from the caller. 
+ */ + last_elided_node = list_nth(elided_nodes, num_elided_nodes - 1); + pgpa_qf_add_rtis(active_query_features, + pgpa_filter_out_join_relids(last_elided_node->relids, + walker->pstmt->rtable)); + active_query_features = NIL; + + /* + * If we're within a join problem, the join_unroller is responsible + * for building the scan for the final elided node, so throw it out. + */ + if (within_join_problem) + elided_nodes = list_truncate(elided_nodes, num_elided_nodes - 1); + + /* Build scans for all (or the remaining) elided nodes. */ + foreach_node(ElidedNode, elided_node, elided_nodes) + { + (void) pgpa_build_scan(walker, plan, elided_node, + beneath_any_gather, within_join_problem); + } + + /* + * If there were any elided nodes, then everything beneath those nodes + * is not part of the same join problem. + * + * In more detail, if an Append or MergeAppend was elided, then a + * partitionwise join was chosen and only a single child survived; if + * a SubqueryScan was elided, the subquery was planned without + * flattening it into the parent. + */ + within_join_problem = false; + join_unroller = NULL; + } + + /* + * If we're within a join problem, the join unroller is responsible for + * building any required scan for this node. If not, we do it here. + */ + if (!within_join_problem) + (void) pgpa_build_scan(walker, plan, NULL, beneath_any_gather, false); + + /* + * If this join needs to unrolled but there's no join unroller already + * available, create one. + */ + if (join_unroller == NULL && pgpa_is_join(plan)) + { + join_unroller = pgpa_create_join_unroller(); + join_unroller_toplevel = true; + within_join_problem = true; + } + + /* + * If this join is to be unrolled, pgpa_unroll_join() will return the join + * unroller object that should be passed down when we recurse into the + * outer and inner sides of the plan. 
+ */ + if (join_unroller != NULL) + pgpa_unroll_join(walker, plan, beneath_any_gather, join_unroller, + &outer_join_unroller, &inner_join_unroller); + + /* Add RTIs from the plan node to all active query features. */ + pgpa_qf_add_plan_rtis(active_query_features, plan, walker->pstmt->rtable); + + /* + * Recurse into the outer and inner subtrees. + * + * As an exception, if this is a ForeignScan, don't recurse. postgres_fdw + * sometimes stores an EPQ recheck plan in plan->leftree, but that's going + * to mention the same set of relations as the ForeignScan itself, and we + * have no way to emit advice targeting the EPQ case vs. the non-EPQ case. + * Moreover, it's not entirely clear what other FDWs might do with the + * left and right subtrees. Maybe some better handling is needed here, but + * for now, we just punt. + */ + if (!IsA(plan, ForeignScan)) + { + if (plan->lefttree != NULL) + pgpa_walk_recursively(walker, plan->lefttree, within_join_problem, + outer_join_unroller, active_query_features, + beneath_any_gather); + if (plan->righttree != NULL) + pgpa_walk_recursively(walker, plan->righttree, within_join_problem, + inner_join_unroller, active_query_features, + beneath_any_gather); + } + + /* + * If we created a join unroller up above, then it's also our join to use + * it to build the final pgpa_unrolled_join, and to destroy the object. + */ + if (join_unroller_toplevel) + { + pgpa_unrolled_join *ujoin; + + ujoin = pgpa_build_unrolled_join(walker, join_unroller); + walker->toplevel_unrolled_joins = + lappend(walker->toplevel_unrolled_joins, ujoin); + pgpa_destroy_join_unroller(join_unroller); + (void) pgpa_process_unrolled_join(walker, ujoin); + } + + /* + * Some plan types can have additional children. Nodes like Append that + * can have any number of children store them in a List; a SubqueryScan + * just has a field for a single additional Plan. 
+ */ + switch (nodeTag(plan)) + { + case T_Append: + { + Append *aplan = (Append *) plan; + + extraplans = aplan->appendplans; + if (bms_is_empty(aplan->apprelids)) + pushdown_query_features = active_query_features; + } + break; + case T_MergeAppend: + { + MergeAppend *maplan = (MergeAppend *) plan; + + extraplans = maplan->mergeplans; + if (bms_is_empty(maplan->apprelids)) + pushdown_query_features = active_query_features; + } + break; + case T_BitmapAnd: + extraplans = ((BitmapAnd *) plan)->bitmapplans; + break; + case T_BitmapOr: + extraplans = ((BitmapOr *) plan)->bitmapplans; + break; + case T_SubqueryScan: + + /* + * We don't pass down active_query_features across here, because + * those are specific to a subquery level. + */ + pgpa_walk_recursively(walker, ((SubqueryScan *) plan)->subplan, + 0, NULL, NIL, beneath_any_gather); + break; + case T_CustomScan: + extraplans = ((CustomScan *) plan)->custom_plans; + break; + default: + break; + } + + /* If we found a list of extra children, iterate over it. */ + foreach(lc, extraplans) + { + Plan *subplan = lfirst(lc); + + pgpa_walk_recursively(walker, subplan, 0, NULL, pushdown_query_features, + beneath_any_gather); + } +} + +/* + * Perform final processing of a newly-constructed pgpa_unrolled_join. This + * only needs to be called for toplevel pgpa_unrolled_join objects, since it + * recurses to sub-joins as needed. + * + * Our goal is to add the set of inner relids to the relevant join_strategies + * list, and to do the same for any sub-joins. To that end, the return value + * is the set of relids found beneath the the join, but it is expected that + * the toplevel caller will ignore this. + */ +static Bitmapset * +pgpa_process_unrolled_join(pgpa_plan_walker_context *walker, + pgpa_unrolled_join *ujoin) +{ + Bitmapset *all_relids = bms_copy(ujoin->outer.scan->relids); + + /* If this fails, we didn't unroll properly. 
*/ + Assert(ujoin->outer.unrolled_join == NULL); + + for (int k = 0; k < ujoin->ninner; ++k) + { + pgpa_join_member *member = &ujoin->inner[k]; + Bitmapset *relids; + + if (member->unrolled_join != NULL) + relids = pgpa_process_unrolled_join(walker, + member->unrolled_join); + else + { + Assert(member->scan != NULL); + relids = member->scan->relids; + } + walker->join_strategies[ujoin->strategy[k]] = + lappend(walker->join_strategies[ujoin->strategy[k]], relids); + all_relids = bms_add_members(all_relids, relids); + } + + return all_relids; +} + +/* + * Arrange for the given plan node to be treated as a query feature when the + * tree walk reaches it. + * + * Make sure to only use this for nodes that the tree walk can't have reached + * yet! + */ +void +pgpa_add_future_feature(pgpa_plan_walker_context *walker, + pgpa_qf_type type, Plan *plan) +{ + pgpa_query_feature *qf = pgpa_add_feature(walker, type, plan); + + walker->future_query_features = + lappend(walker->future_query_features, qf); +} + +/* + * Return the last of any elided nodes associated with this plan node ID. + * + * The last elided node is the one that would have been uppermost in the plan + * tree had it not been removed during setrefs processig. + */ +ElidedNode * +pgpa_last_elided_node(PlannedStmt *pstmt, Plan *plan) +{ + ElidedNode *elided_node = NULL; + + foreach_node(ElidedNode, n, pstmt->elidedNodes) + { + if (n->plan_node_id == plan->plan_node_id) + elided_node = n; + } + + return elided_node; +} + +/* + * Certain plan nodes can refer to a set of RTIs. Extract and return the set. + */ +Bitmapset * +pgpa_relids(Plan *plan) +{ + if (IsA(plan, Result)) + return ((Result *) plan)->relids; + else if (IsA(plan, ForeignScan)) + return ((ForeignScan *) plan)->fs_relids; + else if (IsA(plan, Append)) + return ((Append *) plan)->apprelids; + else if (IsA(plan, MergeAppend)) + return ((MergeAppend *) plan)->apprelids; + + return NULL; +} + +/* + * Extract the scanned RTI from a plan node. 
+ * + * Returns 0 if there isn't one. + */ +Index +pgpa_scanrelid(Plan *plan) +{ + switch (nodeTag(plan)) + { + case T_SeqScan: + case T_SampleScan: + case T_BitmapHeapScan: + case T_TidScan: + case T_TidRangeScan: + case T_SubqueryScan: + case T_FunctionScan: + case T_TableFuncScan: + case T_ValuesScan: + case T_CteScan: + case T_NamedTuplestoreScan: + case T_WorkTableScan: + case T_ForeignScan: + case T_CustomScan: + case T_IndexScan: + case T_IndexOnlyScan: + return ((Scan *) plan)->scanrelid; + default: + return 0; + } +} + +/* + * Construct a new Bitmapset containing non-RTE_JOIN members of 'relids'. + */ +Bitmapset * +pgpa_filter_out_join_relids(Bitmapset *relids, List *rtable) +{ + int rti = -1; + Bitmapset *result = NULL; + + while ((rti = bms_next_member(relids, rti)) >= 0) + { + RangeTblEntry *rte = rt_fetch(rti, rtable); + + if (rte->rtekind != RTE_JOIN) + result = bms_add_member(result, rti); + } + + return result; +} + +/* + * Create a pgpa_query_feature and add it to the list of all query features + * for this plan. + */ +static pgpa_query_feature * +pgpa_add_feature(pgpa_plan_walker_context *walker, + pgpa_qf_type type, Plan *plan) +{ + pgpa_query_feature *qf = palloc0_object(pgpa_query_feature); + + qf->type = type; + qf->plan = plan; + + walker->query_features[qf->type] = + lappend(walker->query_features[qf->type], qf); + + return qf; +} + +/* + * Add a single RTI to each active query feature. + */ +static void +pgpa_qf_add_rti(List *active_query_features, Index rti) +{ + foreach_ptr(pgpa_query_feature, qf, active_query_features) + { + qf->relids = bms_add_member(qf->relids, rti); + } +} + +/* + * Add a set of RTIs to each active query feature. 
+ */ +static void +pgpa_qf_add_rtis(List *active_query_features, Bitmapset *relids) +{ + foreach_ptr(pgpa_query_feature, qf, active_query_features) + { + qf->relids = bms_add_members(qf->relids, relids); + } +} + +/* + * Add RTIs directly contained in a plan node to each active query feature, + * but filter out any join RTIs, since advice doesn't mention those. + */ +static void +pgpa_qf_add_plan_rtis(List *active_query_features, Plan *plan, List *rtable) +{ + Bitmapset *relids; + Index rti; + + if ((relids = pgpa_relids(plan)) != NULL) + { + relids = pgpa_filter_out_join_relids(relids, rtable); + pgpa_qf_add_rtis(active_query_features, relids); + } + else if ((rti = pgpa_scanrelid(plan)) != 0) + pgpa_qf_add_rti(active_query_features, rti); +} + +/* + * If we generated plan advice using the provided walker object and array + * of identifiers, would we generate the specified tag/target combination? + * + * If yes, the plan conforms to the advice; if no, it does not. Note that + * we have know way of knowing whether the planner was forced to emit a plan + * that conformed to the advice or just happened to do so. 
+ */ +bool +pgpa_walker_would_advise(pgpa_plan_walker_context *walker, + pgpa_identifier *rt_identifiers, + pgpa_advice_tag_type tag, + pgpa_advice_target *target) +{ + Index rtable_length = list_length(walker->pstmt->rtable); + Bitmapset *relids = NULL; + + if (tag == PGPA_TAG_JOIN_ORDER) + { + foreach_ptr(pgpa_unrolled_join, ujoin, walker->toplevel_unrolled_joins) + { + if (pgpa_walker_join_order_matches(ujoin, rtable_length, + rt_identifiers, target, true)) + return true; + } + + return false; + } + + if (target->ttype == PGPA_TARGET_IDENTIFIER) + { + Index rti; + + rti = pgpa_compute_rti_from_identifier(rtable_length, rt_identifiers, + &target->rid); + if (rti == 0) + return false; + relids = bms_make_singleton(rti); + } + else + { + Assert(target->ttype == PGPA_TARGET_ORDERED_LIST); + foreach_ptr(pgpa_advice_target, child_target, target->children) + { + Index rti; + + Assert(child_target->ttype == PGPA_TARGET_IDENTIFIER); + rti = pgpa_compute_rti_from_identifier(rtable_length, + rt_identifiers, + &child_target->rid); + if (rti == 0) + return false; + relids = bms_add_member(relids, rti); + } + } + + switch (tag) + { + case PGPA_TAG_JOIN_ORDER: + /* should have been handled above */ + pg_unreachable(); + break; + case PGPA_TAG_BITMAP_HEAP_SCAN: + return pgpa_walker_find_scan(walker, + PGPA_SCAN_BITMAP_HEAP, + relids) != NULL; + case PGPA_TAG_FOREIGN_JOIN: + return pgpa_walker_find_scan(walker, + PGPA_SCAN_FOREIGN, + relids) != NULL; + case PGPA_TAG_INDEX_ONLY_SCAN: + { + pgpa_scan *scan; + + scan = pgpa_walker_find_scan(walker, PGPA_SCAN_INDEX_ONLY, + relids); + if (scan == NULL) + return false; + + return pgpa_walker_index_target_matches_plan(target->itarget, scan->plan); + } + case PGPA_TAG_INDEX_SCAN: + { + pgpa_scan *scan; + + scan = pgpa_walker_find_scan(walker, PGPA_SCAN_INDEX, + relids); + if (scan == NULL) + return false; + + return pgpa_walker_index_target_matches_plan(target->itarget, scan->plan); + } + case PGPA_TAG_PARTITIONWISE: + return 
pgpa_walker_find_scan(walker, + PGPA_SCAN_PARTITIONWISE, + relids) != NULL; + case PGPA_TAG_SEQ_SCAN: + return pgpa_walker_find_scan(walker, + PGPA_SCAN_SEQ, + relids) != NULL; + case PGPA_TAG_TID_SCAN: + return pgpa_walker_find_scan(walker, + PGPA_SCAN_TID, + relids) != NULL; + case PGPA_TAG_GATHER: + return pgpa_walker_contains_feature(walker, + PGPAQF_GATHER, + relids); + case PGPA_TAG_GATHER_MERGE: + return pgpa_walker_contains_feature(walker, + PGPAQF_GATHER_MERGE, + relids); + case PGPA_TAG_SEMIJOIN_NON_UNIQUE: + return pgpa_walker_contains_feature(walker, + PGPAQF_SEMIJOIN_NON_UNIQUE, + relids); + case PGPA_TAG_SEMIJOIN_UNIQUE: + return pgpa_walker_contains_feature(walker, + PGPAQF_SEMIJOIN_UNIQUE, + relids); + case PGPA_TAG_HASH_JOIN: + return pgpa_walker_contains_join(walker, + JSTRAT_HASH_JOIN, + relids); + case PGPA_TAG_MERGE_JOIN_MATERIALIZE: + return pgpa_walker_contains_join(walker, + JSTRAT_MERGE_JOIN_MATERIALIZE, + relids); + case PGPA_TAG_MERGE_JOIN_PLAIN: + return pgpa_walker_contains_join(walker, + JSTRAT_MERGE_JOIN_PLAIN, + relids); + case PGPA_TAG_NESTED_LOOP_MATERIALIZE: + return pgpa_walker_contains_join(walker, + JSTRAT_NESTED_LOOP_MATERIALIZE, + relids); + case PGPA_TAG_NESTED_LOOP_MEMOIZE: + return pgpa_walker_contains_join(walker, + JSTRAT_NESTED_LOOP_MEMOIZE, + relids); + case PGPA_TAG_NESTED_LOOP_PLAIN: + return pgpa_walker_contains_join(walker, + JSTRAT_NESTED_LOOP_PLAIN, + relids); + case PGPA_TAG_NO_GATHER: + return pgpa_walker_contains_no_gather(walker, relids); + } + + /* should not get here */ + return false; +} + +/* + * Does the index target match the Plan? + * + * Should only be called when we know that itarget mandates an Index Scan or + * Index Only Scan and this corresponds to the type of Plan. Here, our job is + * just to check whether it's the same index. 
+ */ +static bool +pgpa_walker_index_target_matches_plan(pgpa_index_target *itarget, Plan *plan) +{ + Oid indexoid = InvalidOid; + + /* Retrieve the index OID from the plan. */ + if (IsA(plan, IndexScan)) + indexoid = ((IndexScan *) plan)->indexid; + else if (IsA(plan, IndexOnlyScan)) + indexoid = ((IndexOnlyScan *) plan)->indexid; + else + elog(ERROR, "unrecognized node type: %d", (int) nodeTag(plan)); + + /* Check whether schema name matches, if specified in index target. */ + if (itarget->indnamespace != NULL) + { + Oid nspoid = get_rel_namespace(indexoid); + char *relnamespace = get_namespace_name_or_temp(nspoid); + + if (strcmp(itarget->indnamespace, relnamespace) != 0) + return false; + } + + /* Check whether relation name matches. */ + return (strcmp(itarget->indname, get_rel_name(indexoid)) == 0); +} + +/* + * Does an unrolled join match the join order specified by an advice target? + */ +static bool +pgpa_walker_join_order_matches(pgpa_unrolled_join *ujoin, + Index rtable_length, + pgpa_identifier *rt_identifiers, + pgpa_advice_target *target, + bool toplevel) +{ + int nchildren = list_length(target->children); + + Assert(target->ttype == PGPA_TARGET_ORDERED_LIST); + + /* At toplevel, we allow a prefix match. */ + if (toplevel) + { + if (nchildren > ujoin->ninner + 1) + return false; + } + else + { + if (nchildren != ujoin->ninner + 1) + return false; + } + + /* Outermost rel must match. */ + if (!pgpa_walker_join_order_matches_member(&ujoin->outer, + rtable_length, + rt_identifiers, + linitial(target->children))) + return false; + + /* Each inner rel must match. */ + for (int n = 0; n < nchildren - 1; ++n) + { + pgpa_advice_target *child_target = list_nth(target->children, n + 1); + + if (!pgpa_walker_join_order_matches_member(&ujoin->inner[n], + rtable_length, + rt_identifiers, + child_target)) + return false; + } + + return true; +} + +/* + * Does one member of an unrolled join match an advice target? 
+ */ +static bool +pgpa_walker_join_order_matches_member(pgpa_join_member *member, + Index rtable_length, + pgpa_identifier *rt_identifiers, + pgpa_advice_target *target) +{ + Bitmapset *relids = NULL; + + if (member->unrolled_join != NULL) + { + if (target->ttype != PGPA_TARGET_ORDERED_LIST) + return false; + return pgpa_walker_join_order_matches(member->unrolled_join, + rtable_length, + rt_identifiers, + target, + false); + } + + Assert(member->scan != NULL); + switch (target->ttype) + { + case PGPA_TARGET_ORDERED_LIST: + /* Could only match an unrolled join */ + return false; + + case PGPA_TARGET_UNORDERED_LIST: + { + foreach_ptr(pgpa_advice_target, child_target, target->children) + { + Index rti; + + rti = pgpa_compute_rti_from_identifier(rtable_length, + rt_identifiers, + &child_target->rid); + if (rti == 0) + return false; + relids = bms_add_member(relids, rti); + } + break; + } + + case PGPA_TARGET_IDENTIFIER: + { + Index rti; + + rti = pgpa_compute_rti_from_identifier(rtable_length, + rt_identifiers, + &target->rid); + if (rti == 0) + return false; + relids = bms_make_singleton(rti); + break; + } + } + + return bms_equal(member->scan->relids, relids); +} + +/* + * Find the scan where the walker says that the given scan strategy should be + * used for the given relid set, if one exists. + * + * Returns the pgpa_scan object, or NULL if none was found. + */ +static pgpa_scan * +pgpa_walker_find_scan(pgpa_plan_walker_context *walker, + pgpa_scan_strategy strategy, + Bitmapset *relids) +{ + List *scans = walker->scans[strategy]; + + foreach_ptr(pgpa_scan, scan, scans) + { + if (bms_equal(scan->relids, relids)) + return scan; + } + + return NULL; +} + +/* + * Does this walker say that the given query feature applies to the given + * relid set? 
+ */ +static bool +pgpa_walker_contains_feature(pgpa_plan_walker_context *walker, + pgpa_qf_type type, + Bitmapset *relids) +{ + List *query_features = walker->query_features[type]; + + foreach_ptr(pgpa_query_feature, qf, query_features) + { + if (bms_equal(qf->relids, relids)) + return true; + } + + return false; +} + +/* + * Does the walker say that the given join strategy should be used for the + * given relid set? + */ +static bool +pgpa_walker_contains_join(pgpa_plan_walker_context *walker, + pgpa_join_strategy strategy, + Bitmapset *relids) +{ + List *join_strategies = walker->join_strategies[strategy]; + + foreach_ptr(Bitmapset, jsrelids, join_strategies) + { + if (bms_equal(jsrelids, relids)) + return true; + } + + return false; +} + +/* + * Does the walker say that the given relids should be marked as NO_GATHER? + */ +static bool +pgpa_walker_contains_no_gather(pgpa_plan_walker_context *walker, + Bitmapset *relids) +{ + return bms_is_subset(relids, walker->no_gather_scans); +} diff --git a/contrib/pg_plan_advice/pgpa_walker.h b/contrib/pg_plan_advice/pgpa_walker.h new file mode 100644 index 00000000000..b91a36ca3dd --- /dev/null +++ b/contrib/pg_plan_advice/pgpa_walker.h @@ -0,0 +1,141 @@ +/*------------------------------------------------------------------------- + * + * pgpa_walker.h + * Plan tree iteration + * + * Copyright (c) 2016-2025, PostgreSQL Global Development Group + * + * contrib/pg_plan_advice/pgpa_walker.h + * + *------------------------------------------------------------------------- + */ +#ifndef PGPA_WALKER_H +#define PGPA_WALKER_H + +#include "pgpa_ast.h" +#include "pgpa_join.h" +#include "pgpa_scan.h" + +/* + * When generating advice, we should emit either SEMIJOIN_UNIQUE advice or + * SEMIJOIN_NON_UNIQUE advice for each semijoin depending on whether we chose + * to implement it as a semijoin or whether we instead chose to make the + * nullable side unique and then perform an inner join. 
When the make-unique + * strategy is not chosen, it's not easy to tell from the final plan tree + * whether it was considered. That's awkward, because we don't want to emit + * useless SEMIJOIN_NON_UNIQUE advice when there was no decision to be made. + * + * To avoid that, during planning, we create a pgpa_sj_unique_rel for each + * relation that we considered making unique for purposes of semijoin planning. + */ +typedef struct pgpa_sj_unique_rel +{ + char *plan_name; + Bitmapset *relids; +} pgpa_sj_unique_rel; + +/* + * We use the term "query feature" to refer to plan nodes that are interesting + * in the following way: to generate advice, we'll need to know the set of + * same-subquery, non-join RTIs occurring at or below that plan node, without + * admixture of parent and child RTIs. + * + * For example, Gather nodes, designated by PGPAQF_GATHER, and Gather Merge + * nodes, designated by PGPAQF_GATHER_MERGE, are query features, because we'll + * want to admit some kind of advice that describes the portion of the plan + * tree that appears beneath those nodes. + * + * Each semijoin can be implemented either by directly performing a semijoin, + * or by making one side unique and then performing a normal join. Either way, + * we use a query feature to notice what decision was made, so that we can + * describe it by enumerating the RTIs on that side of the join. + * + * To elaborate on the "no admixture of parent and child RTIs" rule, in all of + * these cases, if the entirety of an inheritance hierarchy appears beneath + * the query feature, we only want to name the parent table. But it's also + * possible to have cases where we must name child tables. This is particularly + * likely to happen when partitionwise join is in use, but could happen for + * Gather or Gather Merge even without that, if one of those appears below + * an Append or MergeAppend node for a single table. 
+ */ +typedef enum pgpa_qf_type +{ + PGPAQF_GATHER, + PGPAQF_GATHER_MERGE, + PGPAQF_SEMIJOIN_NON_UNIQUE, + PGPAQF_SEMIJOIN_UNIQUE + /* update NUM_PGPA_QF_TYPES if you add anything here */ +} pgpa_qf_type; + +#define NUM_PGPA_QF_TYPES ((int) PGPAQF_SEMIJOIN_UNIQUE + 1) + +/* + * For each query feature, we keep track of the feature type and the set of + * relids that we found underneath the relevant plan node. See the comments + * on pgpa_qf_type, above, for additional details. + */ +typedef struct pgpa_query_feature +{ + pgpa_qf_type type; + Plan *plan; + Bitmapset *relids; +} pgpa_query_feature; + +/* + * Context object for plan tree walk. + * + * pstmt is the PlannedStmt we're studying. + * + * scans is an array of lists of pgpa_scan objects. The array is indexed by + * the scan's pgpa_scan_strategy. + * + * no_gather_scans is the set of scan RTIs that do not appear beneath any + * Gather or Gather Merge node. + * + * toplevel_unrolled_joins is a list of all pgpa_unrolled_join objects that + * are not a child of some other pgpa_unrolled_join. + * + * join_strategies is an array of lists of Bitmapset objects. Each Bitmapset + * is the set of relids that appears on the inner side of some join (excluding + * RTIs from partition children and subqueries). The array is indexed by + * pgpa_join_strategy. + * + * query_features is an array of lists of pgpa_query_feature objects, indexed + * by pgpa_qf_type. + * + * future_query_features is only used during the plan tree walk and should + * be empty when the tree walk concludes. It is a list of pgpa_query_feature + * objects for Plan nodes that the plan tree walk has not yet encountered; + * when encountered, they will be moved to the list of active query features + * that is propagated via the call stack. 
+ */ +typedef struct pgpa_plan_walker_context +{ + PlannedStmt *pstmt; + List *scans[NUM_PGPA_SCAN_STRATEGY]; + Bitmapset *no_gather_scans; + List *toplevel_unrolled_joins; + List *join_strategies[NUM_PGPA_JOIN_STRATEGY]; + List *query_features[NUM_PGPA_QF_TYPES]; + List *future_query_features; +} pgpa_plan_walker_context; + +extern void pgpa_plan_walker(pgpa_plan_walker_context *walker, + PlannedStmt *pstmt, + List *sj_unique_rels); + +extern void pgpa_add_future_feature(pgpa_plan_walker_context *walker, + pgpa_qf_type type, + Plan *plan); + +extern ElidedNode *pgpa_last_elided_node(PlannedStmt *pstmt, Plan *plan); +extern Bitmapset *pgpa_relids(Plan *plan); +extern Index pgpa_scanrelid(Plan *plan); +extern Bitmapset *pgpa_filter_out_join_relids(Bitmapset *relids, List *rtable); + +extern bool pgpa_walker_would_advise(pgpa_plan_walker_context *walker, + pgpa_identifier *rt_identifiers, + pgpa_advice_tag_type tag, + pgpa_advice_target *target); + +#endif diff --git a/contrib/pg_plan_advice/sql/gather.sql b/contrib/pg_plan_advice/sql/gather.sql new file mode 100644 index 00000000000..776666bf196 --- /dev/null +++ b/contrib/pg_plan_advice/sql/gather.sql @@ -0,0 +1,86 @@ +LOAD 'pg_plan_advice'; +SET max_parallel_workers_per_gather = 1; +SET parallel_setup_cost = 0; +SET parallel_tuple_cost = 0; +SET min_parallel_table_scan_size = 0; +SET debug_parallel_query = off; + +CREATE TABLE gt_dim (id serial primary key, dim text) + WITH (autovacuum_enabled = false); +INSERT INTO gt_dim (dim) SELECT random()::text FROM generate_series(1,100) g; +VACUUM ANALYZE gt_dim; + +CREATE TABLE gt_fact ( + id int not null, + dim_id integer not null references gt_dim (id) +) WITH (autovacuum_enabled = false); +INSERT INTO gt_fact + SELECT g, (g%3)+1 FROM generate_series(1,100000) g; +VACUUM ANALYZE gt_fact; + +-- By default, we expect Gather Merge with a parallel hash join. 
+EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; + +-- Force Gather or Gather Merge of both relations together. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'gather_merge((f d))'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; +SET LOCAL pg_plan_advice.advice = 'gather((f d))'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; +COMMIT; + +-- Force a separate Gather or Gather Merge operation for each relation. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'gather_merge(f d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; +SET LOCAL pg_plan_advice.advice = 'gather(f d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; +SET LOCAL pg_plan_advice.advice = 'gather((d d/d.d))'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; +COMMIT; + +-- Force a Gather or Gather Merge on one relation but no parallelism on other. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'gather_merge(f) no_gather(d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; +SET LOCAL pg_plan_advice.advice = 'gather_merge(d) no_gather(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; +SET LOCAL pg_plan_advice.advice = 'gather(f) no_gather(d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; +SET LOCAL pg_plan_advice.advice = 'gather(d) no_gather(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; +COMMIT; + +-- Force no Gather or Gather Merge use at all. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'no_gather(f d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; +COMMIT; + +-- Can't force Gather Merge without the ORDER BY clause, but just Gather is OK. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'gather_merge((f d))'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id; +SET LOCAL pg_plan_advice.advice = 'gather((f d))'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id; +COMMIT; + +-- Test conflicting advice. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'gather((f d)) no_gather(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id; +COMMIT; diff --git a/contrib/pg_plan_advice/sql/join_order.sql b/contrib/pg_plan_advice/sql/join_order.sql new file mode 100644 index 00000000000..5e16e54efad --- /dev/null +++ b/contrib/pg_plan_advice/sql/join_order.sql @@ -0,0 +1,145 @@ +LOAD 'pg_plan_advice'; +SET max_parallel_workers_per_gather = 0; + +CREATE TABLE jo_dim1 (id integer primary key, dim1 text, val1 int) + WITH (autovacuum_enabled = false); +INSERT INTO jo_dim1 (id, dim1, val1) + SELECT g, 'some filler text ' || g, (g % 3) + 1 + FROM generate_series(1,100) g; +VACUUM ANALYZE jo_dim1; +CREATE TABLE jo_dim2 (id integer primary key, dim2 text, val2 int) + WITH (autovacuum_enabled = false); +INSERT INTO jo_dim2 (id, dim2, val2) + SELECT g, 'some filler text ' || g, (g % 7) + 1 + FROM generate_series(1,1000) g; +VACUUM ANALYZE jo_dim2; + +CREATE TABLE jo_fact ( + id int primary key, + dim1_id integer not null references jo_dim1 (id), + dim2_id integer not null references jo_dim2 (id) +) WITH (autovacuum_enabled = false); +INSERT INTO jo_fact + SELECT g, (g%100)+1, (g%100)+1 FROM generate_series(1,100000) g; +VACUUM ANALYZE jo_fact; + +-- We expect to join to d2 first and then d1, since the condition on d2 +-- is more selective. 
+EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; + +-- Force a few different join orders. Some of these are very inefficient, +-- but the planner considers them all viable. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'join_order(f d1 d2)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; +SET LOCAL pg_plan_advice.advice = 'join_order(f d2 d1)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; +SET LOCAL pg_plan_advice.advice = 'join_order(d1 f d2)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; +SET LOCAL pg_plan_advice.advice = 'join_order(f (d1 d2))'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; +SET LOCAL pg_plan_advice.advice = 'join_order(f {d1 d2})'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; +COMMIT; + +-- Force a join order by mentioning just a prefix of the join list. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'join_order(d2)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; +SET LOCAL pg_plan_advice.advice = 'join_order(d2 d1)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; +COMMIT; + +-- jo_fact is not partitioned, but let's try pretending that it is and +-- verifying that the advice does not apply. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'join_order(f/d1 d1 d2)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; +SET LOCAL pg_plan_advice.advice = 'join_order(f/d1 (d1 d2))'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; +COMMIT; + +-- The unusual formulation of this query is intended to prevent the query +-- planner from reducing the FULL JOIN to some other join type, so that we +-- can test what happens with a join type that cannot be reordered. +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_dim1 d1 + INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0) + ON d1.id = f.dim1_id OR f.dim1_id IS NULL; + +-- We should not be able to force the planner to join f to d1 first, because +-- that is not a valid join order, but we should be able to force the planner +-- to make either d2 or f the driving table. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'join_order(f d1 d2)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_dim1 d1 + INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0) + ON d1.id = f.dim1_id OR f.dim1_id IS NULL; +SET LOCAL pg_plan_advice.advice = 'join_order(f d2 d1)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_dim1 d1 + INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0) + ON d1.id = f.dim1_id OR f.dim1_id IS NULL; +SET LOCAL pg_plan_advice.advice = 'join_order(d2 f d1)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_dim1 d1 + INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0) + ON d1.id = f.dim1_id OR f.dim1_id IS NULL; +COMMIT; + +-- Two incompatible join orders should conflict. In the second case, +-- the conflict is implicit: if d1 is on the inner side of a join of any +-- type, it cannot also be the driving table. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'join_order(f) join_order(d1)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_dim1 d1 + INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0) + ON d1.id = f.dim1_id OR f.dim1_id IS NULL; +SET LOCAL pg_plan_advice.advice = 'join_order(d1) hash_join(d1)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM jo_dim1 d1 + INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0) + ON d1.id = f.dim1_id OR f.dim1_id IS NULL; +COMMIT; diff --git a/contrib/pg_plan_advice/sql/join_strategy.sql b/contrib/pg_plan_advice/sql/join_strategy.sql new file mode 100644 index 00000000000..edd5c4c0e14 --- /dev/null +++ b/contrib/pg_plan_advice/sql/join_strategy.sql @@ -0,0 +1,84 @@ +LOAD 'pg_plan_advice'; +SET max_parallel_workers_per_gather = 0; + +CREATE TABLE join_dim (id serial primary key, dim text) + WITH (autovacuum_enabled = false); +INSERT INTO join_dim (dim) SELECT random()::text FROM generate_series(1,100) g; +VACUUM ANALYZE join_dim; + +CREATE TABLE join_fact ( + id int primary key, + 
dim_id integer not null references join_dim (id) +) WITH (autovacuum_enabled = false); +INSERT INTO join_fact + SELECT g, (g%3)+1 FROM generate_series(1,100000) g; +CREATE INDEX join_fact_dim_id ON join_fact (dim_id); +VACUUM ANALYZE join_fact; + +-- We expect a hash join by default. +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + +-- Try forcing each join method in turn with join_dim as the inner table. +-- All of these should work except for MERGE_JOIN_MATERIALIZE; that will +-- fail, because the planner knows that join_dim (id) is unique, and will +-- refuse to add mark/restore overhead. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'HASH_JOIN(d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; +SET LOCAL pg_plan_advice.advice = 'MERGE_JOIN_MATERIALIZE(d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; +SET LOCAL pg_plan_advice.advice = 'MERGE_JOIN_PLAIN(d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_MATERIALIZE(d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_MEMOIZE(d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_PLAIN(d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; +COMMIT; + +-- Now try forcing each join method in turn with join_fact as the inner +-- table. All of these should work. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'HASH_JOIN(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; +SET LOCAL pg_plan_advice.advice = 'MERGE_JOIN_MATERIALIZE(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; +SET LOCAL pg_plan_advice.advice = 'MERGE_JOIN_PLAIN(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_MATERIALIZE(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_MEMOIZE(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_PLAIN(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; +COMMIT; + +-- Non-working cases. We can't force a foreign join between these tables, +-- because they aren't foreign tables. We also can't use two different +-- strategies on the same table, nor can we put both tables on the inner +-- side of the same join. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'FOREIGN_JOIN((f d))'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_PLAIN(f) NESTED_LOOP_MATERIALIZE(f)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; +SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_PLAIN(f d)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; +COMMIT; diff --git a/contrib/pg_plan_advice/sql/local_collector.sql b/contrib/pg_plan_advice/sql/local_collector.sql new file mode 100644 index 00000000000..db1e23488d4 --- /dev/null +++ b/contrib/pg_plan_advice/sql/local_collector.sql @@ -0,0 +1,44 @@ +CREATE EXTENSION pg_plan_advice; +SET debug_parallel_query = off; + +-- Try clearing advice before we've collected any. +SELECT pg_clear_collected_local_advice(); + +-- Set a small advice collection limit so that we'll exceed it. +SET pg_plan_advice.local_collection_limit = 2; + +-- Enable the collector. +SET pg_plan_advice.local_collector = on; + +-- Set up a dummy table. +CREATE TABLE dummy_table (a int primary key, b text) + WITH (autovacuum_enabled = false, parallel_workers = 0); + +-- Test queries. +SELECT * FROM dummy_table a, dummy_table b; +SELECT * FROM dummy_table; + +-- Should return the advice from the second test query. +SELECT advice FROM pg_get_collected_local_advice() ORDER BY id DESC LIMIT 1; + +-- Now try clearing advice again. +SELECT pg_clear_collected_local_advice(); + +-- Raise the collection limit so that the collector uses multiple chunks. +SET pg_plan_advice.local_collection_limit = 2000; + +-- Push a bunch of queries through the collector. +DO $$ +BEGIN + FOR x IN 1..2000 LOOP + EXECUTE 'SELECT * FROM dummy_table'; + END LOOP; +END +$$; + +-- Check that the collector worked. 
+SELECT COUNT(*) FROM pg_get_collected_local_advice(); + +-- And clear one more time, to verify that this doesn't cause a problem +-- even with a larger number of entries. +SELECT pg_clear_collected_local_advice(); diff --git a/contrib/pg_plan_advice/sql/partitionwise.sql b/contrib/pg_plan_advice/sql/partitionwise.sql new file mode 100644 index 00000000000..c51456dbbb5 --- /dev/null +++ b/contrib/pg_plan_advice/sql/partitionwise.sql @@ -0,0 +1,99 @@ +LOAD 'pg_plan_advice'; +SET max_parallel_workers_per_gather = 0; +SET enable_partitionwise_join = true; + +CREATE TABLE pt1 (id integer primary key, dim1 text, val1 int) + PARTITION BY RANGE (id); +CREATE TABLE pt1a PARTITION OF pt1 FOR VALUES FROM (1) to (1001) + WITH (autovacuum_enabled = false); +CREATE TABLE pt1b PARTITION OF pt1 FOR VALUES FROM (1001) to (2001) + WITH (autovacuum_enabled = false); +CREATE TABLE pt1c PARTITION OF pt1 FOR VALUES FROM (2001) to (3001) + WITH (autovacuum_enabled = false); +INSERT INTO pt1 (id, dim1, val1) + SELECT g, 'some filler text ' || g, (g % 3) + 1 + FROM generate_series(1,3000) g; +VACUUM ANALYZE pt1; + +CREATE TABLE pt2 (id integer primary key, dim2 text, val2 int) + PARTITION BY RANGE (id); +CREATE TABLE pt2a PARTITION OF pt2 FOR VALUES FROM (1) to (1001) + WITH (autovacuum_enabled = false); +CREATE TABLE pt2b PARTITION OF pt2 FOR VALUES FROM (1001) to (2001) + WITH (autovacuum_enabled = false); +CREATE TABLE pt2c PARTITION OF pt2 FOR VALUES FROM (2001) to (3001) + WITH (autovacuum_enabled = false); +INSERT INTO pt2 (id, dim2, val2) + SELECT g, 'some other text ' || g, (g % 5) + 1 + FROM generate_series(1,3000,2) g; +VACUUM ANALYZE pt2; + +CREATE TABLE pt3 (id integer primary key, dim3 text, val3 int) + PARTITION BY RANGE (id); +CREATE TABLE pt3a PARTITION OF pt3 FOR VALUES FROM (1) to (1001) + WITH (autovacuum_enabled = false); +CREATE TABLE pt3b PARTITION OF pt3 FOR VALUES FROM (1001) to (2001) + WITH (autovacuum_enabled = false); +CREATE TABLE pt3c PARTITION OF pt3 FOR 
VALUES FROM (2001) to (3001) + WITH (autovacuum_enabled = false); +INSERT INTO pt3 (id, dim3, val3) + SELECT g, 'a third random text ' || g, (g % 7) + 1 + FROM generate_series(1,3000,3) g; +VACUUM ANALYZE pt3; + +CREATE TABLE ptmismatch (id integer primary key, dimm text, valm int) + PARTITION BY RANGE (id); +CREATE TABLE ptmismatcha PARTITION OF ptmismatch + FOR VALUES FROM (1) to (1501) + WITH (autovacuum_enabled = false); +CREATE TABLE ptmismatchb PARTITION OF ptmismatch + FOR VALUES FROM (1501) to (3001) + WITH (autovacuum_enabled = false); +INSERT INTO ptmismatch (id, dimm, valm) + SELECT g, 'yet another text ' || g, (g % 2) + 1 + FROM generate_series(1,3000) g; +VACUUM ANALYZE ptmismatch; + +EXPLAIN (PLAN_ADVICE, COSTS OFF) +SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id + AND val1 = 1 AND val2 = 1 AND val3 = 1; + +-- Suppress partitionwise join, or do it just partially. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'PARTITIONWISE(pt1 pt2 pt3)'; +EXPLAIN (PLAN_ADVICE, COSTS OFF) +SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id + AND val1 = 1 AND val2 = 1 AND val3 = 1; +SET LOCAL pg_plan_advice.advice = 'PARTITIONWISE((pt1 pt2) pt3)'; +EXPLAIN (PLAN_ADVICE, COSTS OFF) +SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id + AND val1 = 1 AND val2 = 1 AND val3 = 1; +COMMIT; + +-- Test conflicting advice. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'PARTITIONWISE((pt1 pt2) (pt1 pt3))'; +EXPLAIN (PLAN_ADVICE, COSTS OFF) +SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id + AND val1 = 1 AND val2 = 1 AND val3 = 1; +COMMIT; + +-- Can't force a partitionwise join with a mismatched table. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'PARTITIONWISE((pt1 ptmismatch))'; +EXPLAIN (PLAN_ADVICE, COSTS OFF) +SELECT * FROM pt1, ptmismatch WHERE pt1.id = ptmismatch.id; +COMMIT; + +-- Force join order for a particular branch of the partitionwise join with +-- and without mentioning the schema name. 
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'JOIN_ORDER(pt3/public.pt3a pt2/public.pt2a pt1/public.pt1a)';
+EXPLAIN (PLAN_ADVICE, COSTS OFF)
+SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id
+ AND val1 = 1 AND val2 = 1 AND val3 = 1;
+SET LOCAL pg_plan_advice.advice = 'JOIN_ORDER(pt3/pt3a pt2/pt2a pt1/pt1a)';
+EXPLAIN (PLAN_ADVICE, COSTS OFF)
+SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id
+ AND val1 = 1 AND val2 = 1 AND val3 = 1;
+COMMIT; diff --git a/contrib/pg_plan_advice/sql/prepared.sql b/contrib/pg_plan_advice/sql/prepared.sql new file mode 100644 index 00000000000..3ec30eedee5 --- /dev/null +++ b/contrib/pg_plan_advice/sql/prepared.sql @@ -0,0 +1,37 @@ +LOAD 'pg_plan_advice'; +SET max_parallel_workers_per_gather = 0; + +CREATE TABLE ptab (id integer, val text) WITH (autovacuum_enabled = false); + +SET pg_plan_advice.always_store_advice_details = false; + +-- Not prepared, so advice should be generated. +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM ptab; + +-- Prepared, so advice should not be generated. +PREPARE pt1 AS SELECT * FROM ptab; +EXPLAIN (COSTS OFF, PLAN_ADVICE) EXECUTE pt1; + +SET pg_plan_advice.always_store_advice_details = true; + +-- Prepared, but always_store_advice_details = true, so should show advice. +PREPARE pt2 AS SELECT * FROM ptab; +EXPLAIN (COSTS OFF, PLAN_ADVICE) EXECUTE pt2; + +-- Not prepared, so feedback should be generated. +SET pg_plan_advice.always_store_advice_details = false; +SET pg_plan_advice.advice = 'SEQ_SCAN(ptab)'; +EXPLAIN (COSTS OFF) +SELECT * FROM ptab; + +-- Prepared, so feedback should not be generated. +PREPARE pt3 AS SELECT * FROM ptab; +EXPLAIN (COSTS OFF) EXECUTE pt3; + +SET pg_plan_advice.always_store_advice_details = true; + +-- Prepared, but always_store_advice_details = true, so should show feedback. 
+PREPARE pt4 AS SELECT * FROM ptab;
+EXPLAIN (COSTS OFF, PLAN_ADVICE) EXECUTE pt4;
+ diff --git a/contrib/pg_plan_advice/sql/scan.sql b/contrib/pg_plan_advice/sql/scan.sql new file mode 100644 index 00000000000..4fc494c7d8e --- /dev/null +++ b/contrib/pg_plan_advice/sql/scan.sql @@ -0,0 +1,195 @@ +LOAD 'pg_plan_advice'; +SET max_parallel_workers_per_gather = 0; +SET seq_page_cost = 0.1; +SET random_page_cost = 0.1; +SET cpu_tuple_cost = 0; +SET cpu_index_tuple_cost = 0; + +CREATE TABLE scan_table (a int primary key, b text) + WITH (autovacuum_enabled = false); +INSERT INTO scan_table + SELECT g, 'some text ' || g FROM generate_series(1, 100000) g; +CREATE INDEX scan_table_b ON scan_table USING brin (b); +VACUUM ANALYZE scan_table; + +-- Sequential scan +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table; + +-- Index scan +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; + +-- Index-only scan +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1; + +-- Bitmap heap scan +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table + WHERE b > 'some text 8'; + +-- TID scan +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE ctid = '(0,1)'; + +-- TID range scan +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table + WHERE ctid > '(1,1)' AND ctid < '(2,1)'; + +-- Try forcing each of our test queries to use the scan type they +-- wanted to use anyway. This should succeed. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(scan_table)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table; +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; +SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(scan_table scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1; +SET LOCAL pg_plan_advice.advice = 'BITMAP_HEAP_SCAN(scan_table)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table + WHERE b > 'some text 8'; +SET LOCAL pg_plan_advice.advice = 'TID_SCAN(scan_table)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE ctid = '(0,1)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table + WHERE ctid > '(1,1)' AND ctid < '(2,1)'; +COMMIT; + +-- Try to force a full scan of the table to use some other scan type. All +-- of these will fail. An index scan or bitmap heap scan could potentially +-- generate the correct answer, but the planner does not even consider these +-- possibilities due to the lack of a WHERE clause. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table; +SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(scan_table scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table; +SET LOCAL pg_plan_advice.advice = 'BITMAP_HEAP_SCAN(scan_table)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table; +SET LOCAL pg_plan_advice.advice = 'TID_SCAN(scan_table)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table; +COMMIT; + +-- Try again to force index use. This should now succeed for the INDEX_SCAN +-- and BITMAP_HEAP_SCAN, but the INDEX_ONLY_SCAN can't be forced because the +-- query fetches columns not included in the index. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a > 0; +SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(scan_table scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a > 0; +SET LOCAL pg_plan_advice.advice = 'BITMAP_HEAP_SCAN(scan_table)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a > 0; +COMMIT; + +-- We can force a primary key lookup to use a sequential scan, but we +-- can't force it to use an index-only scan (due to the column list) +-- or a TID scan (due to the absence of a TID qual). +BEGIN; +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(scan_table)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; +SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(scan_table scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; +SET LOCAL pg_plan_advice.advice = 'TID_SCAN(scan_table)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; +COMMIT; + +-- We can forcibly downgrade an index-only scan to an index scan, but we can't +-- force the use of an index that the planner thinks is inapplicable. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table public.scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_b)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; +COMMIT; + +-- We can force the use of a sequential scan in place of a bitmap heap scan, +-- but a plain index scan on a BRIN index is not possible. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(scan_table)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table + WHERE b > 'some text 8'; +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_b)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; +COMMIT; + +-- We can force the use of a sequential scan rather than a TID scan or +-- TID range scan. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(scan_table)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE ctid = '(0,1)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table + WHERE ctid > '(1,1)' AND ctid < '(2,1)'; +COMMIT; + +-- Test more complex scenarios with index scans. +BEGIN; +-- Should still work if we mention the schema. +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table public.scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; +-- But not if we mention the wrong schema. +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table cilbup.scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; +-- It's OK to repeat the same advice. +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey scan_table scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; +-- But it doesn't work if the index target is even notionally different. +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey scan_table public.scan_table_pkey)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1; +COMMIT; + +-- Test assorted incorrect advice. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(nothing)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1; +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(nothing whatsoever)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1; +SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table bogus)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1; +SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(nothing whatsoever)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1; +SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(scan_table bogus)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1; +COMMIT; + +-- Test our ability to refer to multiple instances of the same alias. +BEGIN; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (generate_series(1,10) g LEFT JOIN scan_table s ON g = s.a) x + LEFT JOIN scan_table s ON g = s.a; +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (generate_series(1,10) g LEFT JOIN scan_table s ON g = s.a) x + LEFT JOIN scan_table s ON g = s.a; +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s#2)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (generate_series(1,10) g LEFT JOIN scan_table s ON g = s.a) x + LEFT JOIN scan_table s ON g = s.a; +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s) SEQ_SCAN(s#2)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (generate_series(1,10) g LEFT JOIN scan_table s ON g = s.a) x + LEFT JOIN scan_table s ON g = s.a; +COMMIT; + +-- Test our ability to refer to scans within a subquery. +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0) x; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0); +BEGIN; +-- Should not match. 
+SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0) x; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0); +-- Should match first query only. +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s@x)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0) x; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0); +-- Should match second query only. +SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s@unnamed_subquery)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0) x; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0); +COMMIT; diff --git a/contrib/pg_plan_advice/sql/semijoin.sql b/contrib/pg_plan_advice/sql/semijoin.sql new file mode 100644 index 00000000000..5a4ae52d1d9 --- /dev/null +++ b/contrib/pg_plan_advice/sql/semijoin.sql @@ -0,0 +1,118 @@ +LOAD 'pg_plan_advice'; +SET max_parallel_workers_per_gather = 0; + +CREATE TABLE sj_wide ( + id integer primary key, + val1 integer, + padding text storage plain +) WITH (autovacuum_enabled = false); +INSERT INTO sj_wide + SELECT g, g%10+1, repeat(' ', 300) FROM generate_series(1, 1000) g; +CREATE INDEX ON sj_wide (val1); +VACUUM ANALYZE sj_wide; + +CREATE TABLE sj_narrow ( + id integer primary key, + val1 integer +) WITH (autovacuum_enabled = false); +INSERT INTO sj_narrow + SELECT g, g%10+1 FROM generate_series(1, 1000) g; +CREATE INDEX ON sj_narrow (val1); +VACUUM ANALYZE sj_narrow; + +-- We expect this to make the VALUES list unique and use index lookups to +-- find the rows in sj_wide, so as to avoid a full scan of sj_wide. 
+EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM sj_wide + WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)); + +-- If we ask for a unique semijoin, we should get the same plan as with +-- no advice. If we ask for a non-unique semijoin, we should see a Semi +-- Join operation in the plan tree. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'semijoin_unique("*VALUES*")'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM sj_wide + WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)); +SET LOCAL pg_plan_advice.advice = 'semijoin_non_unique("*VALUES*")'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM sj_wide + WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)); +COMMIT; + +-- Because this table is narrower than the previous one, a sequential scan +-- is less expensive, and we choose a straightforward Semi Join plan by +-- default. (Note that this is also very sensitive to the length of the IN +-- list, which affects how many index lookups the alternative plan will need.) +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM sj_narrow + WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)); + +-- Here, we expect advising a unique semijoin to switch to the same plan that +-- we got with sj_wide, and advising a non-unique semijoin should not change +-- the plan. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'semijoin_unique("*VALUES*")'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM sj_narrow + WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)); +SET LOCAL pg_plan_advice.advice = 'semijoin_non_unique("*VALUES*")'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM sj_narrow + WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)); +COMMIT; + +-- In the above example, we made the outer side of the join unique, but here, +-- we should make the inner side unique. 
+EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g + WHERE g in (select val1 from sj_narrow); + +-- We should be able to force a plan with or without the make-unique strategy, +-- with either side as the driving table. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'semijoin_unique(sj_narrow)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g + WHERE g in (select val1 from sj_narrow); +SET LOCAL pg_plan_advice.advice = 'semijoin_non_unique(sj_narrow)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g + WHERE g in (select val1 from sj_narrow); +SET LOCAL pg_plan_advice.advice = 'semijoin_unique(sj_narrow) join_order(sj_narrow)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g + WHERE g in (select val1 from sj_narrow); +SET LOCAL pg_plan_advice.advice = 'semijoin_non_unique(sj_narrow) join_order(sj_narrow)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g + WHERE g in (select val1 from sj_narrow); +COMMIT; + +-- However, mentioning the wrong side of the join should result in an advice +-- failure. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'semijoin_unique(g)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g + WHERE g in (select val1 from sj_narrow); +SET LOCAL pg_plan_advice.advice = 'semijoin_non_unique(g)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g + WHERE g in (select val1 from sj_narrow); +COMMIT; + +-- Test conflicting advice. +BEGIN; +SET LOCAL pg_plan_advice.advice = 'semijoin_unique(sj_narrow) semijoin_non_unique(sj_narrow)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g + WHERE g in (select val1 from sj_narrow); +COMMIT; + +-- Try applying SEMIJOIN_UNIQUE() to a non-semijoin. 
+BEGIN; +SET LOCAL pg_plan_advice.advice = 'semijoin_unique(g)'; +EXPLAIN (COSTS OFF, PLAN_ADVICE) +SELECT * FROM generate_series(1,1000) g, sj_narrow s WHERE g = s.val1; +COMMIT; diff --git a/contrib/pg_plan_advice/sql/syntax.sql b/contrib/pg_plan_advice/sql/syntax.sql new file mode 100644 index 00000000000..56a5d54e2b5 --- /dev/null +++ b/contrib/pg_plan_advice/sql/syntax.sql @@ -0,0 +1,68 @@ +LOAD 'pg_plan_advice'; + +-- An empty string is allowed. Empty target lists are allowed for most advice +-- tags, but not for JOIN_ORDER. "Supplied Plan Advice" should be omitted in +-- text format when there is no actual advice, but not in non-text format. +SET pg_plan_advice.advice = ''; +EXPLAIN (COSTS OFF) SELECT 1; +SET pg_plan_advice.advice = 'SEQ_SCAN()'; +EXPLAIN (COSTS OFF) SELECT 1; +SET pg_plan_advice.advice = 'NESTED_LOOP_PLAIN()'; +EXPLAIN (COSTS OFF, FORMAT JSON) SELECT 1; +SET pg_plan_advice.advice = 'JOIN_ORDER()'; + +-- Test assorted variations in capitalization, whitespace, and which parts of +-- the relation identifier are included. These should all work. +SET pg_plan_advice.advice = 'SEQ_SCAN(x)'; +EXPLAIN (COSTS OFF) SELECT 1; +SET pg_plan_advice.advice = 'seq_scan(x@y)'; +EXPLAIN (COSTS OFF) SELECT 1; +SET pg_plan_advice.advice = 'SEQ_scan(x#2)'; +EXPLAIN (COSTS OFF) SELECT 1; +SET pg_plan_advice.advice = 'SEQ_SCAN (x/y)'; +EXPLAIN (COSTS OFF) SELECT 1; +SET pg_plan_advice.advice = ' SEQ_SCAN ( x / y . z ) '; +EXPLAIN (COSTS OFF) SELECT 1; +SET pg_plan_advice.advice = 'SEQ_SCAN("x"#2/"y"."z"@"t")'; +EXPLAIN (COSTS OFF) SELECT 1; + +-- Syntax errors. 
+SET pg_plan_advice.advice = 'SEQUENTIAL_SCAN(x)'; +SET pg_plan_advice.advice = 'SEQ_SCAN'; +SET pg_plan_advice.advice = 'SEQ_SCAN('; +SET pg_plan_advice.advice = 'SEQ_SCAN("'; +SET pg_plan_advice.advice = 'SEQ_SCAN("")'; +SET pg_plan_advice.advice = 'SEQ_SCAN("a"'; +SET pg_plan_advice.advice = 'SEQ_SCAN(#'; +SET pg_plan_advice.advice = '()'; +SET pg_plan_advice.advice = '123'; + +-- Tags like SEQ_SCAN and NO_GATHER don't allow sublists at all; other tags, +-- except for JOIN_ORDER, allow at most one level of sublist. Hence, these +-- examples should error out. +SET pg_plan_advice.advice = 'SEQ_SCAN((x))'; +SET pg_plan_advice.advice = 'GATHER(((x)))'; + +-- Legal comments. +SET pg_plan_advice.advice = '/**/'; +EXPLAIN (COSTS OFF) SELECT 1; +SET pg_plan_advice.advice = 'HASH_JOIN(_)/***/'; +EXPLAIN (COSTS OFF) SELECT 1; +SET pg_plan_advice.advice = '/* comment */ HASH_JOIN(/*x*/y)'; +EXPLAIN (COSTS OFF) SELECT 1; +SET pg_plan_advice.advice = '/* comment */ HASH_JOIN(y//*x*/z)'; +EXPLAIN (COSTS OFF) SELECT 1; + +-- Unterminated comments. +SET pg_plan_advice.advice = '/*'; +SET pg_plan_advice.advice = 'JOIN_ORDER("fOO") /* oops'; + +-- Nested comments are not supported, so the first of these is legal and +-- the second is not. +SET pg_plan_advice.advice = '/*/*/'; +EXPLAIN (COSTS OFF) SELECT 1; +SET pg_plan_advice.advice = '/*/* stuff */*/'; + +-- Foreign join requires multiple relation identifiers. +SET pg_plan_advice.advice = 'FOREIGN_JOIN(a)'; +SET pg_plan_advice.advice = 'FOREIGN_JOIN((a))'; diff --git a/contrib/pg_plan_advice/t/001_regress.pl b/contrib/pg_plan_advice/t/001_regress.pl new file mode 100644 index 00000000000..67595cddf75 --- /dev/null +++ b/contrib/pg_plan_advice/t/001_regress.pl @@ -0,0 +1,148 @@ +# Copyright (c) 2021-2025, PostgreSQL Global Development Group + +# Run the core regression tests under pg_plan_advice to check for problems. 
+use strict; +use warnings FATAL => 'all'; + +use Cwd qw(abs_path); +use File::Basename qw(dirname); + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# Initialize the primary node +my $node = PostgreSQL::Test::Cluster->new('main'); +$node->init(); + +# Set up our desired configuration. +# +# We run with pg_plan_advice.shared_collection_limit set to ensure that the +# plan tree walker code runs against every query in the regression tests. If +# we're unable to properly analyze any of those plan trees, this test should fail. +# +# We set pg_plan_advice.advice to an advice string that will cause the advice +# trove to be populated with a few entries of various sorts, but which we do +# not expect to match anything in the regression test queries. This way, the +# planner hooks will be called, improving code coverage, but no plans should +# actually change. +# +# pg_plan_advice.always_explain_supplied_advice=false is needed to avoid breaking +# regression test queries that use EXPLAIN. In the real world, it seems like +# users will want EXPLAIN output to show supplied advice so that it's clear +# whether normal planner behavior has been altered, but here that's undesirable. +$node->append_conf('postgresql.conf', <start; + +my $srcdir = abs_path("../.."); + +# --dlpath is needed to be able to find the location of regress.so +# and any libraries the regression tests require. +my $dlpath = dirname($ENV{REGRESS_SHLIB}); + +# --outputdir points to the path where to place the output files. +my $outputdir = $PostgreSQL::Test::Utils::tmp_check; + +# --inputdir points to the path of the input files. +my $inputdir = "$srcdir/src/test/regress"; + +# Run the tests. +my $rc = + system($ENV{PG_REGRESS} . " " + . "--bindir= " + . "--dlpath=\"$dlpath\" " + . "--host=" . $node->host . " " + . "--port=" . $node->port . " " + . "--schedule=$srcdir/src/test/regress/parallel_schedule " + . "--max-concurrent-tests=20 " + . "--inputdir=\"$inputdir\" " + . 
"--outputdir=\"$outputdir\""); + +# Dump out the regression diffs file, if there is one +if ($rc != 0) +{ + my $diffs = "$outputdir/regression.diffs"; + if (-e $diffs) + { + print "=== dumping $diffs ===\n"; + print slurp_file($diffs); + print "=== EOF ===\n"; + } +} + +# Report results +is($rc, 0, 'regression tests pass'); + +# Create the extension so we can access the collector +$node->safe_psql('postgres', 'CREATE EXTENSION pg_plan_advice'); + +# Verify that a large amount of advice was collected +my $all_query_count = $node->safe_psql('postgres', <', 20000, "copious advice collected"); + +# Verify that lots of different advice strings were collected +my $distinct_query_count = $node->safe_psql('postgres', <', 3000, "diverse advice collected"); + +# We want to test for the presence of our known tags in the collected advice. +# Put all tags into the hash that follows; map any tags that aren't tested +# by the core regression tests to 0, and others to 1. +my %tag_map = ( + BITMAP_HEAP_SCAN => 1, + FOREIGN_JOIN => 0, + GATHER => 1, + GATHER_MERGE => 1, + HASH_JOIN => 1, + INDEX_ONLY_SCAN => 1, + INDEX_SCAN => 1, + JOIN_ORDER => 1, + MERGE_JOIN_MATERIALIZE => 1, + MERGE_JOIN_PLAIN => 1, + NESTED_LOOP_MATERIALIZE => 1, + NESTED_LOOP_MEMOIZE => 1, + NESTED_LOOP_PLAIN => 1, + NO_GATHER => 1, + PARTITIONWISE => 1, + SEMIJOIN_NON_UNIQUE => 1, + SEMIJOIN_UNIQUE => 1, + SEQ_SCAN => 1, + TID_SCAN => 1, +); +for my $tag (sort keys %tag_map) +{ + my $checkit = $tag_map{$tag}; + + # Search for the given tag. This is not entirely robust: it could get thrown + # off by a table alias such as "FOREIGN_JOIN(", but that probably won't + # happen in the core regression tests. + my $tag_count = $node->safe_psql('postgres', <', 10, "multiple uses of $tag") if $checkit; + + # Regardless, note the exact count in the log, for human consumption. 
+ note("found $tag_count advice strings containing $tag"); +} + +# Trigger a partial cleanup of the shared advice collector, and then a full +# cleanup. +$node->safe_psql('postgres', < 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif +-- Index 50 translations of the word "Mathematics" +CREATE TEMP TABLE mb (s text); +\copy mb from 'data/trgm_utf8.data' +CREATE INDEX ON mb USING gist(s gist_trgm_ops); diff --git a/contrib/pg_trgm/expected/pg_utf8_trgm_1.out b/contrib/pg_trgm/expected/pg_utf8_trgm_1.out new file mode 100644 index 00000000000..8505c4fa552 --- /dev/null +++ b/contrib/pg_trgm/expected/pg_utf8_trgm_1.out @@ -0,0 +1,3 @@ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit diff --git a/contrib/pg_trgm/meson.build b/contrib/pg_trgm/meson.build index 3cc299d5eaa..3ecf95ba862 100644 --- a/contrib/pg_trgm/meson.build +++ b/contrib/pg_trgm/meson.build @@ -39,6 +39,7 @@ tests += { 'regress': { 'sql': [ 'pg_trgm', + 'pg_utf8_trgm', 'pg_word_trgm', 'pg_strict_word_trgm', ], diff --git a/contrib/pg_trgm/sql/pg_utf8_trgm.sql b/contrib/pg_trgm/sql/pg_utf8_trgm.sql new file mode 100644 index 00000000000..0dd962ced83 --- /dev/null +++ b/contrib/pg_trgm/sql/pg_utf8_trgm.sql @@ -0,0 +1,9 @@ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif + +-- Index 50 translations of the word "Mathematics" +CREATE TEMP TABLE mb (s text); +\copy mb from 'data/trgm_utf8.data' +CREATE INDEX ON mb USING gist(s gist_trgm_ops); diff --git a/contrib/pg_trgm/trgm.h b/contrib/pg_trgm/trgm.h index ca017585369..ca23aad4dd9 100644 --- a/contrib/pg_trgm/trgm.h +++ b/contrib/pg_trgm/trgm.h @@ -47,7 +47,7 @@ typedef char trgm[3]; } while(0) extern int (*CMPTRGM) (const void *a, const void *b); -#define ISWORDCHR(c) (t_isalnum(c)) +#define ISWORDCHR(c, len) (t_isalnum_with_len(c, len)) #define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && (isalnum( *(unsigned char*)(a) ) || *(unsigned char*)(a)==' ') ) #define 
ISPRINTABLETRGM(t) ( ISPRINTABLECHAR( ((char*)(t)) ) && ISPRINTABLECHAR( ((char*)(t))+1 ) && ISPRINTABLECHAR( ((char*)(t))+2 ) ) diff --git a/contrib/pg_trgm/trgm_gist.c b/contrib/pg_trgm/trgm_gist.c index 2f0d61985a5..685275a0f9b 100644 --- a/contrib/pg_trgm/trgm_gist.c +++ b/contrib/pg_trgm/trgm_gist.c @@ -701,10 +701,13 @@ gtrgm_penalty(PG_FUNCTION_ARGS) if (ISARRKEY(newval)) { char *cache = (char *) fcinfo->flinfo->fn_extra; - TRGM *cachedVal = (TRGM *) (cache + MAXALIGN(siglen)); + TRGM *cachedVal = NULL; Size newvalsize = VARSIZE(newval); BITVECP sign; + if (cache != NULL) + cachedVal = (TRGM *) (cache + MAXALIGN(siglen)); + /* * Cache the sign data across multiple calls with the same newval. */ diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c index 81182a15e07..5fba594b61f 100644 --- a/contrib/pg_trgm/trgm_op.c +++ b/contrib/pg_trgm/trgm_op.c @@ -66,6 +66,78 @@ typedef uint8 TrgmBound; #define WORD_SIMILARITY_STRICT 0x02 /* force bounds of extent to match * word bounds */ +/* + * A growable array of trigrams + * + * The actual array of trigrams is in 'datum'. Note that the other fields in + * 'datum', i.e. datum->flags and the varlena length, are not kept up to date + * when items are added to the growable array. We merely reserve the space + * for them here. You must fill those other fields before using 'datum' as a + * proper TRGM datum. + */ +typedef struct +{ + TRGM *datum; /* trigram array */ + int length; /* number of trigrams in the array */ + int allocated; /* allocated size of 'datum' (# of trigrams) */ +} growable_trgm_array; + +/* + * Allocate a new growable array. + * + * 'slen' is the size of the source string that we're extracting the trigrams + * from. It is used to choose the initial size of the array. 
+ */ +static void +init_trgm_array(growable_trgm_array *arr, int slen) +{ + size_t init_size; + + /* + * In the extreme case, the input string consists entirely of one + * character words, like "a b c", where each word is expanded to two + * trigrams. This is not a strict upper bound though, because when + * IGNORECASE is defined, we convert the input string to lowercase before + * extracting the trigrams, which in rare cases can expand one input + * character into multiple characters. + */ + init_size = (size_t) slen + 1; + + /* + * Guard against possible overflow in the palloc request. (We don't worry + * about the additive constants, since palloc can detect requests that are + * a little above MaxAllocSize --- we just need to prevent integer + * overflow in the multiplications.) + */ + if (init_size > MaxAllocSize / sizeof(trgm)) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("out of memory"))); + + arr->datum = palloc(CALCGTSIZE(ARRKEY, init_size)); + arr->allocated = init_size; + arr->length = 0; +} + +/* Make sure the array can hold at least 'needed' more trigrams */ +static void +enlarge_trgm_array(growable_trgm_array *arr, int needed) +{ + size_t new_needed = (size_t) arr->length + needed; + + if (new_needed > arr->allocated) + { + /* Guard against possible overflow, like in init_trgm_array */ + if (new_needed > MaxAllocSize / sizeof(trgm)) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("out of memory"))); + + arr->datum = repalloc(arr->datum, CALCGTSIZE(ARRKEY, new_needed)); + arr->allocated = new_needed; + } +} + /* * Module load callback */ @@ -220,22 +292,31 @@ comp_trgm(const void *a, const void *b) * endword points to the character after word */ static char * -find_word(char *str, int lenstr, char **endword, int *charlen) +find_word(char *str, int lenstr, char **endword) { char *beginword = str; + const char *endstr = str + lenstr; - while (beginword - str < lenstr && !ISWORDCHR(beginword)) - beginword += 
pg_mblen(beginword); + while (beginword < endstr) + { + int clen = pg_mblen_range(beginword, endstr); - if (beginword - str >= lenstr) + if (ISWORDCHR(beginword, clen)) + break; + beginword += clen; + } + + if (beginword >= endstr) return NULL; *endword = beginword; - *charlen = 0; - while (*endword - str < lenstr && ISWORDCHR(*endword)) + while (*endword < endstr) { - *endword += pg_mblen(*endword); - (*charlen)++; + int clen = pg_mblen_range(*endword, endstr); + + if (!ISWORDCHR(*endword, clen)) + break; + *endword += clen; } return beginword; @@ -269,78 +350,138 @@ compact_trigram(trgm *tptr, char *str, int bytelen) } /* - * Adds trigrams from words (already padded). + * Adds trigrams from the word in 'str' (already padded if necessary). */ -static trgm * -make_trigrams(trgm *tptr, char *str, int bytelen, int charlen) +static void +make_trigrams(growable_trgm_array *dst, char *str, int bytelen) { + trgm *tptr; char *ptr = str; - if (charlen < 3) - return tptr; + if (bytelen < 3) + return; - if (bytelen > charlen) - { - /* Find multibyte character boundaries and apply compact_trigram */ - int lenfirst = pg_mblen(str), - lenmiddle = pg_mblen(str + lenfirst), - lenlast = pg_mblen(str + lenfirst + lenmiddle); + /* max number of trigrams = strlen - 2 */ + enlarge_trgm_array(dst, bytelen - 2); + tptr = GETARR(dst->datum) + dst->length; - while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen) + if (pg_encoding_max_length(GetDatabaseEncoding()) == 1) + { + while (ptr < str + bytelen - 2) { - compact_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast); - - ptr += lenfirst; + CPTRGM(tptr, ptr); + ptr++; tptr++; - - lenfirst = lenmiddle; - lenmiddle = lenlast; - lenlast = pg_mblen(ptr + lenfirst + lenmiddle); } } else { - /* Fast path when there are no multibyte characters */ - Assert(bytelen == charlen); + int lenfirst, + lenmiddle, + lenlast; + char *endptr; - while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ ) + /* + * Fast path as long as 
there are no multibyte characters + */ + if (!IS_HIGHBIT_SET(ptr[0]) && !IS_HIGHBIT_SET(ptr[1])) { - CPTRGM(tptr, ptr); - ptr++; + while (!IS_HIGHBIT_SET(ptr[2])) + { + CPTRGM(tptr, ptr); + ptr++; + tptr++; + + if (ptr == str + bytelen - 2) + goto done; + } + + lenfirst = 1; + lenmiddle = 1; + lenlast = pg_mblen_unbounded(ptr + 2); + } + else + { + lenfirst = pg_mblen_unbounded(ptr); + if (ptr + lenfirst >= str + bytelen) + goto done; + lenmiddle = pg_mblen_unbounded(ptr + lenfirst); + if (ptr + lenfirst + lenmiddle >= str + bytelen) + goto done; + lenlast = pg_mblen_unbounded(ptr + lenfirst + lenmiddle); + } + + /* + * Slow path to handle any remaining multibyte characters + * + * As we go, 'ptr' points to the beginning of the current + * three-character string and 'endptr' points to just past it. + */ + endptr = ptr + lenfirst + lenmiddle + lenlast; + while (endptr <= str + bytelen) + { + compact_trigram(tptr, ptr, endptr - ptr); tptr++; + + /* Advance to the next character */ + if (endptr == str + bytelen) + break; + ptr += lenfirst; + lenfirst = lenmiddle; + lenmiddle = lenlast; + lenlast = pg_mblen_unbounded(endptr); + endptr += lenlast; } } - return tptr; +done: + dst->length = tptr - GETARR(dst->datum); + Assert(dst->length <= dst->allocated); } /* * Make array of trigrams without sorting and removing duplicate items. * - * trg: where to return the array of trigrams. + * dst: where to return the array of trigrams. * str: source string, of length slen bytes. - * bounds: where to return bounds of trigrams (if needed). - * - * Returns length of the generated array. + * bounds_p: where to return bounds of trigrams (if needed). 
*/ -static int -generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds) +static void +generate_trgm_only(growable_trgm_array *dst, char *str, int slen, TrgmBound **bounds_p) { - trgm *tptr; + size_t buflen; char *buf; - int charlen, - bytelen; + int bytelen; char *bword, *eword; + TrgmBound *bounds = NULL; + int bounds_allocated = 0; - if (slen + LPADDING + RPADDING < 3 || slen == 0) - return 0; + init_trgm_array(dst, slen); - tptr = trg; + /* + * If requested, allocate an array for the bounds, with the same size as + * the trigram array. + */ + if (bounds_p) + { + bounds_allocated = dst->allocated; + bounds = *bounds_p = palloc0_array(TrgmBound, bounds_allocated); + } - /* Allocate a buffer for case-folded, blank-padded words */ - buf = (char *) palloc(slen * pg_database_encoding_max_length() + 4); + if (slen + LPADDING + RPADDING < 3 || slen == 0) + return; + /* + * Allocate a buffer for case-folded, blank-padded words. + * + * As an initial guess, allocate a buffer large enough to hold the + * original string with padding, which is always enough when compiled with + * !IGNORECASE. If the case-folding produces a string longer than the + * original, we'll grow the buffer. 
+ */ + buflen = (size_t) slen + 4; + buf = (char *) palloc(buflen); if (LPADDING > 0) { *buf = ' '; @@ -349,52 +490,59 @@ generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds) } eword = str; - while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL) + while ((bword = find_word(eword, slen - (eword - str), &eword)) != NULL) { + int oldlen; + + /* Convert word to lower case before extracting trigrams from it */ #ifdef IGNORECASE - bword = str_tolower(bword, eword - bword, DEFAULT_COLLATION_OID); - bytelen = strlen(bword); + { + char *lowered; + + lowered = str_tolower(bword, eword - bword, DEFAULT_COLLATION_OID); + bytelen = strlen(lowered); + + /* grow the buffer if necessary */ + if (bytelen > buflen - 4) + { + pfree(buf); + buflen = (size_t) bytelen + 4; + buf = (char *) palloc(buflen); + if (LPADDING > 0) + { + *buf = ' '; + if (LPADDING > 1) + *(buf + 1) = ' '; + } + } + memcpy(buf + LPADDING, lowered, bytelen); + pfree(lowered); + } #else bytelen = eword - bword; -#endif - memcpy(buf + LPADDING, bword, bytelen); - -#ifdef IGNORECASE - pfree(bword); #endif buf[LPADDING + bytelen] = ' '; buf[LPADDING + bytelen + 1] = ' '; /* Calculate trigrams marking their bounds if needed */ + oldlen = dst->length; + make_trigrams(dst, buf, bytelen + LPADDING + RPADDING); if (bounds) - bounds[tptr - trg] |= TRGM_BOUND_LEFT; - tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING, - charlen + LPADDING + RPADDING); - if (bounds) - bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT; + { + if (bounds_allocated < dst->length) + { + bounds = repalloc0_array(bounds, TrgmBound, bounds_allocated, dst->allocated); + bounds_allocated = dst->allocated; + } + + bounds[oldlen] |= TRGM_BOUND_LEFT; + bounds[dst->length - 1] |= TRGM_BOUND_RIGHT; + } } pfree(buf); - - return tptr - trg; -} - -/* - * Guard against possible overflow in the palloc requests below. 
(We - * don't worry about the additive constants, since palloc can detect - * requests that are a little above MaxAllocSize --- we just need to - * prevent integer overflow in the multiplications.) - */ -static void -protect_out_of_mem(int slen) -{ - if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) || - (Size) slen >= (MaxAllocSize / pg_database_encoding_max_length())) - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("out of memory"))); } /* @@ -408,19 +556,14 @@ TRGM * generate_trgm(char *str, int slen) { TRGM *trg; + growable_trgm_array arr; int len; - protect_out_of_mem(slen); - - trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3); + generate_trgm_only(&arr, str, slen, NULL); + len = arr.length; + trg = arr.datum; trg->flag = ARRKEY; - len = generate_trgm_only(GETARR(trg), str, slen, NULL); - SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len)); - - if (len == 0) - return trg; - /* * Make trigrams unique. */ @@ -675,8 +818,8 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2, { bool *found; pos_trgm *ptrg; - trgm *trg1; - trgm *trg2; + growable_trgm_array trg1; + growable_trgm_array trg2; int len1, len2, len, @@ -685,27 +828,21 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2, ulen1; int *trg2indexes; float4 result; - TrgmBound *bounds; - - protect_out_of_mem(slen1 + slen2); + TrgmBound *bounds = NULL; /* Make positional trigrams */ - trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3); - trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3); - if (flags & WORD_SIMILARITY_STRICT) - bounds = (TrgmBound *) palloc0(sizeof(TrgmBound) * (slen2 / 2 + 1) * 3); - else - bounds = NULL; - len1 = generate_trgm_only(trg1, str1, slen1, NULL); - len2 = generate_trgm_only(trg2, str2, slen2, bounds); + generate_trgm_only(&trg1, str1, slen1, NULL); + len1 = trg1.length; + generate_trgm_only(&trg2, str2, slen2, (flags & WORD_SIMILARITY_STRICT) ? 
&bounds : NULL); + len2 = trg2.length; - ptrg = make_positional_trgm(trg1, len1, trg2, len2); + ptrg = make_positional_trgm(GETARR(trg1.datum), len1, GETARR(trg2.datum), len2); len = len1 + len2; qsort(ptrg, len, sizeof(pos_trgm), comp_ptrgm); - pfree(trg1); - pfree(trg2); + pfree(trg1.datum); + pfree(trg2.datum); /* * Merge positional trigrams array: enumerate each trigram and find its @@ -761,20 +898,20 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2, * str: source string, of length lenstr bytes (need not be null-terminated) * buf: where to return the substring (must be long enough) * *bytelen: receives byte length of the found substring - * *charlen: receives character length of the found substring * * Returns pointer to end+1 of the found substring in the source string. - * Returns NULL if no word found (in which case buf, bytelen, charlen not set) + * Returns NULL if no word found (in which case buf, bytelen is not set) * * If the found word is bounded by non-word characters or string boundaries * then this function will include corresponding padding spaces into buf. */ static const char * get_wildcard_part(const char *str, int lenstr, - char *buf, int *bytelen, int *charlen) + char *buf, int *bytelen) { const char *beginword = str; const char *endword; + const char *endstr = str + lenstr; char *s = buf; bool in_leading_wildcard_meta = false; bool in_trailing_wildcard_meta = false; @@ -787,11 +924,13 @@ get_wildcard_part(const char *str, int lenstr, * from this loop to the next one, since we may exit at a word character * that is in_escape. 
*/ - while (beginword - str < lenstr) + while (beginword < endstr) { + clen = pg_mblen_range(beginword, endstr); + if (in_escape) { - if (ISWORDCHR(beginword)) + if (ISWORDCHR(beginword, clen)) break; in_escape = false; in_leading_wildcard_meta = false; @@ -802,12 +941,12 @@ get_wildcard_part(const char *str, int lenstr, in_escape = true; else if (ISWILDCARDCHAR(beginword)) in_leading_wildcard_meta = true; - else if (ISWORDCHR(beginword)) + else if (ISWORDCHR(beginword, clen)) break; else in_leading_wildcard_meta = false; } - beginword += pg_mblen(beginword); + beginword += clen; } /* @@ -820,18 +959,13 @@ get_wildcard_part(const char *str, int lenstr, * Add left padding spaces if preceding character wasn't wildcard * meta-character. */ - *charlen = 0; if (!in_leading_wildcard_meta) { if (LPADDING > 0) { *s++ = ' '; - (*charlen)++; if (LPADDING > 1) - { *s++ = ' '; - (*charlen)++; - } } } @@ -840,15 +974,14 @@ get_wildcard_part(const char *str, int lenstr, * string boundary. Strip escapes during copy. 
*/ endword = beginword; - while (endword - str < lenstr) + while (endword < endstr) { - clen = pg_mblen(endword); + clen = pg_mblen_range(endword, endstr); if (in_escape) { - if (ISWORDCHR(endword)) + if (ISWORDCHR(endword, clen)) { memcpy(s, endword, clen); - (*charlen)++; s += clen; } else @@ -873,10 +1006,9 @@ get_wildcard_part(const char *str, int lenstr, in_trailing_wildcard_meta = true; break; } - else if (ISWORDCHR(endword)) + else if (ISWORDCHR(endword, clen)) { memcpy(s, endword, clen); - (*charlen)++; s += clen; } else @@ -894,12 +1026,8 @@ get_wildcard_part(const char *str, int lenstr, if (RPADDING > 0) { *s++ = ' '; - (*charlen)++; if (RPADDING > 1) - { *s++ = ' '; - (*charlen)++; - } } } @@ -918,24 +1046,21 @@ TRGM * generate_wildcard_trgm(const char *str, int slen) { TRGM *trg; - char *buf, - *buf2; - trgm *tptr; + growable_trgm_array arr; + char *buf; int len, - charlen, bytelen; const char *eword; - protect_out_of_mem(slen); - - trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3); - trg->flag = ARRKEY; - SET_VARSIZE(trg, TRGMHDRSIZE); - if (slen + LPADDING + RPADDING < 3 || slen == 0) + { + trg = (TRGM *) palloc(TRGMHDRSIZE); + trg->flag = ARRKEY; + SET_VARSIZE(trg, TRGMHDRSIZE); return trg; + } - tptr = GETARR(trg); + init_trgm_array(&arr, slen); /* Allocate a buffer for blank-padded, but not yet case-folded, words */ buf = palloc_array(char, slen + 4); @@ -945,39 +1070,41 @@ generate_wildcard_trgm(const char *str, int slen) */ eword = str; while ((eword = get_wildcard_part(eword, slen - (eword - str), - buf, &bytelen, &charlen)) != NULL) + buf, &bytelen)) != NULL) { + char *word; + #ifdef IGNORECASE - buf2 = str_tolower(buf, bytelen, DEFAULT_COLLATION_OID); - bytelen = strlen(buf2); + word = str_tolower(buf, bytelen, DEFAULT_COLLATION_OID); + bytelen = strlen(word); #else - buf2 = buf; + word = buf; #endif /* * count trigrams */ - tptr = make_trigrams(tptr, buf2, bytelen, charlen); + make_trigrams(&arr, word, bytelen); #ifdef 
IGNORECASE - pfree(buf2); + pfree(word); #endif } pfree(buf); - if ((len = tptr - GETARR(trg)) == 0) - return trg; - /* * Make trigrams unique. */ + trg = arr.datum; + len = arr.length; if (len > 1) { qsort(GETARR(trg), len, sizeof(trgm), comp_trgm); len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm); } + trg->flag = ARRKEY; SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len)); return trg; diff --git a/contrib/pg_trgm/trgm_regexp.c b/contrib/pg_trgm/trgm_regexp.c index 1d1b5fe304d..efee4cf5fb4 100644 --- a/contrib/pg_trgm/trgm_regexp.c +++ b/contrib/pg_trgm/trgm_regexp.c @@ -483,7 +483,7 @@ static TRGM *createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph, static void RE_compile(regex_t *regex, text *text_re, int cflags, Oid collation); static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA); -static bool convertPgWchar(pg_wchar c, trgm_mb_char *result); +static int convertPgWchar(pg_wchar c, trgm_mb_char *result); static void transformGraph(TrgmNFA *trgmNFA); static void processState(TrgmNFA *trgmNFA, TrgmState *state); static void addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key); @@ -807,10 +807,11 @@ getColorInfo(regex_t *regex, TrgmNFA *trgmNFA) for (j = 0; j < charsCount; j++) { trgm_mb_char c; + int clen = convertPgWchar(chars[j], &c); - if (!convertPgWchar(chars[j], &c)) + if (!clen) continue; /* ok to ignore it altogether */ - if (ISWORDCHR(c.bytes)) + if (ISWORDCHR(c.bytes, clen)) colorInfo->wordChars[colorInfo->wordCharsCount++] = c; else colorInfo->containsNonWord = true; @@ -822,13 +823,15 @@ getColorInfo(regex_t *regex, TrgmNFA *trgmNFA) /* * Convert pg_wchar to multibyte format. - * Returns false if the character should be ignored completely. + * Returns 0 if the character should be ignored completely, else returns its + * byte length. 
*/ -static bool +static int convertPgWchar(pg_wchar c, trgm_mb_char *result) { /* "s" has enough space for a multibyte character and a trailing NUL */ char s[MAX_MULTIBYTE_CHAR_LEN + 1]; + int clen; /* * We can ignore the NUL character, since it can never appear in a PG text @@ -836,11 +839,11 @@ convertPgWchar(pg_wchar c, trgm_mb_char *result) * reconstructing trigrams. */ if (c == 0) - return false; + return 0; /* Do the conversion, making sure the result is NUL-terminated */ memset(s, 0, sizeof(s)); - pg_wchar2mb_with_len(&c, s, 1); + clen = pg_wchar2mb_with_len(&c, s, 1); /* * In IGNORECASE mode, we can ignore uppercase characters. We assume that @@ -857,12 +860,12 @@ convertPgWchar(pg_wchar c, trgm_mb_char *result) */ #ifdef IGNORECASE { - char *lowerCased = str_tolower(s, strlen(s), DEFAULT_COLLATION_OID); + char *lowerCased = str_tolower(s, clen, DEFAULT_COLLATION_OID); if (strcmp(lowerCased, s) != 0) { pfree(lowerCased); - return false; + return 0; } pfree(lowerCased); } @@ -870,7 +873,7 @@ convertPgWchar(pg_wchar c, trgm_mb_char *result) /* Fill result with exactly MAX_MULTIBYTE_CHAR_LEN bytes */ memcpy(result->bytes, s, MAX_MULTIBYTE_CHAR_LEN); - return true; + return clen; } diff --git a/contrib/pg_visibility/expected/pg_visibility.out b/contrib/pg_visibility/expected/pg_visibility.out index e10f1706015..d26f0ab7589 100644 --- a/contrib/pg_visibility/expected/pg_visibility.out +++ b/contrib/pg_visibility/expected/pg_visibility.out @@ -207,7 +207,7 @@ select pg_truncate_visibility_map('test_partition'); -- test the case where vacuum phase I does not need to modify the heap buffer -- and only needs to set the VM -create table test_vac_unmodified_heap(a int); +create temp table test_vac_unmodified_heap(a int); insert into test_vac_unmodified_heap values (1); vacuum (freeze) test_vac_unmodified_heap; select pg_visibility_map_summary('test_vac_unmodified_heap'); diff --git a/contrib/pg_visibility/sql/pg_visibility.sql 
b/contrib/pg_visibility/sql/pg_visibility.sql index 57af8a0c5b6..0888adb96a6 100644 --- a/contrib/pg_visibility/sql/pg_visibility.sql +++ b/contrib/pg_visibility/sql/pg_visibility.sql @@ -97,7 +97,7 @@ select pg_truncate_visibility_map('test_partition'); -- test the case where vacuum phase I does not need to modify the heap buffer -- and only needs to set the VM -create table test_vac_unmodified_heap(a int); +create temp table test_vac_unmodified_heap(a int); insert into test_vac_unmodified_heap values (1); vacuum (freeze) test_vac_unmodified_heap; select pg_visibility_map_summary('test_vac_unmodified_heap'); diff --git a/contrib/pgcrypto/Makefile b/contrib/pgcrypto/Makefile index 69afa375011..17d2b0c5ed1 100644 --- a/contrib/pgcrypto/Makefile +++ b/contrib/pgcrypto/Makefile @@ -44,7 +44,8 @@ REGRESS = init md5 sha1 hmac-md5 hmac-sha1 blowfish rijndael \ sha2 des 3des cast5 \ crypt-des crypt-md5 crypt-blowfish crypt-xdes \ pgp-armor pgp-decrypt pgp-encrypt pgp-encrypt-md5 $(CF_PGP_TESTS) \ - pgp-pubkey-decrypt pgp-pubkey-encrypt pgp-info crypt-shacrypt + pgp-pubkey-decrypt pgp-pubkey-encrypt pgp-pubkey-session \ + pgp-info crypt-shacrypt ifdef USE_PGXS PG_CONFIG = pg_config diff --git a/contrib/pgcrypto/crypt-sha.c b/contrib/pgcrypto/crypt-sha.c index 7ec21771a83..e8f32bc3896 100644 --- a/contrib/pgcrypto/crypt-sha.c +++ b/contrib/pgcrypto/crypt-sha.c @@ -328,7 +328,7 @@ px_crypt_shacrypt(const char *pw, const char *salt, char *passwd, unsigned dstle ereport(ERROR, errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid character in salt string: \"%.*s\"", - pg_mblen(ep), ep)); + pg_mblen_cstr(ep), ep)); } else { diff --git a/contrib/pgcrypto/expected/pgp-decrypt.out b/contrib/pgcrypto/expected/pgp-decrypt.out index eb049ba9d44..8ce6466f2e9 100644 --- a/contrib/pgcrypto/expected/pgp-decrypt.out +++ b/contrib/pgcrypto/expected/pgp-decrypt.out @@ -315,7 +315,7 @@ SaV9L04ky1qECNDx3XjnoKLC+H7IOQ== \xda39a3ee5e6b4b0d3255bfef95601890afd80709 (1 row) -select 
digest(pgp_sym_decrypt(dearmor(' +select digest(pgp_sym_decrypt_bytea(dearmor(' -----BEGIN PGP MESSAGE----- Comment: dat3.aes.sha1.mdc.s2k3.z0 @@ -387,6 +387,28 @@ ERROR: Wrong key or corrupt data select pgp_sym_decrypt(pgp_sym_encrypt_bytea('P', 'key'), 'key', 'debug=1'); NOTICE: dbg: parse_literal_data: data type=b ERROR: Not text data +-- NUL byte in text decrypt. Ciphertext source: +-- printf 'a\x00\xc' | gpg --homedir /nonexistent \ +-- --personal-compress-preferences uncompressed --textmode \ +-- --personal-cipher-preferences aes --no-emit-version --batch \ +-- --symmetric --passphrase key --armor +do $$ +begin + perform pgp_sym_decrypt(dearmor(' +-----BEGIN PGP MESSAGE----- + +jA0EBwMCXLc8pozB10Fg0jQBVUID59TLvWutJp0j6eh9ZgjqIRzdYaIymFB8y4XH +vu0YlJP5D5BX7yqZ+Pry7TlDmiFO +=rV7z +-----END PGP MESSAGE----- +'), 'key', 'debug=1'); +exception when others then + raise '%', + regexp_replace(sqlerrm, 'encoding "[^"]*"', 'encoding [REDACTED]'); +end +$$; +ERROR: invalid byte sequence for encoding [REDACTED]: 0x00 +CONTEXT: PL/pgSQL function inline_code_block line 12 at RAISE -- Decryption with a certain incorrect key yields an apparent BZip2-compressed -- plaintext. Ciphertext source: iterative pgp_sym_encrypt('secret', 'key') -- until the random prefix gave rise to that property. 
diff --git a/contrib/pgcrypto/expected/pgp-decrypt_1.out b/contrib/pgcrypto/expected/pgp-decrypt_1.out index 80a4c48613d..ee57ad43cb7 100644 --- a/contrib/pgcrypto/expected/pgp-decrypt_1.out +++ b/contrib/pgcrypto/expected/pgp-decrypt_1.out @@ -311,7 +311,7 @@ SaV9L04ky1qECNDx3XjnoKLC+H7IOQ== \xda39a3ee5e6b4b0d3255bfef95601890afd80709 (1 row) -select digest(pgp_sym_decrypt(dearmor(' +select digest(pgp_sym_decrypt_bytea(dearmor(' -----BEGIN PGP MESSAGE----- Comment: dat3.aes.sha1.mdc.s2k3.z0 @@ -383,6 +383,28 @@ ERROR: Wrong key or corrupt data select pgp_sym_decrypt(pgp_sym_encrypt_bytea('P', 'key'), 'key', 'debug=1'); NOTICE: dbg: parse_literal_data: data type=b ERROR: Not text data +-- NUL byte in text decrypt. Ciphertext source: +-- printf 'a\x00\xc' | gpg --homedir /nonexistent \ +-- --personal-compress-preferences uncompressed --textmode \ +-- --personal-cipher-preferences aes --no-emit-version --batch \ +-- --symmetric --passphrase key --armor +do $$ +begin + perform pgp_sym_decrypt(dearmor(' +-----BEGIN PGP MESSAGE----- + +jA0EBwMCXLc8pozB10Fg0jQBVUID59TLvWutJp0j6eh9ZgjqIRzdYaIymFB8y4XH +vu0YlJP5D5BX7yqZ+Pry7TlDmiFO +=rV7z +-----END PGP MESSAGE----- +'), 'key', 'debug=1'); +exception when others then + raise '%', + regexp_replace(sqlerrm, 'encoding "[^"]*"', 'encoding [REDACTED]'); +end +$$; +ERROR: invalid byte sequence for encoding [REDACTED]: 0x00 +CONTEXT: PL/pgSQL function inline_code_block line 12 at RAISE -- Decryption with a certain incorrect key yields an apparent BZip2-compressed -- plaintext. Ciphertext source: iterative pgp_sym_encrypt('secret', 'key') -- until the random prefix gave rise to that property. diff --git a/contrib/pgcrypto/expected/pgp-pubkey-session.out b/contrib/pgcrypto/expected/pgp-pubkey-session.out new file mode 100644 index 00000000000..f724d98eb24 --- /dev/null +++ b/contrib/pgcrypto/expected/pgp-pubkey-session.out @@ -0,0 +1,47 @@ +-- Test for overflow with session key at decrypt. 
+-- Data automatically generated by scripts/pgp_session_data.py. +-- See this file for details explaining how this data is generated. +SELECT pgp_pub_decrypt_bytea( +'\xc1c04c030000000000000000020800a46f5b9b1905b49457a6485474f71ed9b46c2527e1 +da08e1f7871e12c3d38828f2076b984a595bf60f616599ca5729d547de06a258bfbbcd30 +94a321e4668cd43010f0ca8ecf931e5d39bda1152c50c367b11c723f270729245d3ebdbd +0694d320c5a5aa6a405fb45182acb3d7973cbce398e0c5060af7603cfd9ed186ebadd616 +3b50ae42bea5f6d14dda24e6d4687b434c175084515d562e896742b0ba9a1c87d5642e10 +a5550379c71cc490a052ada483b5d96526c0a600fc51755052aa77fdf72f7b4989b920e7 +b90f4b30787a46482670d5caecc7a515a926055ad5509d135702ce51a0e4c1033f2d939d +8f0075ec3428e17310da37d3d2d7ad1ce99adcc91cd446c366c402ae1ee38250343a7fcc +0f8bc28020e603d7a4795ef0dcc1c04c030000000000000000020800a46f5b9b1905b494 +57a6485474f71ed9b46c2527e1da08e1f7871e12c3d38828f2076b984a595bf60f616599 +ca5729d547de06a258bfbbcd3094a321e4668cd43010f0ca8ecf931e5d39bda1152c50c3 +67b11c723f270729245d3ebdbd0694d320c5a5aa6a405fb45182acb3d7973cbce398e0c5 +060af7603cfd9ed186ebadd6163b50ae42bea5f6d14dda24e6d4687b434c175084515d56 +2e896742b0ba9a1c87d5642e10a5550379c71cc490a052ada483b5d96526c0a600fc5175 +5052aa77fdf72f7b4989b920e7b90f4b30787a46482670d5caecc7a515a926055ad5509d +135702ce51a0e4c1033f2d939d8f0075ec3428e17310da37d3d2d7ad1ce99adc'::bytea, +'\xc7c2d8046965d657020800eef8bf1515adb1a3ee7825f75c668ea8dd3e3f9d13e958f6ad +9c55adc0c931a4bb00abe1d52cf7bb0c95d537949d277a5292ede375c6b2a67a3bf7d19f +f975bb7e7be35c2d8300dacba360a0163567372f7dc24000cc7cb6170bedc8f3b1f98c12 +07a6cb4de870a4bc61319b139dcc0e20c368fd68f8fd346d2c0b69c5aed560504e2ec6f1 +23086fe3c5540dc4dd155c0c67257c4ada862f90fe172ace344089da8135e92aca5c2709 +f1c1bc521798bb8c0365841496e709bd184132d387e0c9d5f26dc00fd06c3a76ef66a75c +138285038684707a847b7bd33cfbefbf1d336be954a8048946af97a66352adef8e8b5ae4 +c4748c6f2510265b7a8267bc370dbb00110100010007ff7e72d4f95d2d39901ac12ca5c5 
+18e767e719e72340c3fab51c8c5ab1c40f31db8eaffe43533fa61e2dbca2c3f4396c0847 +e5434756acbb1f68128f4136bb135710c89137d74538908dac77967de9e821c559700dd9 +de5a2727eec1f5d12d5d74869dd1de45ed369d94a8814d23861dd163f8c27744b26b98f0 +239c2e6dd1e3493b8cc976fdc8f9a5e250f715aa4c3d7d5f237f8ee15d242e8fa941d1a0 +ed9550ab632d992a97518d142802cb0a97b251319bf5742db8d9d8cbaa06cdfba2d75bc9 +9d77a51ff20bd5ba7f15d7af6e85b904de2855d19af08d45f39deb85403033c69c767a8e +74a343b1d6c8911d34ea441ac3850e57808ed3d885835cbe6c79d10400ef16256f3d5c4c +3341516a2d2aa888df81b603f48a27f3666b40f992a857c1d11ff639cd764a9b42d5a1f8 +58b4aeee36b85508bb5e8b91ef88a7737770b330224479d9b44eae8c631bc43628b69549 +507c0a1af0be0dd7696015abea722b571eb35eefc4ab95595378ec12814727443f625fcd +183bb9b3bccf53b54dd0e5e7a50400ffe08537b2d4e6074e4a1727b658cfccdec8962302 +25e300c05690de45f7065c3d40d86f544a64d51a3e94424f9851a16d1322ebdb41fa8a45 +3131f3e2dc94e858e6396722643df382680f815e53bcdcde5da622f50530a83b217f1103 +cdd6e5e9babe1e415bbff28d44bd18c95f43bbd04afeb2a2a99af38a571c7540de21df03 +ff62c0a33d9143dd3f639893f47732c11c5a12c6052d1935f4d507b7ae1f76ab0e9a69b8 +7305a7f7c19bd509daf4903bff614bc26d118f03e461469c72c12d3a2bb4f78e4d342ce8 +487723649a01ed2b9eb11c662134502c098d55dfcd361939d8370873422c3da75a515a75 +9ffedfe7df44fb3c20f81650801a30d43b5c90b98b3eee'::bytea); +ERROR: Public key too big diff --git a/contrib/pgcrypto/meson.build b/contrib/pgcrypto/meson.build index c9c48f16f90..4f255c8cb05 100644 --- a/contrib/pgcrypto/meson.build +++ b/contrib/pgcrypto/meson.build @@ -52,6 +52,7 @@ pgcrypto_regress = [ 'pgp-encrypt-md5', 'pgp-pubkey-decrypt', 'pgp-pubkey-encrypt', + 'pgp-pubkey-session', 'pgp-info', 'crypt-shacrypt' ] diff --git a/contrib/pgcrypto/pgp-pgsql.c b/contrib/pgcrypto/pgp-pgsql.c index 3e47b9364ab..d3e7895b0d9 100644 --- a/contrib/pgcrypto/pgp-pgsql.c +++ b/contrib/pgcrypto/pgp-pgsql.c @@ -631,6 +631,7 @@ pgp_sym_decrypt_text(PG_FUNCTION_ARGS) arg = PG_GETARG_TEXT_PP(2); res = decrypt_internal(0, 1, data, key, NULL, 
arg); + pg_verifymbstr(VARDATA_ANY(res), VARSIZE_ANY_EXHDR(res), false); PG_FREE_IF_COPY(data, 0); PG_FREE_IF_COPY(key, 1); @@ -732,6 +733,7 @@ pgp_pub_decrypt_text(PG_FUNCTION_ARGS) arg = PG_GETARG_TEXT_PP(3); res = decrypt_internal(1, 1, data, key, psw, arg); + pg_verifymbstr(VARDATA_ANY(res), VARSIZE_ANY_EXHDR(res), false); PG_FREE_IF_COPY(data, 0); PG_FREE_IF_COPY(key, 1); diff --git a/contrib/pgcrypto/pgp-pubdec.c b/contrib/pgcrypto/pgp-pubdec.c index a0a5738a40e..2a13aa3e6ad 100644 --- a/contrib/pgcrypto/pgp-pubdec.c +++ b/contrib/pgcrypto/pgp-pubdec.c @@ -157,6 +157,7 @@ pgp_parse_pubenc_sesskey(PGP_Context *ctx, PullFilter *pkt) uint8 *msg; int msglen; PGP_MPI *m; + unsigned sess_key_len; pk = ctx->pub_key; if (pk == NULL) @@ -220,11 +221,19 @@ pgp_parse_pubenc_sesskey(PGP_Context *ctx, PullFilter *pkt) if (res < 0) goto out; + sess_key_len = msglen - 3; + if (sess_key_len > PGP_MAX_KEY) + { + px_debug("incorrect session key length=%u", sess_key_len); + res = PXE_PGP_KEY_TOO_BIG; + goto out; + } + /* * got sesskey */ ctx->cipher_algo = *msg; - ctx->sess_key_len = msglen - 3; + ctx->sess_key_len = sess_key_len; memcpy(ctx->sess_key, msg + 1, ctx->sess_key_len); out: diff --git a/contrib/pgcrypto/px.c b/contrib/pgcrypto/px.c index 4d668d4e496..d9bf1aae81e 100644 --- a/contrib/pgcrypto/px.c +++ b/contrib/pgcrypto/px.c @@ -65,6 +65,7 @@ static const struct error_desc px_err_list[] = { {PXE_PGP_UNEXPECTED_PKT, "Unexpected packet in key data"}, {PXE_PGP_MATH_FAILED, "Math operation failed"}, {PXE_PGP_SHORT_ELGAMAL_KEY, "Elgamal keys must be at least 1024 bits long"}, + {PXE_PGP_KEY_TOO_BIG, "Public key too big"}, {PXE_PGP_UNKNOWN_PUBALGO, "Unknown public-key encryption algorithm"}, {PXE_PGP_WRONG_KEY, "Wrong key"}, {PXE_PGP_MULTIPLE_KEYS, diff --git a/contrib/pgcrypto/px.h b/contrib/pgcrypto/px.h index 4b81fceab8e..a09533a3582 100644 --- a/contrib/pgcrypto/px.h +++ b/contrib/pgcrypto/px.h @@ -75,7 +75,7 @@ /* -108 is unused */ #define PXE_PGP_MATH_FAILED -109 
#define PXE_PGP_SHORT_ELGAMAL_KEY -110 -/* -111 is unused */ +#define PXE_PGP_KEY_TOO_BIG -111 #define PXE_PGP_UNKNOWN_PUBALGO -112 #define PXE_PGP_WRONG_KEY -113 #define PXE_PGP_MULTIPLE_KEYS -114 diff --git a/contrib/pgcrypto/scripts/pgp_session_data.py b/contrib/pgcrypto/scripts/pgp_session_data.py new file mode 100644 index 00000000000..999350bb2bc --- /dev/null +++ b/contrib/pgcrypto/scripts/pgp_session_data.py @@ -0,0 +1,491 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +# +# Generate PGP data to check the session key length of the input data provided +# to pgp_pub_decrypt_bytea(). +# +# First, the crafted data is generated from valid RSA data, freshly generated +# by this script each time it is run, see generate_rsa_keypair(). +# Second, the crafted PGP data is built, see build_message_data() and +# build_key_data(). Finally, the resulting SQL script is generated. +# +# This script generates in stdout the SQL file that is used in the regression +# tests of pgcrypto. The following command can be used to regenerate the file +# which should never be manually manipulated: +# python3 scripts/pgp_session_data.py > sql/pgp-pubkey-session.sql + +import os +import re +import struct +import secrets +import sys +import time + +# pwn for binary manipulation (p32, p64) +from pwn import * + +# Cryptographic libraries, to craft the PGP data. +from Crypto.Cipher import AES +from Crypto.PublicKey import RSA +from Crypto.Util.number import inverse + +# AES key used for session key encryption (16 bytes for AES-128) +AES_KEY = b'\x01' * 16 + +def generate_rsa_keypair(key_size: int = 2048) -> dict: + """ + Generate a fresh RSA key pair. 
+ + The generated key includes all components needed for PGP operations: + - n: public modulus (p * q) + - e: public exponent (typically 65537) + - d: private exponent (e^-1 mod phi(n)) + - p, q: prime factors of n + - u: coefficient (p^-1 mod q) for CRT optimization + + The caller can pass the wanted key size in input, for a default of 2048 + bytes. This function returns the RSA key components, after performing + some validation on them. + """ + + start_time = time.time() + + # Generate RSA key + key = RSA.generate(key_size) + + # Extract all key components + rsa_components = { + 'n': key.n, # Public modulus (p * q) + 'e': key.e, # Public exponent (typically 65537) + 'd': key.d, # Private exponent (e^-1 mod phi(n)) + 'p': key.p, # First prime factor + 'q': key.q, # Second prime factor + 'u': inverse(key.p, key.q) # Coefficient for CRT: p^-1 mod q + } + + # Validate key components for correctness + validate_rsa_key(rsa_components) + + return rsa_components + +def validate_rsa_key(rsa: dict) -> None: + """ + Validate a generated RSA key. + + This function performs basic validation to ensure the RSA key is properly + constructed and all components are consistent, at least mathematically. + + Validations performed: + 1. n = p * q (modulus is product of primes) + 2. gcd(e, phi(n)) = 1 (public exponent is coprime to phi(n)) + 3. (d * e) mod(phi(n)) = 1 (private exponent is multiplicative inverse) + 4. 
(u * p) (mod q) = 1 (coefficient is correct for CRT) + """ + + n, e, d, p, q, u = rsa['n'], rsa['e'], rsa['d'], rsa['p'], rsa['q'], rsa['u'] + + # Check that n = p * q + if n != p * q: + raise ValueError("RSA validation failed: n <> p * q") + + # Check that p and q are different + if p == q: + raise ValueError("RSA validation failed: p = q (not allowed)") + + # Calculate phi(n) = (p-1)(q-1) + phi_n = (p - 1) * (q - 1) + + # Check that gcd(e, phi(n)) = 1 + def gcd(a, b): + while b: + a, b = b, a % b + return a + + if gcd(e, phi_n) != 1: + raise ValueError("RSA validation failed: gcd(e, phi(n)) <> 1") + + # Check that (d * e) mod(phi(n)) = 1 + if (d * e) % phi_n != 1: + raise ValueError("RSA validation failed: d * e <> 1 (mod phi(n))") + + # Check that (u * p) (mod q) = 1 + if (u * p) % q != 1: + raise ValueError("RSA validation failed: u * p <> 1 (mod q)") + +def mpi_encode(x: int) -> bytes: + """ + Encode an integer as an OpenPGP Multi-Precision Integer (MPI). + + Format (RFC 4880, Section 3.2): + - 2 bytes: bit length of the integer (big-endian) + - N bytes: the integer in big-endian format + + This is used to encode RSA key components (n, e, d, p, q, u) in PGP + packets. + + The integer to encode is given in input, returning an MPI-encoded + integer. + + For example: + mpi_encode(65537) -> b'\x00\x11\x01\x00\x01' + (17 bits, value 0x010001) + """ + if x < 0: + raise ValueError("MPI cannot encode negative integers") + + if x == 0: + # Special case: zero has 0 bits and empty magnitude + bits = 0 + mag = b"" + else: + # Calculate bit length and convert to bytes + bits = x.bit_length() + mag = x.to_bytes((bits + 7) // 8, 'big') + + # Pack: 2-byte bit length + magnitude bytes + return struct.pack('>H', bits) + mag + +def new_packet(tag: int, payload: bytes) -> bytes: + """ + Create a new OpenPGP packet with a proper header. 
+ + OpenPGP packet format (RFC 4880, Section 4.2): + - New packet format: 0xC0 | tag + - Length encoding depends on payload size: + * 0-191: single byte + * 192-8383: two bytes (192 + ((length - 192) >> 8), (length - 192) & 0xFF) + * 8384+: five bytes (0xFF + 4-byte big-endian length) + + The packet is built from a "tag" (1-63) and some "payload" data. The + result generated is a complete OpenPGP packet. + + For example: + new_packet(1, b'data') -> b'\xC1\x04data' + (Tag 1, length 4, payload 'data') + """ + # New packet format: set bit 7 and 6, clear bit 5, tag in bits 0-5 + first = 0xC0 | (tag & 0x3F) + ln = len(payload) + + # Encode length according to OpenPGP specification + if ln <= 191: + # Single byte length for small packets + llen = bytes([ln]) + elif ln <= 8383: + # Two-byte length for medium packets + ln2 = ln - 192 + llen = bytes([192 + (ln2 >> 8), ln2 & 0xFF]) + else: + # Five-byte length for large packets + llen = bytes([255]) + struct.pack('>I', ln) + + return bytes([first]) + llen + payload + +def build_key_data(rsa: dict) -> bytes: + """ + Build the key data, containing an RSA private key. + + The RSA contents should have been generated previously. + + Format (see RFC 4880, Section 5.5.3): + - 1 byte: version (4) + - 4 bytes: creation time (current Unix timestamp) + - 1 byte: public key algorithm (2 = RSA encrypt) + - MPI: RSA public modulus n + - MPI: RSA public exponent e + - 1 byte: string-to-key usage (0 = no encryption) + - MPI: RSA private exponent d + - MPI: RSA prime p + - MPI: RSA prime q + - MPI: RSA coefficient u = p^-1 mod q + - 2 bytes: checksum of private key material + + This function takes a set of RSA key components in input (n, e, d, p, q, u) + and returns a secret key packet. 
+ """ + + # Public key portion + ver = bytes([4]) # Version 4 key + ctime = struct.pack('>I', int(time.time())) # Current Unix timestamp + algo = bytes([2]) # RSA encrypt algorithm + n_mpi = mpi_encode(rsa['n']) # Public modulus + e_mpi = mpi_encode(rsa['e']) # Public exponent + pub = ver + ctime + algo + n_mpi + e_mpi + + # Private key portion + hide_type = bytes([0]) # No string-to-key encryption + d_mpi = mpi_encode(rsa['d']) # Private exponent + p_mpi = mpi_encode(rsa['p']) # Prime p + q_mpi = mpi_encode(rsa['q']) # Prime q + u_mpi = mpi_encode(rsa['u']) # Coefficient u = p^-1 mod q + + # Calculate checksum of private key material (simple sum mod 65536) + private_data = d_mpi + p_mpi + q_mpi + u_mpi + cksum = sum(private_data) & 0xFFFF + + secret = hide_type + private_data + struct.pack('>H', cksum) + payload = pub + secret + + return new_packet(7, payload) + +def pgp_cfb_encrypt_resync(key, plaintext): + """ + Implement OpenPGP CFB mode with resync. + + OpenPGP CFB mode is a variant of standard CFB with a resync operation + after the first two blocks. + + Algorithm (RFC 4880, Section 13.9): + 1. Block 1: FR=zeros, encrypt full block_size bytes + 2. Block 2: FR=block1, encrypt only 2 bytes + 3. Resync: FR = block1[2:] + block2 + 4. 
Remaining blocks: standard CFB mode + + This function uses the following arguments: + - key: AES encryption key (16 bytes for AES-128) + - plaintext: Data to encrypt + """ + block_size = 16 # AES block size + cipher = AES.new(key[:16], AES.MODE_ECB) # Use ECB for manual CFB + ciphertext = b'' + + # Block 1: FR=zeros, encrypt full 16 bytes + FR = b'\x00' * block_size + FRE = cipher.encrypt(FR) # Encrypt the feedback register + block1 = bytes(a ^ b for a, b in zip(FRE, plaintext[0:16])) + ciphertext += block1 + + # Block 2: FR=block1, encrypt only 2 bytes + FR = block1 + FRE = cipher.encrypt(FR) + block2 = bytes(a ^ b for a, b in zip(FRE[0:2], plaintext[16:18])) + ciphertext += block2 + + # Resync: FR = block1[2:16] + block2[0:2] + # This is the key difference from standard CFB mode + FR = block1[2:] + block2 + + # Block 3+: Continue with standard CFB mode + pos = 18 + while pos < len(plaintext): + FRE = cipher.encrypt(FR) + chunk_len = min(block_size, len(plaintext) - pos) + chunk = plaintext[pos:pos+chunk_len] + enc_chunk = bytes(a ^ b for a, b in zip(FRE[:chunk_len], chunk)) + ciphertext += enc_chunk + + # Update feedback register for next iteration + if chunk_len == block_size: + FR = enc_chunk + else: + # Partial block: pad with old FR bytes + FR = enc_chunk + FR[chunk_len:] + pos += chunk_len + + return ciphertext + +def build_literal_data_packet(data: bytes) -> bytes: + """ + Build a literal data packet containing a message. + + Format (RFC 4880, Section 5.9): + - 1 byte: data format ('b' = binary, 't' = text, 'u' = UTF-8 text) + - 1 byte: filename length (0 = no filename) + - N bytes: filename (empty in this case) + - 4 bytes: date (current Unix timestamp) + - M bytes: literal data + + The data used to build the packet is given in input, with the generated + result returned. 
+ """ + body = bytes([ + ord('b'), # Binary data format + 0, # Filename length (0 = no filename) + ]) + struct.pack('>I', int(time.time())) + data # Current timestamp + data + + return new_packet(11, body) + +def build_symenc_data_packet(sess_key: bytes, cipher_algo: int, payload: bytes) -> bytes: + """ + Build a symmetrically-encrypted data packet using AES-128-CFB. + + This packet contains encrypted data using the session key. The format + includes a random prefix, for security (see RFC 4880, Section 5.7). + + Packet structure: + - Random prefix (block_size bytes) + - Prefix repeat (last 2 bytes of prefix repeated) + - Encrypted literal data packet + + This function uses the following set of arguments: + - sess_key: Session key for encryption + - cipher_algo: Cipher algorithm identifier (7 = AES-128) + - payload: Data to encrypt (wrapped in literal data packet) + """ + block_size = 16 # AES-128 block size + key = sess_key[:16] # Use first 16 bytes for AES-128 + + # Create random prefix + repeat last 2 bytes (total 18 bytes) + # This is required by OpenPGP for integrity checking + prefix_random = secrets.token_bytes(block_size) + prefix = prefix_random + prefix_random[-2:] # 18 bytes total + + # Wrap payload in literal data packet + literal_pkt = build_literal_data_packet(payload) + + # Plaintext = prefix + literal data packet + plaintext = prefix + literal_pkt + + # Encrypt using OpenPGP CFB mode with resync + ciphertext = pgp_cfb_encrypt_resync(key, plaintext) + + return new_packet(9, ciphertext) + +def build_tag1_packet(rsa: dict, sess_key: bytes) -> bytes: + """ + Build a public-key encrypted key. + + This is a very important function, as it is able to create the packet + triggering the overflow check. This function can also be used to create + "legit" packet data. 
+ + Format (RFC 4880, Section 5.1): + - 1 byte: version (3) + - 8 bytes: key ID (0 = any key accepted) + - 1 byte: public key algorithm (2 = RSA encrypt) + - MPI: RSA-encrypted session key + + This uses in arguments the generated RSA key pair, and the session key + to encrypt. The latter is manipulated to trigger the overflow. + + This function returns a complete packet encrypted by a session key. + """ + + # Calculate RSA modulus size in bytes + n_bytes = (rsa['n'].bit_length() + 7) // 8 + + # Session key message format: + # - 1 byte: symmetric cipher algorithm (7 = AES-128) + # - N bytes: session key + # - 2 bytes: checksum (simple sum of session key bytes) + algo_byte = bytes([7]) # AES-128 algorithm identifier + cksum = sum(sess_key) & 0xFFFF # 16-bit checksum + M = algo_byte + sess_key + struct.pack('>H', cksum) + + # PKCS#1 v1.5 padding construction + # Format: 0x02 || PS || 0x00 || M + # Total padded message must be exactly n_bytes long. + total_len = n_bytes # Total length must equal modulus size in bytes + ps_len = total_len - len(M) - 2 # Subtract 2 for 0x02 and 0x00 bytes + + if ps_len < 8: + raise ValueError(f"Padding string too short ({ps_len} bytes); need at least 8 bytes. " + f"Message length: {len(M)}, Modulus size: {n_bytes} bytes") + + # Create padding string with *ALL* bytes being 0xFF (no zero separator!) 
+    PS = bytes([0xFF]) * ps_len
+
+    # Construct the complete padded message
+    # Normal PKCS#1 v1.5 padding: 0x02 || PS || 0x00 || M
+    padded = bytes([0x02]) + PS + bytes([0x00]) + M
+
+    # Verify padding construction
+    if len(padded) != n_bytes:
+        raise ValueError(f"Padded message length ({len(padded)}) doesn't match RSA modulus size ({n_bytes})")
+
+    # Convert padded message to integer and encrypt with RSA
+    m_int = int.from_bytes(padded, 'big')
+
+    # Ensure message is smaller than modulus (required for RSA)
+    if m_int >= rsa['n']:
+        raise ValueError("Padded message is larger than RSA modulus")
+
+    # RSA encryption: c = m^e mod n
+    c_int = pow(m_int, rsa['e'], rsa['n'])
+
+    # Encode encrypted result as MPI
+    c_mpi = mpi_encode(c_int)
+
+    # Build complete packet
+    ver = bytes([3])      # Version 3 packet
+    key_id = b"\x00" * 8  # Key ID (0 = any key accepted)
+    algo = bytes([2])     # RSA encrypt algorithm
+    payload = ver + key_id + algo + c_mpi
+
+    return new_packet(1, payload)
+
+def build_message_data(rsa: dict) -> bytes:
+    """
+    This function creates a crafted message, with a long session key
+    length.
+
+    This takes in input the RSA key components generated previously,
+    returning a concatenated set of PGP packets crafted for the purpose
+    of this test.
+    """
+
+    # Base prefix for session key (AES key + padding + size).
+    # Note that the crafted size is the important part for this test.
+    prefix = AES_KEY + b"\x00" * 16 + p32(0x10)
+
+    # Build encrypted data packet, legit.
+    sedata = build_symenc_data_packet(AES_KEY, cipher_algo=7, payload=b"\x0a\x00")
+
+    # Build multiple packets
+    packets = [
+        # First packet, legit.
+        build_tag1_packet(rsa, prefix),
+
+        # Encrypted data packet, legit.
+        sedata,
+
+        # Second packet: information payload.
+        #
+        # This packet contains a longer-crafted session key, able to trigger
+        # the overflow check in pgcrypto. This is the critical part, and
+        # you are right to pay a lot of attention here if you are
+        # reading this code.
+ build_tag1_packet(rsa, prefix) + ] + + return b"".join(packets) + +def main(): + # Default key size. + # This number can be set to a higher number if wanted, like 4096. We + # just do not need to do that here. + key_size = 2048 + + # Generate fresh RSA key pair + rsa = generate_rsa_keypair(key_size) + + # Generate the message data. + print("### Building message data", file=sys.stderr) + message_data = build_message_data(rsa) + + # Build the key containing the RSA private key + print("### Building key data", file=sys.stderr) + key_data = build_key_data(rsa) + + # Convert to hexadecimal, for the bytea used in the SQL file. + message_data = message_data.hex() + key_data = key_data.hex() + + # Split each value into lines of 72 characters, for readability. + message_data = re.sub("(.{72})", "\\1\n", message_data, 0, re.DOTALL) + key_data = re.sub("(.{72})", "\\1\n", key_data, 0, re.DOTALL) + + # Get the script filename for documentation + file_basename = os.path.basename(__file__) + + # Output the SQL test case + print(f'''-- Test for overflow with session key at decrypt. +-- Data automatically generated by scripts/{file_basename}. +-- See this file for details explaining how this data is generated. +SELECT pgp_pub_decrypt_bytea( +'\\x{message_data}'::bytea, +'\\x{key_data}'::bytea);''', + file=sys.stdout) + +if __name__ == "__main__": + main() diff --git a/contrib/pgcrypto/sql/pgp-decrypt.sql b/contrib/pgcrypto/sql/pgp-decrypt.sql index 49a0267bbcb..b499bf757b0 100644 --- a/contrib/pgcrypto/sql/pgp-decrypt.sql +++ b/contrib/pgcrypto/sql/pgp-decrypt.sql @@ -228,7 +228,7 @@ SaV9L04ky1qECNDx3XjnoKLC+H7IOQ== -----END PGP MESSAGE----- '), '0123456789abcdefghij'), 'sha1'); -select digest(pgp_sym_decrypt(dearmor(' +select digest(pgp_sym_decrypt_bytea(dearmor(' -----BEGIN PGP MESSAGE----- Comment: dat3.aes.sha1.mdc.s2k3.z0 @@ -282,6 +282,27 @@ VsxxqLSPzNLAeIspJk5G -- Routine text/binary mismatch. 
select pgp_sym_decrypt(pgp_sym_encrypt_bytea('P', 'key'), 'key', 'debug=1'); +-- NUL byte in text decrypt. Ciphertext source: +-- printf 'a\x00\xc' | gpg --homedir /nonexistent \ +-- --personal-compress-preferences uncompressed --textmode \ +-- --personal-cipher-preferences aes --no-emit-version --batch \ +-- --symmetric --passphrase key --armor +do $$ +begin + perform pgp_sym_decrypt(dearmor(' +-----BEGIN PGP MESSAGE----- + +jA0EBwMCXLc8pozB10Fg0jQBVUID59TLvWutJp0j6eh9ZgjqIRzdYaIymFB8y4XH +vu0YlJP5D5BX7yqZ+Pry7TlDmiFO +=rV7z +-----END PGP MESSAGE----- +'), 'key', 'debug=1'); +exception when others then + raise '%', + regexp_replace(sqlerrm, 'encoding "[^"]*"', 'encoding [REDACTED]'); +end +$$; + -- Decryption with a certain incorrect key yields an apparent BZip2-compressed -- plaintext. Ciphertext source: iterative pgp_sym_encrypt('secret', 'key') -- until the random prefix gave rise to that property. diff --git a/contrib/pgcrypto/sql/pgp-pubkey-session.sql b/contrib/pgcrypto/sql/pgp-pubkey-session.sql new file mode 100644 index 00000000000..51792f1f4d8 --- /dev/null +++ b/contrib/pgcrypto/sql/pgp-pubkey-session.sql @@ -0,0 +1,46 @@ +-- Test for overflow with session key at decrypt. +-- Data automatically generated by scripts/pgp_session_data.py. +-- See this file for details explaining how this data is generated. 
+SELECT pgp_pub_decrypt_bytea( +'\xc1c04c030000000000000000020800a46f5b9b1905b49457a6485474f71ed9b46c2527e1 +da08e1f7871e12c3d38828f2076b984a595bf60f616599ca5729d547de06a258bfbbcd30 +94a321e4668cd43010f0ca8ecf931e5d39bda1152c50c367b11c723f270729245d3ebdbd +0694d320c5a5aa6a405fb45182acb3d7973cbce398e0c5060af7603cfd9ed186ebadd616 +3b50ae42bea5f6d14dda24e6d4687b434c175084515d562e896742b0ba9a1c87d5642e10 +a5550379c71cc490a052ada483b5d96526c0a600fc51755052aa77fdf72f7b4989b920e7 +b90f4b30787a46482670d5caecc7a515a926055ad5509d135702ce51a0e4c1033f2d939d +8f0075ec3428e17310da37d3d2d7ad1ce99adcc91cd446c366c402ae1ee38250343a7fcc +0f8bc28020e603d7a4795ef0dcc1c04c030000000000000000020800a46f5b9b1905b494 +57a6485474f71ed9b46c2527e1da08e1f7871e12c3d38828f2076b984a595bf60f616599 +ca5729d547de06a258bfbbcd3094a321e4668cd43010f0ca8ecf931e5d39bda1152c50c3 +67b11c723f270729245d3ebdbd0694d320c5a5aa6a405fb45182acb3d7973cbce398e0c5 +060af7603cfd9ed186ebadd6163b50ae42bea5f6d14dda24e6d4687b434c175084515d56 +2e896742b0ba9a1c87d5642e10a5550379c71cc490a052ada483b5d96526c0a600fc5175 +5052aa77fdf72f7b4989b920e7b90f4b30787a46482670d5caecc7a515a926055ad5509d +135702ce51a0e4c1033f2d939d8f0075ec3428e17310da37d3d2d7ad1ce99adc'::bytea, +'\xc7c2d8046965d657020800eef8bf1515adb1a3ee7825f75c668ea8dd3e3f9d13e958f6ad +9c55adc0c931a4bb00abe1d52cf7bb0c95d537949d277a5292ede375c6b2a67a3bf7d19f +f975bb7e7be35c2d8300dacba360a0163567372f7dc24000cc7cb6170bedc8f3b1f98c12 +07a6cb4de870a4bc61319b139dcc0e20c368fd68f8fd346d2c0b69c5aed560504e2ec6f1 +23086fe3c5540dc4dd155c0c67257c4ada862f90fe172ace344089da8135e92aca5c2709 +f1c1bc521798bb8c0365841496e709bd184132d387e0c9d5f26dc00fd06c3a76ef66a75c +138285038684707a847b7bd33cfbefbf1d336be954a8048946af97a66352adef8e8b5ae4 +c4748c6f2510265b7a8267bc370dbb00110100010007ff7e72d4f95d2d39901ac12ca5c5 +18e767e719e72340c3fab51c8c5ab1c40f31db8eaffe43533fa61e2dbca2c3f4396c0847 +e5434756acbb1f68128f4136bb135710c89137d74538908dac77967de9e821c559700dd9 
+de5a2727eec1f5d12d5d74869dd1de45ed369d94a8814d23861dd163f8c27744b26b98f0 +239c2e6dd1e3493b8cc976fdc8f9a5e250f715aa4c3d7d5f237f8ee15d242e8fa941d1a0 +ed9550ab632d992a97518d142802cb0a97b251319bf5742db8d9d8cbaa06cdfba2d75bc9 +9d77a51ff20bd5ba7f15d7af6e85b904de2855d19af08d45f39deb85403033c69c767a8e +74a343b1d6c8911d34ea441ac3850e57808ed3d885835cbe6c79d10400ef16256f3d5c4c +3341516a2d2aa888df81b603f48a27f3666b40f992a857c1d11ff639cd764a9b42d5a1f8 +58b4aeee36b85508bb5e8b91ef88a7737770b330224479d9b44eae8c631bc43628b69549 +507c0a1af0be0dd7696015abea722b571eb35eefc4ab95595378ec12814727443f625fcd +183bb9b3bccf53b54dd0e5e7a50400ffe08537b2d4e6074e4a1727b658cfccdec8962302 +25e300c05690de45f7065c3d40d86f544a64d51a3e94424f9851a16d1322ebdb41fa8a45 +3131f3e2dc94e858e6396722643df382680f815e53bcdcde5da622f50530a83b217f1103 +cdd6e5e9babe1e415bbff28d44bd18c95f43bbd04afeb2a2a99af38a571c7540de21df03 +ff62c0a33d9143dd3f639893f47732c11c5a12c6052d1935f4d507b7ae1f76ab0e9a69b8 +7305a7f7c19bd509daf4903bff614bc26d118f03e461469c72c12d3a2bb4f78e4d342ce8 +487723649a01ed2b9eb11c662134502c098d55dfcd361939d8370873422c3da75a515a75 +9ffedfe7df44fb3c20f81650801a30d43b5c90b98b3eee'::bytea); diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out index 6066510c7c0..7cad5e67d09 100644 --- a/contrib/postgres_fdw/expected/postgres_fdw.out +++ b/contrib/postgres_fdw/expected/postgres_fdw.out @@ -698,12 +698,12 @@ EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c1 = -c1; -- Op Remote SQL: SELECT "C 1", c2, c3, c4, c5, c6, c7, c8 FROM "S 1"."T 1" WHERE (("C 1" = (- "C 1"))) (3 rows) -EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE (c3 IS NOT NULL) IS DISTINCT FROM (c3 IS NOT NULL); -- DistinctExpr - QUERY PLAN --------------------------------------------------------------------------------------------------------------------------------------- +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c3 IS DISTINCT FROM c3; -- DistinctExpr + 
QUERY PLAN +---------------------------------------------------------------------------------------------------------- Foreign Scan on public.ft1 t1 Output: c1, c2, c3, c4, c5, c6, c7, c8 - Remote SQL: SELECT "C 1", c2, c3, c4, c5, c6, c7, c8 FROM "S 1"."T 1" WHERE (((c3 IS NOT NULL) IS DISTINCT FROM (c3 IS NOT NULL))) + Remote SQL: SELECT "C 1", c2, c3, c4, c5, c6, c7, c8 FROM "S 1"."T 1" WHERE ((c3 IS DISTINCT FROM c3)) (3 rows) EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c1 = ANY(ARRAY[c2, 1, c1 + 0]); -- ScalarArrayOpExpr diff --git a/contrib/postgres_fdw/sql/postgres_fdw.sql b/contrib/postgres_fdw/sql/postgres_fdw.sql index 4f7ab2ed0ac..eff25bd2baa 100644 --- a/contrib/postgres_fdw/sql/postgres_fdw.sql +++ b/contrib/postgres_fdw/sql/postgres_fdw.sql @@ -340,7 +340,7 @@ EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c3 IS NULL; -- Nu EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c3 IS NOT NULL; -- NullTest EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE round(abs(c1), 0) = 1; -- FuncExpr EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c1 = -c1; -- OpExpr(l) -EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE (c3 IS NOT NULL) IS DISTINCT FROM (c3 IS NOT NULL); -- DistinctExpr +EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c3 IS DISTINCT FROM c3; -- DistinctExpr EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c1 = ANY(ARRAY[c2, 1, c1 + 0]); -- ScalarArrayOpExpr EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c1 = (ARRAY[c1,c2,3])[1]; -- SubscriptingRef EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c6 = E'foo''s\\bar'; -- check special chars diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c index e25c8a5aa26..69b173e4498 100644 --- a/contrib/unaccent/unaccent.c +++ b/contrib/unaccent/unaccent.c @@ -156,7 +156,7 @@ initTrie(const char *filename) state = 0; for (ptr = line; *ptr; ptr += ptrlen) { - ptrlen = pg_mblen(ptr); + ptrlen = pg_mblen_cstr(ptr); /* 
ignore whitespace, but end src or trg */ if (isspace((unsigned char) *ptr)) { @@ -382,6 +382,7 @@ unaccent_lexize(PG_FUNCTION_ARGS) char *srcchar = (char *) PG_GETARG_POINTER(1); int32 len = PG_GETARG_INT32(2); char *srcstart = srcchar; + const char *srcend = srcstart + len; TSLexeme *res; StringInfoData buf; @@ -409,7 +410,7 @@ unaccent_lexize(PG_FUNCTION_ARGS) } else { - matchlen = pg_mblen(srcchar); + matchlen = pg_mblen_range(srcchar, srcend); if (buf.data != NULL) appendBinaryStringInfo(&buf, srcchar, matchlen); } diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 5560b95ee60..37342986969 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -2412,6 +2412,43 @@ include_dir 'conf.d' + + file_extend_method (enum) + + file_extend_method configuration parameter + + + + + Specifies the method used to extend data files during bulk operations + such as COPY. The first available option is used as + the default, depending on the operating system: + + + + posix_fallocate (Unix) uses the standard POSIX + interface for allocating disk space, but is missing on some systems. + If it is present but the underlying file system doesn't support it, + this option silently falls back to write_zeros. + Current versions of BTRFS are known to disable compression when + this option is used. + This is the default on systems that have the function. + + + + + write_zeros extends files by writing out blocks + of zero bytes. This is the default on systems that don't have the + function posix_fallocate. + + + + The write_zeros method is always used when data + files are extended by 8 blocks or fewer. 
+ + + + max_notify_queue_pages (integer) @@ -4722,45 +4759,6 @@ restore_command = 'copy "C:\\server\\archivedir\\%f" "%p"' # Windows - - synchronized_standby_slots (string) - - synchronized_standby_slots configuration parameter - - - - - A comma-separated list of streaming replication standby server slot names - that logical WAL sender processes will wait for. Logical WAL sender processes - will send decoded changes to plugins only after the specified replication - slots confirm receiving WAL. This guarantees that logical replication - failover slots do not consume changes until those changes are received - and flushed to corresponding physical standbys. If a - logical replication connection is meant to switch to a physical standby - after the standby is promoted, the physical replication slot for the - standby should be listed here. Note that logical replication will not - proceed if the slots specified in the - synchronized_standby_slots do not exist or are invalidated. - Additionally, the replication management functions - - pg_replication_slot_advance, - - pg_logical_slot_get_changes, and - - pg_logical_slot_peek_changes, - when used with logical failover slots, will block until all - physical slots specified in synchronized_standby_slots have - confirmed WAL receipt. - - - The standbys corresponding to the physical replication slots in - synchronized_standby_slots must configure - sync_replication_slots = true so they can receive - logical failover slot changes from the primary. - - - - @@ -4909,6 +4907,45 @@ ANY num_sync ( + synchronized_standby_slots (string) + + synchronized_standby_slots configuration parameter + + + + + A comma-separated list of streaming replication standby server slot names + that logical WAL sender processes will wait for. Logical WAL sender processes + will send decoded changes to plugins only after the specified replication + slots confirm receiving WAL. 
This guarantees that logical replication + failover slots do not consume changes until those changes are received + and flushed to corresponding physical standbys. If a + logical replication connection is meant to switch to a physical standby + after the standby is promoted, the physical replication slot for the + standby should be listed here. Note that logical replication will not + proceed if the slots specified in the + synchronized_standby_slots do not exist or are invalidated. + Additionally, the replication management functions + + pg_replication_slot_advance, + + pg_logical_slot_get_changes, and + + pg_logical_slot_peek_changes, + when used with logical failover slots, will block until all + physical slots specified in synchronized_standby_slots have + confirmed WAL receipt. + + + The standbys corresponding to the physical replication slots in + synchronized_standby_slots must configure + sync_replication_slots = true so they can receive + logical failover slot changes from the primary. + + @@ -7083,27 +7120,57 @@ local0.* /var/log/postgresql - log_min_messages (enum) + log_min_messages (string) log_min_messages configuration parameter - Controls which message - levels are written to the server log. - Valid values are DEBUG5, DEBUG4, - DEBUG3, DEBUG2, DEBUG1, - INFO, NOTICE, WARNING, - ERROR, LOG, FATAL, and - PANIC. Each level includes all the levels that - follow it. The later the level, the fewer messages are sent - to the log. The default is WARNING. Note that - LOG has a different rank here than in + Controls which + message levels + are written to the server log. The value is a comma-separated + list of zero or more + process type:level + entries and exactly one mandatory + level entry, + which becomes the default for process types not listed. + Valid process types are listed in the table below. 
+ + archiver + autovacuum + backend + bgworker + bgwriter + checkpointer + ioworker + postmaster + syslogger + slotsyncworker + startup + walreceiver + walsender + walsummarizer + walwriter + + Valid level values are DEBUG5, + DEBUG4, DEBUG3, DEBUG2, + DEBUG1, INFO, NOTICE, + WARNING, ERROR, LOG, + FATAL, and PANIC. Each level includes + all the levels that follow it. The later the level, the fewer messages are sent + to the log. The default is WARNING, which + applies that level to all process types. + Note that LOG has a different rank here than in . Only superusers and users with the appropriate SET privilege can change this setting. + + Example: To log walsender and autovacuum + at level DEBUG1 and everything else at ERROR, + set log_min_messages to error, walsender:debug1, autovacuum:debug1. + diff --git a/doc/src/sgml/contrib.sgml b/doc/src/sgml/contrib.sgml index 24b706b29ad..bdd4865f53f 100644 --- a/doc/src/sgml/contrib.sgml +++ b/doc/src/sgml/contrib.sgml @@ -156,6 +156,7 @@ CREATE EXTENSION extension_name; &pgfreespacemap; &pglogicalinspect; &pgoverexplain; + &pgplanadvice; &pgprewarm; &pgrowlocks; &pgstatstatements; diff --git a/doc/src/sgml/filelist.sgml b/doc/src/sgml/filelist.sgml index ac66fcbdb57..d90b4338d2a 100644 --- a/doc/src/sgml/filelist.sgml +++ b/doc/src/sgml/filelist.sgml @@ -149,6 +149,7 @@ + diff --git a/doc/src/sgml/oid2name.sgml b/doc/src/sgml/oid2name.sgml index 54cc9be2b82..9340d7376aa 100644 --- a/doc/src/sgml/oid2name.sgml +++ b/doc/src/sgml/oid2name.sgml @@ -118,7 +118,7 @@ display more information about each object shown: tablespace name, - schema name, and OID. + schema name, OID and path. 
@@ -299,10 +299,10 @@ From database "alvherre": $ # you can mix the options, and get more details with -x $ oid2name -d alvherre -t accounts -f 1155291 -x From database "alvherre": - Filenode Table Name Oid Schema Tablespace ------------------------------------------------------- - 155173 accounts 155173 public pg_default - 1155291 accounts_pkey 1155291 public pg_default + Filenode Table Name Oid Schema Tablespace Path +-------------------------------------------------------------------------- + 155173 accounts 155173 public pg_default base/17228/155173 + 1155291 accounts_pkey 1155291 public pg_default base/17228/1155291 $ # show disk space for every db object $ du [0-9]* | diff --git a/doc/src/sgml/pgplanadvice.sgml b/doc/src/sgml/pgplanadvice.sgml new file mode 100644 index 00000000000..a5f605b3f19 --- /dev/null +++ b/doc/src/sgml/pgplanadvice.sgml @@ -0,0 +1,969 @@ + + + + pg_plan_advice — help the planner get the right plan + + + pg_plan_advice + + + + The pg_plan_advice allows key planner decisions to be + described, reproduced, and altered using a special-purpose "plan advice" + mini-language. It is intended to allow stabilization of plan choices that + the user believes to be good, as well as experimentation with plans that + the planner believes to be non-optimal. + + + + Note that, since the planner often makes good decisions, overriding its + judgement can easily backfire. For example, if the distribution of the + underlying data changes, the planner normally has the option to adjust the + plan in an attempt to preserve good performance. If the plan advice prevents + this, a very poor plan may be chosen. It is important to use plan advice + only when the risks of constraining the planner's choices are outweighed by + the benefits. + + + + Getting Started + + + In order to use this module, the pg_plan_advice module + must be loaded. 
You can do this on a system-wide basis by adding + pg_plan_advice to + and restarting the + server, or by adding it to + and starting a new session, + or by loading it into an individual session using the + LOAD command. If you + wish to use the + collector interface, + you must also install the pg_plan_advice extension + in the database where you wish to use the collector. Use the command + CREATE EXTENSION pg_plan_advice to do this. If you do + not wish to use the collector interface, this step is not required. + + + + Once the pg_plan_advice module is loaded, + EXPLAIN will support + a PLAN_ADVICE option. You can use this option to see + a plan advice string for the chosen plan. For example: + + + +EXPLAIN (COSTS OFF, PLAN_ADVICE) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +------------------------------------ + Hash Join + Hash Cond: (f.dim_id = d.id) + -> Seq Scan on join_fact f + -> Hash + -> Seq Scan on join_dim d + Generated Plan Advice: + JOIN_ORDER(f d) + HASH_JOIN(d) + SEQ_SCAN(f d) + NO_GATHER(f d) + + + + In this example, the user has not specified any advice; instead, the + planner has been permitted to make whatever decisions it thinks best, and + those decisions are memorialized in the form of an advice string. + JOIN_ORDER(f d) means that f should + be the driving table, and the first table to which it should be joined is + d. HASH_JOIN(d) means that + d should appear on the inner side of a hash join. + SEQ_SCAN(f d) means that both f + and d should be accessed via a sequential scan. + NO_GATHER(f d) means that neither f + nor d should appear beneath a Gather + or Gather Merge node. For more details on the plan + advice mini-language, see the information on + advice targets and + advice tags, below. + + + + If you want to see the advice strings for a large number of queries, or + an entire workload, running EXPLAIN (PLAN_ADVICE) for + each one may not be convenient. 
In such situations, it can be more + convenient to use an + advice collector. + + + + Once you have an advice string for a query, you can use it to control how + that query is planned. You can do this by setting + pg_plan_advice.advice to the advice string you've + chosen. This can be an advice string that was generated by the system, + or one you've written yourself. One good way of creating your own advice + string is to take the string generated by the system and pick out just + those elements that you wish to enforce. In the example above, + pg_plan_advice emits advice for the join order, the + join method, the scan method, and the use of parallelism, but you might + only want to control the join order: + + + +SET pg_plan_advice.advice = 'JOIN_ORDER(f d)'; +EXPLAIN (COSTS OFF) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +------------------------------------ + Hash Join + Hash Cond: (f.dim_id = d.id) + -> Seq Scan on join_fact f + -> Hash + -> Seq Scan on join_dim d + Supplied Plan Advice: + JOIN_ORDER(f d) /* matched */ + + + + Since the PLAN_ADVICE option to + EXPLAIN was not specified, no advice string is generated + for the plan. However, the supplied plan advice is still shown so that + anyone looking at the EXPLAIN output knows that the + chosen plan was influenced by plan advice. If information about supplied + plan advice is not desired, it can be suppressed by configuring + pg_plan_advice.always_explain_supplied_advice = false. + For each piece of supplied advice, the output shows + advice feedback indicating + whether or not the advice was successfully applied to the query. In this + case, the feedback says /* matched */, which means that + f and d were found in the query and + that the resulting query plan conforms to the specified advice. + + + + + + How It Works + + + Plan advice is written imperatively; that is, it specifies what should be + done. 
However, at an implementation level,
+ pg_plan_advice works by telling the core planner what
+ should not be done. In other words, it operates by constraining the
+ planner's choices, not by replacing it. Therefore, no matter what advice
+ you provide, you will only ever get a plan that the core planner would have
+ considered for the query in question. If you attempt to force what you
+ believe to be the correct plan by supplying an advice string, and the
+ planner still fails to produce the desired plan, this means that either
+ there is a bug in your advice string, or the plan in question was not
+ considered viable by the core planner. This commonly happens for one of two
+ reasons. First, it might be that the planner believes the plan you're trying
+ to force would be semantically incorrect - that is, it would produce the
+ wrong results - and for that reason it wasn't considered. Second, it might
+ be that the planner rejected the plan you were hoping to generate on some
+ grounds other than cost. For example, given a very simple query such as
+ SELECT * FROM some_table, the query planner will
+ decide that the use of an index is worthless here before it performs any
+ costing calculations. You cannot force it to use an index for this query
+ even if you set enable_seqscan = false, and you can't
+ force it to use an index using plan advice, either.
+
+
+
+ Specifying plan advice should never cause planner failure. However, if you
+ specify plan advice that asks for something impossible, you may get a plan
+ where some plan nodes are flagged as Disabled: true in
+ the EXPLAIN output. In some cases, such plans will be
+ basically the same plan you would have gotten with no supplied advice at
+ all, but in other cases, they may be much worse.
For example: + + + +SET pg_plan_advice.advice = 'JOIN_ORDER(x f d)'; +EXPLAIN (COSTS OFF) + SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id; + QUERY PLAN +---------------------------------------------------- + Nested Loop + Disabled: true + -> Seq Scan on join_fact f + -> Index Scan using join_dim_pkey on join_dim d + Index Cond: (id = f.dim_id) + Supplied Plan Advice: + JOIN_ORDER(x f d) /* partially matched */ + + + + Because neither f nor d is the + first table in the JOIN_ORDER() specification, the + planner disables all direct joins between the two of them, thinking that + the join to x should happen first. Since planning isn't + allowed to fail, a disabled plan between the two tables is eventually + selected anyway, but here it's a Nested Loop rather than + the Hash Join that was chosen in the above example where + no advice was specified. There are several different ways that this kind + of thing can happen; when it does, the resulting plan is generally worse + than if no advice had been specified at all. Therefore, it is a good idea + to validate that the advice you specify applies to the query to which it + is applied and that the results are as expected. + + + + + + Advice Targets + + + An advice target uniquely identifies a particular + instance of a particular table involved in a particular query. In simple + cases, such as the examples shown above, the advice target is simply the + relation alias. However, a more complex syntax is required when subqueries + are used, when tables are partitioned, or when the same relation alias is + mentioned more than once in the same subquery (e.g., (foo JOIN bar + ON foo.a = bar.a) x JOIN foo ON x.b = foo.b). Any combination of + these three things can occur simultaneously: a relation could be mentioned + more than once, be partitioned, and be used inside of a subquery. 
+ + + + Because of this, the general syntax for a relation identifier is: + + + +alias_name#occurrence_number/partition_schema.partition_name@plan_name + + + + All components except for the alias_name are optional + and are included only when required. When a component is omitted, the + preceding punctuation must also be omitted. For the first occurrence of a + table within a given subquery, generated advice will omit the occurrence + number, but it is legal to write #1, if desired. The + partition schema and partition name are included only for children of + partitioned tables. In generated advice, pg_plan_advice + always includes both, but it is legal to omit the schema. The plan name is + omitted for the top-level plan, and must be included for any subplan. + + + + It is not always easy to determine the correct advice target by examining + the query. For instance, if the planner pulls up a subquery into the parent + query level, everything inside of it becomes part of the parent query level, + and uses the parent query's subplan name (or no subplan name, if pulled up + to the top level). Furthermore, the correct subquery name is sometimes not + obvious. For example, when two queries are joined using an operation such as + UNION or INTERSECT, no name for the + subqueries is present in the SQL syntax; instead, a system-generated name is + assigned to each branch. The easiest way to discover the proper advice + targets is to use EXPLAIN (PLAN_ADVICE) and examine the + generated advice. + + + + + + Advice Tags + + + An advice tag specifies a particular behavior that + should be enforced for some portion of the query, such as a particular + join order or join method. All advice tags take + advice targets as arguments, + and many allow lists of advice targets, which in some cases can be nested + multiple levels deep. Several different classes of advice targets exist, + each controlling a different aspect of query planning. 
+ + + + Scan Method Advice + +SEQ_SCAN(target [ ... ]) +TID_SCAN(target [ ... ]) +INDEX_SCAN(target index_name [ ... ]) +INDEX_ONLY_SCAN(target index_name [ ... ]) +FOREIGN_SCAN((target [ ... ]) [ ... ]) +BITMAP_HEAP_SCAN(target [ ... ]) + + + SEQ_SCAN specifies that each target table should be + scanned using a Seq Scan. TID_SCAN + specifies that each target table should be scanned using a + TID Scan or TID Range Scan. + BITMAP_HEAP_SCAN specifies that each target table + should be scanned using a Bitmap Heap Scan. + + + + INDEX_SCAN specifies that each target table should + be scanned using an Index Scan on the given index + name. INDEX_ONLY_SCAN is similar, but specifies the + use of an Index Only Scan. In either case, the index + name can be, but does not have to be, schema-qualified. + + + + FOREIGN_SCAN specifies that a join between two or + more foreign tables should be pushed down to a remote server so + that it can be implemented as a single Foreign Scan. + Specifying FOREIGN_SCAN for a single foreign table is + neither necessary nor permissible: a Foreign Scan will + need to be used regardless. If you want to prevent a join from being + pushed down, consider using the JOIN_ORDER tag for + that purpose. + + + + The planner supports many types of scans other than those listed here; + however, in most of those cases, there is no meaningful decision to be + made, and hence no need for advice. For example, the output of a + set-returning function that appears in the FROM clause + can only ever be scanned using a Function Scan, so + there is no opportunity for advice to change anything. + + + + + + Join Order Advice + +JOIN_ORDER(join_order_item [ ... ]) + +where join_order_item is: + +advice_target | +( join_order_item [ ... ] ) | +{ join_order_item [ ... 
] } + + + When JOIN_ORDER is used without any sublists, it + specifies an outer-deep join with the first advice target as the driving + table, joined to each subsequent advice target in turn in the order + specified. For instance, JOIN_ORDER(a b c) means that + a should be the driving table, and that it should be + joined first to b and then to c. + If there are more tables in the query than a, + b, and c, the rest can be joined + afterwards in any manner. + + + + If a JOIN_ORDER list contains a parenthesized sublist, + it specifies a non-outer-deep join. The tables in the sublist must first + be joined to each other much as if the sublist were a top-level + JOIN_ORDER list, and the resulting join product must + then appear on the inner side of a join at the appropriate point in the + join order. For example, JOIN_ORDER(a (b c) d) requires + a plan of this form: + + + +Join + -> Join + -> Scan on a + -> Join + -> Scan on b + -> Scan on c + -> Scan on d + + + + If a JOIN_ORDER list contains a sublist surrounded by + curly braces, this also specifies a non-outer-deep join. However, the join + order within the sublist is not constrained. For example, specifying + JOIN_ORDER(a {b c} d) would allow the scans of + b and c to be swapped in the + previous example, which is not allowed when parentheses are used. + + + + Parenthesized sublists can be arbitrarily nested, but sublists surrounded + by curly braces cannot themselves contain sublists. + + + + Multiple instances of JOIN_ORDER() can sometimes be + needed in order to fully constrain the join order. This occurs when there + are multiple join problems that are optimized separately by the planner. + This can happen due to the presence of subqueries, or because there is a + partitionwise join. In the latter case, each branch of the partitionwise + join can have its own join order, independent of every other branch. + + + + + + Join Method Advice + +join_method_name(join_method_item [ ... 
]) + +where join_method_name is: + +{ MERGE_JOIN_MATERIALIZE | MERGE_JOIN_PLAIN | NESTED_LOOP_MATERIALIZE | NESTED_LOOP_PLAIN | HASH_JOIN } + +and join_method_item is: + +{ advice_target | +( advice_target [ ... ] ) } + + + Join method advice specifies the table, or set of tables, that should + appear on the inner side of a join using the named join method. For + example, HASH_JOIN(a b) means that each of + a and b should appear on the inner + side of a hash join; a conforming plan must contain at least two hash + joins, one of which has a and nothing else on the + inner side, and the other of which has b and nothing + else on the inner side. On the other hand, + HASH_JOIN((a b)) means that the join product of + a and b should appear together + on the inner side of a single hash join. + + + + Note that join method advice implies a negative join order constraint. + Since the named table or tables must be on the inner side of a join using + the specified method, none of them can be the driving table for the entire + join problem. Moreover, no table inside the set should be joined to any + table outside the set until all tables within the set have been joined to + each other. For example, if the advice specifies + HASH_JOIN((a b)) and the system begins by joining either + of those tables to some third table c, the resulting + plan could never be compliant with the request to put exactly those two + tables on the inner side of a hash join. When using both join order advice + and join method advice for the same query, it is a good idea to make sure + that they do not mandate incompatible join orders. + + + + + + Partitionwise Advice + +PARTITIONWISE(partitionwise_item [ ... ]) + +where partitionwise_item is: + +{ advice_target | +( advice_target [ ... ] ) } + + + When applied to a single target, PARTITIONWISE + specifies that the specified table should not be part of any partitionwise + join. 
When applied to a list of targets, PARTITIONWISE + specifies that exactly that set of tables should be joined in + partitionwise fashion. Note that, regardless of what advice is specified, + no partitionwise joins will be possible if + enable_partitionwise_join = off. + + + + + + Semijoin Uniqueness Advice + +SEMIJOIN_UNIQUE(sj_unique_item [ ... ]) +SEMIJOIN_NON_UNIQUE(sj_unique_item [ ... ]) + +where sj_unique_item is: + +{ advice_target | +( advice_target [ ... ] ) } + + + The planner sometimes has a choice between implementing a semijoin + directly and implementing a semijoin by making the nullable side unique + and then performing an inner join. SEMIJOIN_UNIQUE + specifies the latter strategy, while SEMIJOIN_NON_UNIQUE + specifies the former strategy. In either case, the argument is the single + table or list of tables that appear beneath the nullable side of the join. + + + + + + Parallel Query Advice + +GATHER(gather_item [ ... ]) +GATHER_MERGE(gather_item [ ... ]) +NO_GATHER(advice_target [ ... ]) + +where gather_item is: + +{ advice_target | +( advice_target [ ... ] ) } + + + GATHER or GATHER_MERGE specifies + that Gather or Gather Merge, + respectively, should be placed on top of the single table specified as + a target, or on top of the join between the list of tables specified as + a target. This means that GATHER(a b c) is a request + for three different Gather nodes, while + GATHER((a b c)) is a request for a single + Gather node on top of a 3-way join. + + + + NO_GATHER specifies that none of the tables given + as arguments should appear beneath a Gather or + Gather Merge node. + + + + + + + + Advice Feedback + + + EXPLAIN provides feedback on whether supplied advice was + successfully applied to the query in the form of a comment on each piece + of supplied advice. 
For example: + + + +SET pg_plan_advice.advice = 'hash_join(f g) join_order(f g) index_scan(f no_such_index)'; +SET +rhaas=# EXPLAIN (COSTS OFF) SELECT * FROM jo_fact f + LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id + LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id + WHERE val1 = 1 AND val2 = 1; + QUERY PLAN +------------------------------------------------------------------- + Hash Join + Hash Cond: ((d1.id = f.dim1_id) AND (d2.id = f.dim2_id)) + -> Nested Loop + -> Seq Scan on jo_dim2 d2 + Filter: (val2 = 1) + -> Materialize + -> Seq Scan on jo_dim1 d1 + Filter: (val1 = 1) + -> Hash + -> Seq Scan on jo_fact f + Supplied Plan Advice: + INDEX_SCAN(f no_such_index) /* matched, inapplicable, failed */ + HASH_JOIN(f) /* matched */ + HASH_JOIN(g) /* not matched */ + JOIN_ORDER(f g) /* partially matched */ + + + + For this query, f is a valid advice target, but + g is not. Therefore, the request to place + f on the inner side of a hash join is listed as + matched, but the request to place g + on the inner side of a hash join is listed as + not matched. The JOIN_ORDER advice + tag involves one valid target and one invalid target, and so is listed as + partially matched. Note that + HASH_JOIN(f g) is actually a request for two logically + separate behaviors, whereas JOIN_ORDER(f g) is a single + request. When providing advice feedback, EXPLAIN shows + each logical request separately, together with all the feedback applicable + to that request type. + + + + Advice feedback can include any of the following: + + + + + + + matched means that all of the specified advice targets + were observed during query planning. + + + + + + partially matched means that some but not all of the + specified advice targets were observed during query planning. + + + + + + not matched means that none of the + specified advice targets were observed during query planning. 
This may + happen if the advice simply doesn't match the query, or it may + occur if the relevant portion of the query was not planned, perhaps + because it was gated by a condition that was simplified to constant false. + + + + + + inapplicable means that the advice tag could not + be applied to the advice targets for some reason. For example, this will + happen if the use of a nonexistent index is requested, or if an attempt + is made to control semijoin uniqueness for a non-semijoin. + + + + + + conflicting means that two or more pieces of advice + request incompatible behaviors. For example, if you advise a sequential + scan and an index scan for the same table, both requests will be flagged + as conflicting. This also commonly happens if join method advice or + semijoin uniqueness advice implies a join order incompatible with the + one explicitly specified; see + . + + + + + + failed means that the query plan does not comply with + the advice. This only occurs for entries that are also shown as + matched. It frequently occurs for entries that are + also marked as conflicting or + inapplicable. However, it can also occur when the + advice is valid insofar as pg_plan_advice is able + to determine, but the planner is not able to construct a legal + plan that can comply with the advice. It is important to note that the + sanity checks performed by pg_plan_advice are fairly + superficial and focused mostly on looking for logical inconsistencies in + the advice string; only the planner knows what will actually work. + + + + + + + All advice should be marked as exactly one of matched, + partially matched, or not matched. + + + + + + Advice Collectors + + + pg_plan_advice can be configured to automatically + generate advice every time a query is planned and store the query and + the generated advice string either in local or shared memory. + + + + To enable a collector, you must first set a collection limit. 
When the + number of queries for which advice has been stored exceeds the collection + limit, the oldest queries and the corresponding advice will be discarded. + Then, you must adjust a separate setting to actually enable advice + collection. For the local collector, set the collection limit by configuring + pg_plan_advice.local_collection_limit to a value + greater than zero, and then enable advice collection by setting + pg_plan_advice.local_collector = true. For the shared + collector, the procedure is the same, except that the names of the settings + are pg_plan_advice.shared_collection_limit and + pg_plan_advice.shared_collector. Note that the local + collector stores query texts and advice strings in backend-local memory, + and the shared collector does the same in dynamic shared memory, so + configuring large limits may result in considerable memory consumption. + + + + Once the collector is enabled, you can run any queries for which you wish + to see the generated plan advice. Then, you can examine what has been + collected using whichever of + SELECT * FROM pg_get_collected_local_advice() or + SELECT * FROM pg_get_collected_shared_advice() + corresponds to the collector you enabled. To discard the collected advice + and release memory, you can call + pg_clear_collected_local_advice() + or pg_clear_collected_shared_advice(). + + + + In addition to the query texts and advice strings, the advice collectors + will also store the OID of the role that caused the query to be planned, + the OID of the database in which the query was planned, the query ID, + and the time at which the collection occurred. This module does not + automatically enable query ID computation; therefore, if you want the + query ID value to be populated in collected advice, be sure to configure + compute_query_id = on. Otherwise, the query ID may + always show as 0. 
+ + + + + + Functions + + + Note that these functions will only be available if the + pg_plan_advice extension has been installed in the + current database, which is not mandatory, since much of the functionality + of this module can be used without installing the extension. + + + + + + + pg_clear_collected_local_advice() returns void + + pg_clear_collected_local_advice + + + + + + Removes all collected query texts and advice strings from backend-local + memory. + + + + + + + pg_get_collected_local_advice() returns setof (id bigint, + userid oid, dbid oid, queryid bigint, collection_time timestamptz, + query text, advice text) + + pg_get_collected_local_advice + + + + + + Returns all query texts and advice strings stored in the local + advice collector. + + + + + + + pg_clear_collected_shared_advice() returns void + + pg_clear_collected_shared_advice + + + + + + Removes all collected query texts and advice strings from shared + memory. + + + + + + + pg_get_collected_shared_advice() returns setof (id bigint, + userid oid, dbid oid, queryid bigint, collection_time timestamptz, + query text, advice text) + + pg_get_collected_shared_advice + + + + + + Returns all query texts and advice strings stored in the shared + advice collector. + + + + + + + + + + Configuration Parameters + + + + + + pg_plan_advice.advice (string) + + pg_plan_advice.advice configuration parameter + + + + + + pg_plan_advice.advice is an advice string to be + used during query planning. + + + + + + + pg_plan_advice.always_explain_supplied_advice (boolean) + + pg_plan_advice.always_explain_supplied_advice configuration parameter + + + + + + pg_plan_advice.always_explain_supplied_advice causes + EXPLAIN to always show any supplied advice and the + associated + advice feedback. + The default value is true. If set to + false, this information will be displayed only when + EXPLAIN (PLAN_ADVICE) is used. 
+ + + + + + + pg_plan_advice.always_store_advice_details (boolean) + + pg_plan_advice.always_store_advice_details configuration parameter + + + + + + pg_plan_advice.always_store_advice_details allows + EXPLAIN to show details related to plan advice even + when prepared queries are used. The default value is + false. When planning a prepared query, it is not + possible to know whether EXPLAIN will later be used, + so by default, to reduce overhead, pg_plan_advice + will not generate plan advice or feedback on supplied advice. This means + that if EXPLAIN EXECUTE is used on the prepared query, + it will not be able to show this information. Changing this setting to + true avoids this problem, but adds additional + overhead. It is probably a good idea to enable this option only in + sessions where it is needed, rather than on a system-wide basis. + + + + + + + pg_plan_advice.feedback_warnings (boolean) + + pg_plan_advice.feedback_warnings configuration parameter + + + + + + When set to true, pg_plan_advice.feedback_warnings + emits a warning whenever supplied plan advice is not successfully + enforced. The default value is false. + + + + + + + pg_plan_advice.local_collector (boolean) + + pg_plan_advice.local_collector configuration parameter + + + + + + pg_plan_advice.local_collector enables the + local advice collector. + The default value is false. + + + + + + + pg_plan_advice.local_collection_limit (integer) + + pg_plan_advice.local_collection_limit configuration parameter + + + + + + pg_plan_advice.local_collection_limit sets the + maximum number of query texts and advice strings retained by the + local advice collector. + The default value is 0. + + + + + + + pg_plan_advice.shared_collector (boolean) + + pg_plan_advice.shared_collector configuration parameter + + + + + + pg_plan_advice.shared_collector enables the + shared advice collector. + The default value is false. Only superusers and users + with the appropriate SET privilege can change this + setting. 
+ + + + + + + pg_plan_advice.shared_collection_limit (integer) + + pg_plan_advice.shared_collection_limit configuration parameter + + + + + + pg_plan_advice.shared_collection_limit sets the + maximum number of query texts and advice strings retained by the + shared advice collector. + The default value is 0. Only superusers and users + with the appropriate SET privilege can change this + setting. + + + + + + + pg_plan_advice.trace_mask (boolean) + + pg_plan_advice.trace_mask configuration parameter + + + + + + When pg_plan_advice.trace_mask is + true, pg_plan_advice will print + messages during query planning each time that + pg_plan_advice alters the mask of allowable query + plan types in response to supplied plan advice. The default value is + false. The messages printed by this setting are not + expected to be useful except for purposes of debugging this module. + + + + + + + + + + Author + + + Robert Haas rhaas@postgresql.org + + + + diff --git a/doc/src/sgml/protocol.sgml b/doc/src/sgml/protocol.sgml index a2b528c481e..89ac680efd5 100644 --- a/doc/src/sgml/protocol.sgml +++ b/doc/src/sgml/protocol.sgml @@ -223,10 +223,12 @@ shows the currently supported protocol versions. + + documents protocol versions that are unsupported or otherwise reserved. - Protocol Versions + Supported Protocol Versions @@ -248,6 +250,39 @@ + 3.0 + PostgreSQL 7.4 and later + + + +
+ + + Other Protocol Versions + + + + + Version + Supported by + Description + + + + + + 3.9999 + - + Reserved for protocol greasing. libpq may use this version, which + is higher than any minor version the project ever expects to use, to + test that servers and middleware properly implement protocol version + negotiation. Servers must not add special-case + logic for this version; they should simply compare it to their latest + supported version (which will always be smaller) and downgrade via a + NegotiateProtocolVersion message. + + + 3.1 - Reserved. Version 3.1 has not been used by any PostgreSQL @@ -257,15 +292,89 @@ - 3.0 - PostgreSQL 7.4 and later - - 2.0 up to PostgreSQL 13 - See previous releases of + Obsolete. See previous releases of the PostgreSQL documentation for - details + details. + + + +
+ + + + Protocol Extensions + + + Servers and clients may additionally negotiate individual extensions to the + protocol version in use. These are offered by the client in the startup + message, as specially-named parameters with a _pq_. + prefix. Servers reject any unknown or unsupported extensions by sending a + NegotiateProtocolVersion message containing the list of rejected parameter + names, at which point the client may choose whether to continue with the + connection. and + document the supported + and reserved protocol extension parameters, respectively. + + + + Supported Protocol Extensions + + + + + + + Parameter Name + Values + Supported by + Description + + + + + + + (No supported protocol extensions are currently defined.) + + + + +
+ + + Reserved Protocol Extensions + + + + + Parameter Name + Description + + + + + + _pq_.[name] + Any other parameter names beginning with _pq_., + that are not defined above, are reserved for future protocol expansion. + Servers must reject any that are received from a + client, by sending a NegotiateProtocolVersion message during the + startup flow, and should + otherwise continue the connection. + + + + + _pq_.test_protocol_negotiation + Reserved for protocol greasing. libpq may send this extension to + test that servers and middleware properly implement protocol extension + negotiation. Servers must not add special-case + logic for this parameter; they should simply send the list of all + unsupported options (including this one) via a NegotiateProtocolVersion + message. + @@ -295,8 +404,8 @@ To begin a session, a frontend opens a connection to the server and sends a startup message. This message includes the names of the user and of the database the user wants to connect to; it also identifies the particular - protocol version to be used. (Optionally, the startup message can include - additional settings for run-time parameters.) + protocol version to be used. (Optionally, the startup message can request + protocol extensions and include additional settings for run-time parameters.) The server then uses this information and the contents of its configuration files (such as pg_hba.conf) to determine @@ -6151,7 +6260,9 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;" In addition to the above, other parameters may be listed. Parameter names beginning with _pq_. are - reserved for use as protocol extensions, while others are + reserved for use as + protocol extensions, + while others are treated as run-time parameters to be set at backend start time. 
Such settings will be applied during backend start (after parsing the command-line arguments if any) and will diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml index 688e23c0e90..7f538e90194 100644 --- a/doc/src/sgml/ref/pg_dump.sgml +++ b/doc/src/sgml/ref/pg_dump.sgml @@ -263,12 +263,10 @@ PostgreSQL documentation - When is specified, - pg_dump makes no attempt to dump any other - database objects that the selected extension(s) might depend upon. - Therefore, there is no guarantee that the results of a - specific-extension dump can be successfully restored by themselves - into a clean database. + pg_dump does not dump the extension's + underlying installation files (such as shared libraries or control + files). These must be available on the destination system for the + restore to succeed. @@ -445,16 +443,6 @@ PostgreSQL documentation below. - - - When is specified, pg_dump - makes no attempt to dump any other database objects that the selected - schema(s) might depend upon. Therefore, there is no guarantee - that the results of a specific-schema dump can be successfully - restored by themselves into a clean database. - - - Non-schema objects such as large objects are not dumped when is @@ -596,16 +584,6 @@ PostgreSQL documentation be dumped. - - - When is specified, pg_dump - makes no attempt to dump any other database objects that the selected - table(s) might depend upon. Therefore, there is no guarantee - that the results of a specific-table dump can be successfully - restored by themselves into a clean database. - - - @@ -1689,6 +1667,17 @@ CREATE DATABASE foo WITH TEMPLATE template0; + + When options , or + are specified, pg_dump makes no attempt to dump + any other database objects that the selected object(s) might depend upon. + Therefore, there is no guarantee that the results of a dump so generated + can be successfully restored by themselves into a clean database. 
+ For example, if a table whose definition includes a foreign key is + specified to be restored, the table referenced by the foreign key is + not automatically restored. + + When a dump without schema is chosen and the option is used, pg_dump emits commands diff --git a/doc/src/sgml/ref/pg_restore.sgml b/doc/src/sgml/ref/pg_restore.sgml index 2c295bbf8dc..420a308a7c7 100644 --- a/doc/src/sgml/ref/pg_restore.sgml +++ b/doc/src/sgml/ref/pg_restore.sgml @@ -452,16 +452,6 @@ PostgreSQL documentation specify table(s) in a particular schema. - - - When is specified, pg_restore - makes no attempt to restore any other database objects that the - selected table(s) might depend upon. Therefore, there is no - guarantee that a specific-table restore into a clean database will - succeed. - - - This flag does not behave identically to the @@ -1089,6 +1079,16 @@ PostgreSQL documentation Notes + + When options or are specified, + pg_restore makes no attempt to restore + any other database objects that the selected table(s) or schema(s) + might depend upon. Therefore, there is no guarantee that a specific-table + restore into a clean database will succeed. For example, if a table + whose definition includes a foreign key is specified to be restored, the + table referenced by the foreign key is not automatically restored. + + If your installation has any local additions to the template1 database, be careful to load the output of diff --git a/doc/src/sgml/ref/psql-ref.sgml b/doc/src/sgml/ref/psql-ref.sgml index e464e3b13de..8b1d948ba05 100644 --- a/doc/src/sgml/ref/psql-ref.sgml +++ b/doc/src/sgml/ref/psql-ref.sgml @@ -5075,6 +5075,23 @@ testdb=> INSERT INTO my_table VALUES (:'content'); + + %i + + + Indicates whether the connected server is running in hot standby mode. + The value is shown as standby, if the server is + currently in hot standby and reports + as on, + and primary otherwise. 
This is useful when + connecting to multiple servers to quickly determine the role of + each connection. A value of ? is shown + when connected to a server running + PostgreSQL 13 or older. + + + + %x diff --git a/doc/src/sgml/system-views.sgml b/doc/src/sgml/system-views.sgml index 8b4abef8c68..e5fe423fc61 100644 --- a/doc/src/sgml/system-views.sgml +++ b/doc/src/sgml/system-views.sgml @@ -5045,6 +5045,45 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx non-null elements. (Null for scalar types.) + + + + range_length_histogram anyarray + + + A histogram of the lengths of non-empty and non-null range values of an + expression. (Null for non-range types.) + + + This histogram is calculated using the subtype_diff + range function regardless of whether range bounds are inclusive. + + + + + + range_empty_frac float4 + + + Fraction of expression entries whose values are empty ranges. + (Null for non-range types.) + + + + + + range_bounds_histogram anyarray + + + A histogram of lower and upper bounds of non-empty and non-null range + values. (Null for non-range types.) + + + These two histograms are represented as a single array of ranges, whose + lower bounds represent the histogram of lower bounds, and upper bounds + represent the histogram of upper bounds. + +
diff --git a/meson.build b/meson.build index df907b62da3..96b3869df86 100644 --- a/meson.build +++ b/meson.build @@ -2911,7 +2911,7 @@ gnugetopt_dep = cc.find_library('gnugetopt', required: false) # (i.e., allow '-' as a flag character), so use our version on those platforms # - We want to use system's getopt_long() only if the system provides struct # option -always_replace_getopt = host_system in ['windows', 'cygwin', 'openbsd', 'solaris'] +always_replace_getopt = host_system in ['windows', 'cygwin', 'openbsd', 'sunos'] always_replace_getopt_long = host_system in ['windows', 'cygwin'] or not cdata.has('HAVE_STRUCT_OPTION') # Required on BSDs diff --git a/src/backend/Makefile b/src/backend/Makefile index baa9b05d021..05642dc02e3 100644 --- a/src/backend/Makefile +++ b/src/backend/Makefile @@ -136,9 +136,6 @@ parser/gram.h: parser/gram.y storage/lmgr/lwlocknames.h: storage/lmgr/generate-lwlocknames.pl ../include/storage/lwlocklist.h utils/activity/wait_event_names.txt $(MAKE) -C storage/lmgr lwlocknames.h -utils/activity/wait_event_types.h: utils/activity/generate-wait_event_types.pl utils/activity/wait_event_names.txt - $(MAKE) -C utils/activity wait_event_types.h pgstat_wait_event.c wait_event_funcs_data.c - # run this unconditionally to avoid needing to know its dependencies here: submake-catalog-headers: $(MAKE) -C ../include/catalog generated-headers @@ -163,18 +160,13 @@ submake-utils-headers: .PHONY: generated-headers -generated-headers: $(top_builddir)/src/include/storage/lwlocknames.h $(top_builddir)/src/include/utils/wait_event_types.h submake-catalog-headers submake-nodes-headers submake-utils-headers parser/gram.h +generated-headers: $(top_builddir)/src/include/storage/lwlocknames.h submake-catalog-headers submake-nodes-headers submake-utils-headers parser/gram.h $(top_builddir)/src/include/storage/lwlocknames.h: storage/lmgr/lwlocknames.h prereqdir=`cd '$(dir $<)' >/dev/null && pwd` && \ cd '$(dir $@)' && rm -f $(notdir $@) && \ $(LN_S) 
"$$prereqdir/$(notdir $<)" . -$(top_builddir)/src/include/utils/wait_event_types.h: utils/activity/wait_event_types.h - prereqdir=`cd '$(dir $<)' >/dev/null && pwd` && \ - cd '$(dir $@)' && rm -f $(notdir $@) && \ - $(LN_S) "$$prereqdir/$(notdir $<)" . - utils/probes.o: utils/probes.d $(SUBDIROBJS) $(DTRACE) $(DTRACEFLAGS) -C -G -s $(call expand_subsys,$^) -o $@ diff --git a/src/backend/access/common/tupdesc.c b/src/backend/access/common/tupdesc.c index 94b4f1f9975..b69d10f0a45 100644 --- a/src/backend/access/common/tupdesc.c +++ b/src/backend/access/common/tupdesc.c @@ -86,25 +86,8 @@ populate_compact_attribute_internal(Form_pg_attribute src, IsCatalogRelationOid(src->attrelid) ? ATTNULLABLE_VALID : ATTNULLABLE_UNKNOWN; - switch (src->attalign) - { - case TYPALIGN_INT: - dst->attalignby = ALIGNOF_INT; - break; - case TYPALIGN_CHAR: - dst->attalignby = sizeof(char); - break; - case TYPALIGN_DOUBLE: - dst->attalignby = ALIGNOF_DOUBLE; - break; - case TYPALIGN_SHORT: - dst->attalignby = ALIGNOF_SHORT; - break; - default: - dst->attalignby = 0; - elog(ERROR, "invalid attalign value: %c", src->attalign); - break; - } + /* Compute numeric alignment requirement, too */ + dst->attalignby = typalign_to_alignby(src->attalign); } /* diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index d5944205db2..dfffce3e396 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -291,7 +291,7 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate, SplitPageLayout *dist = NULL, *ptr; BlockNumber oldrlink = InvalidBlockNumber; - GistNSN oldnsn = 0; + GistNSN oldnsn = InvalidXLogRecPtr; SplitPageLayout rootpg; bool is_rootsplit; int npage; @@ -654,7 +654,7 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, /* Start from the root */ firststack.blkno = GIST_ROOT_BLKNO; - firststack.lsn = 0; + firststack.lsn = InvalidXLogRecPtr; firststack.retry_from_parent = false; firststack.parent = NULL; 
firststack.downlinkoffnum = InvalidOffsetNumber; diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c index 83bda209c42..036421fc664 100644 --- a/src/backend/access/hash/hashfunc.c +++ b/src/backend/access/hash/hashfunc.c @@ -27,6 +27,7 @@ #include "postgres.h" #include "common/hashfn.h" +#include "utils/builtins.h" #include "utils/float.h" #include "utils/fmgrprotos.h" #include "utils/pg_locale.h" @@ -233,6 +234,7 @@ hashoidvector(PG_FUNCTION_ARGS) { oidvector *key = (oidvector *) PG_GETARG_POINTER(0); + check_valid_oidvector(key); return hash_any((unsigned char *) key->values, key->dim1 * sizeof(Oid)); } @@ -241,6 +243,7 @@ hashoidvectorextended(PG_FUNCTION_ARGS) { oidvector *key = (oidvector *) PG_GETARG_POINTER(0); + check_valid_oidvector(key); return hash_any_extended((unsigned char *) key->values, key->dim1 * sizeof(Oid), PG_GETARG_INT64(1)); diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index f30a56ecf55..3004964ab7f 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -111,11 +111,11 @@ static HeapTuple ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool ke /* - * Each tuple lock mode has a corresponding heavyweight lock, and one or two - * corresponding MultiXactStatuses (one to merely lock tuples, another one to - * update them). This table (and the macros below) helps us determine the - * heavyweight lock mode and MultiXactStatus values to use for any particular - * tuple lock strength. + * This table lists the heavyweight lock mode that corresponds to each tuple + * lock mode, as well as one or two corresponding MultiXactStatus values: + * .lockstatus to merely lock tuples, and .updstatus to update them. The + * latter is set to -1 if the corresponding tuple lock mode does not allow + * updating tuples -- see get_mxact_status_for_lock(). * * These interact with InplaceUpdateTupleLock, an alias for ExclusiveLock. 
* @@ -127,29 +127,30 @@ static const struct LOCKMODE hwlock; int lockstatus; int updstatus; -} +} tupleLockExtraInfo[] = - tupleLockExtraInfo[MaxLockTupleMode + 1] = { - { /* LockTupleKeyShare */ - AccessShareLock, - MultiXactStatusForKeyShare, - -1 /* KeyShare does not allow updating tuples */ + [LockTupleKeyShare] = { + .hwlock = AccessShareLock, + .lockstatus = MultiXactStatusForKeyShare, + /* KeyShare does not allow updating tuples */ + .updstatus = -1 }, - { /* LockTupleShare */ - RowShareLock, - MultiXactStatusForShare, - -1 /* Share does not allow updating tuples */ + [LockTupleShare] = { + .hwlock = RowShareLock, + .lockstatus = MultiXactStatusForShare, + /* Share does not allow updating tuples */ + .updstatus = -1 }, - { /* LockTupleNoKeyExclusive */ - ExclusiveLock, - MultiXactStatusForNoKeyUpdate, - MultiXactStatusNoKeyUpdate + [LockTupleNoKeyExclusive] = { + .hwlock = ExclusiveLock, + .lockstatus = MultiXactStatusForNoKeyUpdate, + .updstatus = MultiXactStatusNoKeyUpdate }, - { /* LockTupleExclusive */ - AccessExclusiveLock, - MultiXactStatusForUpdate, - MultiXactStatusUpdate + [LockTupleExclusive] = { + .hwlock = AccessExclusiveLock, + .lockstatus = MultiXactStatusForUpdate, + .updstatus = MultiXactStatusUpdate } }; @@ -1421,16 +1422,6 @@ heap_getnext(TableScanDesc sscan, ScanDirection direction) (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg_internal("only heap AM is supported"))); - /* - * We don't expect direct calls to heap_getnext with valid CheckXidAlive - * for catalog or regular tables. See detailed comments in xact.c where - * these variables are declared. Normally we have such a check at tableam - * level API but this is called from many places so we need to ensure it - * here. 
- */ - if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) - elog(ERROR, "unexpected heap_getnext call during logical decoding"); - /* Note: no locking manipulations needed */ if (scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE) diff --git a/src/backend/access/heap/heaptoast.c b/src/backend/access/heap/heaptoast.c index e28fe47a449..6ddf6c6cf9f 100644 --- a/src/backend/access/heap/heaptoast.c +++ b/src/backend/access/heap/heaptoast.c @@ -768,7 +768,7 @@ heap_fetch_toast_slice(Relation toastrel, Oid valueid, int32 attrsize, chcpyend = (sliceoffset + slicelength - 1) % TOAST_MAX_CHUNK_SIZE; memcpy(VARDATA(result) + - (curchunk * TOAST_MAX_CHUNK_SIZE - sliceoffset) + chcpystrt, + curchunk * TOAST_MAX_CHUNK_SIZE - sliceoffset + chcpystrt, chunkdata + chcpystrt, (chcpyend - chcpystrt) + 1); diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index a29be6f467b..5e89b86a62c 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -420,6 +420,14 @@ systable_beginscan(Relation heapRelation, sysscan->snapshot = NULL; } + /* + * If CheckXidAlive is set then set a flag to indicate that system table + * scan is in-progress. See detailed comments in xact.c where these + * variables are declared. + */ + if (TransactionIdIsValid(CheckXidAlive)) + bsysscan = true; + if (irel) { int i; @@ -468,14 +476,6 @@ systable_beginscan(Relation heapRelation, sysscan->iscan = NULL; } - /* - * If CheckXidAlive is set then set a flag to indicate that system table - * scan is in-progress. See detailed comments in xact.c where these - * variables are declared. 
- */ - if (TransactionIdIsValid(CheckXidAlive)) - bsysscan = true; - return sysscan; } @@ -707,13 +707,6 @@ systable_beginscan_ordered(Relation heapRelation, elog(ERROR, "column is not in index"); } - sysscan->iscan = index_beginscan(heapRelation, indexRelation, - snapshot, NULL, nkeys, 0); - index_rescan(sysscan->iscan, idxkey, nkeys, NULL, 0); - sysscan->scan = NULL; - - pfree(idxkey); - /* * If CheckXidAlive is set then set a flag to indicate that system table * scan is in-progress. See detailed comments in xact.c where these @@ -722,6 +715,13 @@ systable_beginscan_ordered(Relation heapRelation, if (TransactionIdIsValid(CheckXidAlive)) bsysscan = true; + sysscan->iscan = index_beginscan(heapRelation, indexRelation, + snapshot, NULL, nkeys, 0); + index_rescan(sysscan->iscan, idxkey, nkeys, NULL, 0); + sysscan->scan = NULL; + + pfree(idxkey); + return sysscan; } diff --git a/src/backend/access/nbtree/nbtcompare.c b/src/backend/access/nbtree/nbtcompare.c index 8425805a292..1d343377e98 100644 --- a/src/backend/access/nbtree/nbtcompare.c +++ b/src/backend/access/nbtree/nbtcompare.c @@ -57,6 +57,7 @@ #include +#include "utils/builtins.h" #include "utils/fmgrprotos.h" #include "utils/skipsupport.h" #include "utils/sortsupport.h" @@ -587,6 +588,9 @@ btoidvectorcmp(PG_FUNCTION_ARGS) oidvector *b = (oidvector *) PG_GETARG_POINTER(1); int i; + check_valid_oidvector(a); + check_valid_oidvector(b); + /* We arbitrarily choose to sort first by vector length */ if (a->dim1 != b->dim1) PG_RETURN_INT32(a->dim1 - b->dim1); diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 90ab4e91b56..3a45508f62e 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -69,8 +69,8 @@ /* * DISABLE_LEADER_PARTICIPATION disables the leader's participation in * parallel index builds. This may be useful as a debugging aid. 
-#undef DISABLE_LEADER_PARTICIPATION */ +/* #define DISABLE_LEADER_PARTICIPATION */ /* * Status record for spooling/sorting phase. (Note we may have two of diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index 87491796523..dfda1af412e 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -117,8 +117,8 @@ table_beginscan_catalog(Relation relation, int nkeys, ScanKeyData *key) Oid relid = RelationGetRelid(relation); Snapshot snapshot = RegisterSnapshot(GetCatalogSnapshot(relid)); - return relation->rd_tableam->scan_begin(relation, snapshot, nkeys, key, - NULL, flags); + return table_beginscan_common(relation, snapshot, nkeys, key, + NULL, flags); } @@ -184,8 +184,8 @@ table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan) snapshot = SnapshotAny; } - return relation->rd_tableam->scan_begin(relation, snapshot, 0, NULL, - pscan, flags); + return table_beginscan_common(relation, snapshot, 0, NULL, + pscan, flags); } TableScanDesc @@ -214,8 +214,8 @@ table_beginscan_parallel_tidrange(Relation relation, snapshot = SnapshotAny; } - sscan = relation->rd_tableam->scan_begin(relation, snapshot, 0, NULL, - pscan, flags); + sscan = table_beginscan_common(relation, snapshot, 0, NULL, + pscan, flags); return sscan; } @@ -269,14 +269,6 @@ table_tuple_get_latest_tid(TableScanDesc scan, ItemPointer tid) Relation rel = scan->rs_rd; const TableAmRoutine *tableam = rel->rd_tableam; - /* - * We don't expect direct calls to table_tuple_get_latest_tid with valid - * CheckXidAlive for catalog or regular tables. See detailed comments in - * xact.c where these variables are declared. - */ - if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) - elog(ERROR, "unexpected table_tuple_get_latest_tid call during logical decoding"); - /* * Since this can be called with user-supplied TID, don't trust the input * too much. 
diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c index 01a89104ef0..fe00488487d 100644 --- a/src/backend/access/transam/parallel.c +++ b/src/backend/access/transam/parallel.c @@ -357,7 +357,7 @@ InitializeParallelDSM(ParallelContext *pcxt) fps->stmt_ts = GetCurrentStatementStartTimestamp(); fps->serializable_xact_handle = ShareSerializableXact(); SpinLockInit(&fps->mutex); - fps->last_xlog_end = 0; + fps->last_xlog_end = InvalidXLogRecPtr; shm_toc_insert(pcxt->toc, PARALLEL_KEY_FIXED, fps); /* We can skip the rest of this if we're not budgeting for any workers. */ @@ -530,7 +530,7 @@ ReinitializeParallelDSM(ParallelContext *pcxt) /* Reset a few bits of fixed parallel state to a clean state. */ fps = shm_toc_lookup(pcxt->toc, PARALLEL_KEY_FIXED, false); - fps->last_xlog_end = 0; + fps->last_xlog_end = InvalidXLogRecPtr; /* Recreate error queues (if they exist). */ if (pcxt->nworkers > 0) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 601ce3faa64..eabc4d48208 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -470,7 +470,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, FullTransactionId fxid, proc->databaseId = databaseid; proc->roleId = owner; proc->tempNamespaceId = InvalidOid; - proc->isRegularBackend = false; + proc->backendType = B_INVALID; proc->lwWaiting = LW_WS_NOT_WAITING; proc->lwWaitMode = 0; proc->waitLock = NULL; diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 16614e152dd..13ec6225b85 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -2060,7 +2060,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic) /* Have to write it ourselves */ TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START(); WriteRqst.Write = OldPageRqstPtr; - WriteRqst.Flush = 0; + WriteRqst.Flush = InvalidXLogRecPtr; XLogWrite(WriteRqst, tli, false); 
LWLockRelease(WALWriteLock); pgWalUsage.wal_buffers_full++; @@ -3077,7 +3077,7 @@ XLogBackgroundFlush(void) else { /* no flushing, this time round */ - WriteRqst.Flush = 0; + WriteRqst.Flush = InvalidXLogRecPtr; } #ifdef WAL_DEBUG @@ -5207,7 +5207,7 @@ BootStrapXLOG(uint32 data_checksum_version) /* Insert the initial checkpoint record */ recptr = ((char *) page + SizeOfXLogLongPHD); record = (XLogRecord *) recptr; - record->xl_prev = 0; + record->xl_prev = InvalidXLogRecPtr; record->xl_xid = InvalidTransactionId; record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint); record->xl_info = XLOG_CHECKPOINT_SHUTDOWN; diff --git a/src/backend/access/transam/xlogprefetcher.c b/src/backend/access/transam/xlogprefetcher.c index 3c3f067aafb..24cfa96d737 100644 --- a/src/backend/access/transam/xlogprefetcher.c +++ b/src/backend/access/transam/xlogprefetcher.c @@ -967,7 +967,7 @@ XLogPrefetcherBeginRead(XLogPrefetcher *prefetcher, XLogRecPtr recPtr) /* Book-keeping to avoid readahead on first read. */ prefetcher->begin_ptr = recPtr; - prefetcher->no_readahead_until = 0; + prefetcher->no_readahead_until = InvalidXLogRecPtr; /* This will forget about any queued up records in the decoder. 
*/ XLogBeginRead(prefetcher->reader, recPtr); diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index a81dcbb5d79..4fc37a031d9 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -261,7 +261,7 @@ static TimestampTz XLogReceiptTime = 0; static XLogSource XLogReceiptSource = XLOG_FROM_ANY; /* Local copy of WalRcv->flushedUpto */ -static XLogRecPtr flushedUpto = 0; +static XLogRecPtr flushedUpto = InvalidXLogRecPtr; static TimeLineID receiveTLI = 0; /* @@ -3918,7 +3918,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, RequestXLogStreaming(tli, ptr, PrimaryConnInfo, PrimarySlotName, wal_receiver_create_temp_slot); - flushedUpto = 0; + flushedUpto = InvalidXLogRecPtr; } /* @@ -4096,7 +4096,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr) { - static XLogRecPtr lastComplaint = 0; + static XLogRecPtr lastComplaint = InvalidXLogRecPtr; if (readSource == XLOG_FROM_PG_WAL && emode == LOG) { diff --git a/src/backend/backup/walsummary.c b/src/backend/backup/walsummary.c index 21164faac7e..4cd1824fbc6 100644 --- a/src/backend/backup/walsummary.c +++ b/src/backend/backup/walsummary.c @@ -214,7 +214,7 @@ OpenWalSummaryFile(WalSummaryFile *ws, bool missing_ok) LSN_FORMAT_ARGS(ws->end_lsn)); file = PathNameOpenFile(path, O_RDONLY); - if (file < 0 && (errno != EEXIST || !missing_ok)) + if (file < 0 && (errno != ENOENT || !missing_ok)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", path))); @@ -251,7 +251,7 @@ RemoveWalSummaryIfOlderThan(WalSummaryFile *ws, time_t cutoff_time) if (unlink(path) != 0) ereport(ERROR, (errcode_for_file_access(), - errmsg("could not stat file \"%s\": %m", path))); + errmsg("could not remove file \"%s\": %m", path))); ereport(DEBUG2, (errmsg_internal("removing file \"%s\"", path))); } diff --git 
a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index dd57624b4f9..7d32cd0e159 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -131,13 +131,13 @@ static const struct typinfo TypInfo[] = { F_OIDVECTORIN, F_OIDVECTOROUT}, {"_int4", INT4ARRAYOID, INT4OID, -1, false, TYPALIGN_INT, TYPSTORAGE_EXTENDED, InvalidOid, F_ARRAY_IN, F_ARRAY_OUT}, - {"_text", 1009, TEXTOID, -1, false, TYPALIGN_INT, TYPSTORAGE_EXTENDED, DEFAULT_COLLATION_OID, + {"_text", TEXTARRAYOID, TEXTOID, -1, false, TYPALIGN_INT, TYPSTORAGE_EXTENDED, DEFAULT_COLLATION_OID, F_ARRAY_IN, F_ARRAY_OUT}, - {"_oid", 1028, OIDOID, -1, false, TYPALIGN_INT, TYPSTORAGE_EXTENDED, InvalidOid, + {"_oid", OIDARRAYOID, OIDOID, -1, false, TYPALIGN_INT, TYPSTORAGE_EXTENDED, InvalidOid, F_ARRAY_IN, F_ARRAY_OUT}, - {"_char", 1002, CHAROID, -1, false, TYPALIGN_INT, TYPSTORAGE_EXTENDED, InvalidOid, + {"_char", CHARARRAYOID, CHAROID, -1, false, TYPALIGN_INT, TYPSTORAGE_EXTENDED, InvalidOid, F_ARRAY_IN, F_ARRAY_OUT}, - {"_aclitem", 1034, ACLITEMOID, -1, false, TYPALIGN_INT, TYPSTORAGE_EXTENDED, InvalidOid, + {"_aclitem", ACLITEMARRAYOID, ACLITEMOID, -1, false, TYPALIGN_DOUBLE, TYPSTORAGE_EXTENDED, InvalidOid, F_ARRAY_IN, F_ARRAY_OUT} }; diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index 606434823cf..a6ed9849e77 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -2635,6 +2635,7 @@ AddRelationNewConstraints(Relation rel, * requested validity. 
*/ if (AdjustNotNullInheritance(RelationGetRelid(rel), colnum, + cdef->conname, is_local, cdef->is_no_inherit, cdef->skip_validation)) continue; diff --git a/src/backend/catalog/pg_constraint.c b/src/backend/catalog/pg_constraint.c index cbbcf166e45..b12765ae691 100644 --- a/src/backend/catalog/pg_constraint.c +++ b/src/backend/catalog/pg_constraint.c @@ -731,14 +731,15 @@ extractNotNullColumn(HeapTuple constrTup) * If a constraint exists but the connoinherit flag is not what the caller * wants, throw an error about the incompatibility. If the desired * constraint is valid but the existing constraint is not valid, also - * throw an error about that (the opposite case is acceptable). + * throw an error about that (the opposite case is acceptable). If + * the proposed constraint has a different name, also throw an error. * * If everything checks out, we adjust conislocal/coninhcount and return * true. If is_local is true we flip conislocal true, or do nothing if * it's already true; otherwise we increment coninhcount by 1. */ bool -AdjustNotNullInheritance(Oid relid, AttrNumber attnum, +AdjustNotNullInheritance(Oid relid, AttrNumber attnum, const char *new_conname, bool is_local, bool is_no_inherit, bool is_notvalid) { HeapTuple tup; @@ -777,6 +778,22 @@ AdjustNotNullInheritance(Oid relid, AttrNumber attnum, errhint("You might need to validate it using %s.", "ALTER TABLE ... VALIDATE CONSTRAINT")); + /* + * If, for a new constraint that is being defined locally (i.e., not + * being passed down via inheritance), a name was specified, then + * verify that the existing constraint has the same name. Otherwise + * throw an error. Names of inherited constraints are ignored because + * they are not directly user-specified, so matching is not important. 
+ */ + if (is_local && new_conname && + strcmp(new_conname, NameStr(conform->conname)) != 0) + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot create not-null constraint \"%s\" on column \"%s\" of table \"%s\"", + new_conname, get_attname(relid, attnum, false), get_rel_name(relid)), + errdetail("A not-null constraint named \"%s\" already exists for this column.", + NameStr(conform->conname))); + if (!is_local) { if (pg_add_s16_overflow(conform->coninhcount, 1, diff --git a/src/backend/catalog/pg_depend.c b/src/backend/catalog/pg_depend.c index 55309d16f15..07c2d41c189 100644 --- a/src/backend/catalog/pg_depend.c +++ b/src/backend/catalog/pg_depend.c @@ -23,12 +23,14 @@ #include "catalog/pg_constraint.h" #include "catalog/pg_depend.h" #include "catalog/pg_extension.h" +#include "catalog/pg_type.h" #include "catalog/partition.h" #include "commands/extension.h" #include "miscadmin.h" #include "utils/fmgroids.h" #include "utils/lsyscache.h" #include "utils/rel.h" +#include "utils/syscache.h" static bool isObjectPinned(const ObjectAddress *object); @@ -813,6 +815,77 @@ getAutoExtensionsOfObject(Oid classId, Oid objectId) return result; } +/* + * Look up a type belonging to an extension. + * + * Returns the type's OID, or InvalidOid if not found. + * + * Notice that the type is specified by name only, without a schema. + * That's because this will typically be used by relocatable extensions + * which can't make a-priori assumptions about which schema their objects + * are in. As long as the extension only defines one type of this name, + * the answer is unique anyway. + * + * We might later add the ability to look up functions, operators, etc. 
+ */ +Oid +getExtensionType(Oid extensionOid, const char *typname) +{ + Oid result = InvalidOid; + Relation depRel; + ScanKeyData key[3]; + SysScanDesc scan; + HeapTuple tup; + + depRel = table_open(DependRelationId, AccessShareLock); + + ScanKeyInit(&key[0], + Anum_pg_depend_refclassid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(ExtensionRelationId)); + ScanKeyInit(&key[1], + Anum_pg_depend_refobjid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(extensionOid)); + ScanKeyInit(&key[2], + Anum_pg_depend_refobjsubid, + BTEqualStrategyNumber, F_INT4EQ, + Int32GetDatum(0)); + + scan = systable_beginscan(depRel, DependReferenceIndexId, true, + NULL, 3, key); + + while (HeapTupleIsValid(tup = systable_getnext(scan))) + { + Form_pg_depend depform = (Form_pg_depend) GETSTRUCT(tup); + + if (depform->classid == TypeRelationId && + depform->deptype == DEPENDENCY_EXTENSION) + { + Oid typoid = depform->objid; + HeapTuple typtup; + + typtup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(typoid)); + if (!HeapTupleIsValid(typtup)) + continue; /* should we throw an error? 
*/ + if (strcmp(NameStr(((Form_pg_type) GETSTRUCT(typtup))->typname), + typname) == 0) + { + result = typoid; + ReleaseSysCache(typtup); + break; /* no need to keep searching */ + } + ReleaseSysCache(typtup); + } + } + + systable_endscan(scan); + + table_close(depRel, AccessShareLock); + + return result; +} + /* * Detect whether a sequence is marked as "owned" by a column * diff --git a/src/backend/catalog/pg_proc.c b/src/backend/catalog/pg_proc.c index acff7a0096d..5df4b3f7a91 100644 --- a/src/backend/catalog/pg_proc.c +++ b/src/backend/catalog/pg_proc.c @@ -1206,7 +1206,7 @@ match_prosrc_to_literal(const char *prosrc, const char *literal, if (cursorpos > 0) newcp++; } - chlen = pg_mblen(prosrc); + chlen = pg_mblen_cstr(prosrc); if (strncmp(prosrc, literal, chlen) != 0) goto fail; prosrc += chlen; diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 7553f31fef0..1ea8f1faa9e 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -363,7 +363,28 @@ CREATE VIEW pg_stats_ext_exprs WITH (security_barrier) AS WHEN (stat.a).stakind3 = 5 THEN (stat.a).stanumbers3 WHEN (stat.a).stakind4 = 5 THEN (stat.a).stanumbers4 WHEN (stat.a).stakind5 = 5 THEN (stat.a).stanumbers5 - END) AS elem_count_histogram + END) AS elem_count_histogram, + (CASE + WHEN (stat.a).stakind1 = 6 THEN (stat.a).stavalues1 + WHEN (stat.a).stakind2 = 6 THEN (stat.a).stavalues2 + WHEN (stat.a).stakind3 = 6 THEN (stat.a).stavalues3 + WHEN (stat.a).stakind4 = 6 THEN (stat.a).stavalues4 + WHEN (stat.a).stakind5 = 6 THEN (stat.a).stavalues5 + END) AS range_length_histogram, + (CASE + WHEN (stat.a).stakind1 = 6 THEN (stat.a).stanumbers1[1] + WHEN (stat.a).stakind2 = 6 THEN (stat.a).stanumbers2[1] + WHEN (stat.a).stakind3 = 6 THEN (stat.a).stanumbers3[1] + WHEN (stat.a).stakind4 = 6 THEN (stat.a).stanumbers4[1] + WHEN (stat.a).stakind5 = 6 THEN (stat.a).stanumbers5[1] + END) AS range_empty_frac, + (CASE + WHEN (stat.a).stakind1 = 7 
THEN (stat.a).stavalues1 + WHEN (stat.a).stakind2 = 7 THEN (stat.a).stavalues2 + WHEN (stat.a).stakind3 = 7 THEN (stat.a).stavalues3 + WHEN (stat.a).stakind4 = 7 THEN (stat.a).stavalues4 + WHEN (stat.a).stakind5 = 7 THEN (stat.a).stavalues5 + END) AS range_bounds_histogram FROM pg_statistic_ext s JOIN pg_class c ON (c.oid = s.stxrelid) LEFT JOIN pg_statistic_ext_data sd ON (s.oid = sd.stxoid) LEFT JOIN pg_namespace cn ON (cn.oid = c.relnamespace) diff --git a/src/backend/commands/comment.c b/src/backend/commands/comment.c index caacb17e5d7..771aba2a69f 100644 --- a/src/backend/commands/comment.c +++ b/src/backend/commands/comment.c @@ -41,6 +41,7 @@ CommentObject(CommentStmt *stmt) { Relation relation; ObjectAddress address = InvalidObjectAddress; + bool missing_ok; /* * When loading a dump, we may see a COMMENT ON DATABASE for the old name @@ -63,6 +64,14 @@ CommentObject(CommentStmt *stmt) } } + /* + * During binary upgrade, allow nonexistent large objects so that we don't + * have to create them during schema restoration. pg_upgrade will + * transfer the contents of pg_largeobject_metadata via COPY or by + * copying/linking its files from the old cluster later on. + */ + missing_ok = IsBinaryUpgrade && stmt->objtype == OBJECT_LARGEOBJECT; + /* * Translate the parser representation that identifies this object into an * ObjectAddress. get_object_address() will throw an error if the object @@ -70,7 +79,8 @@ CommentObject(CommentStmt *stmt) * against concurrent DROP operations. */ address = get_object_address(stmt->objtype, stmt->object, - &relation, ShareUpdateExclusiveLock, false); + &relation, ShareUpdateExclusiveLock, + missing_ok); /* Require ownership of the target object. 
*/ check_object_ownership(GetUserId(), stmt->objtype, address, diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c index 5868a7fa11f..94d6f415a06 100644 --- a/src/backend/commands/copyfromparse.c +++ b/src/backend/commands/copyfromparse.c @@ -249,7 +249,9 @@ CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread) switch (cstate->copy_src) { case COPY_FILE: + pgstat_report_wait_start(WAIT_EVENT_COPY_FROM_READ); bytesread = fread(databuf, 1, maxread, cstate->copy_file); + pgstat_report_wait_end(); if (ferror(cstate->copy_file)) ereport(ERROR, (errcode_for_file_access(), diff --git a/src/backend/commands/copyto.c b/src/backend/commands/copyto.c index 4ab4a3893d5..9ceeff6d99e 100644 --- a/src/backend/commands/copyto.c +++ b/src/backend/commands/copyto.c @@ -454,6 +454,7 @@ CopySendEndOfRow(CopyToState cstate) switch (cstate->copy_dest) { case COPY_FILE: + pgstat_report_wait_start(WAIT_EVENT_COPY_TO_WRITE); if (fwrite(fe_msgbuf->data, fe_msgbuf->len, 1, cstate->copy_file) != 1 || ferror(cstate->copy_file)) @@ -486,6 +487,7 @@ CopySendEndOfRow(CopyToState cstate) (errcode_for_file_access(), errmsg("could not write to COPY file: %m"))); } + pgstat_report_wait_end(); break; case COPY_FRONTEND: /* Dump the accumulated row as one CopyData message */ diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 87949054f26..33311760df7 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -60,6 +60,7 @@ #include "storage/lmgr.h" #include "storage/md.h" #include "storage/procarray.h" +#include "storage/procsignal.h" #include "storage/smgr.h" #include "utils/acl.h" #include "utils/builtins.h" diff --git a/src/backend/commands/extension.c b/src/backend/commands/extension.c index 596105ee078..81f24615d51 100644 --- a/src/backend/commands/extension.c +++ b/src/backend/commands/extension.c @@ -45,6 +45,7 @@ #include "catalog/pg_depend.h" #include 
"catalog/pg_extension.h" #include "catalog/pg_namespace.h" +#include "catalog/pg_proc.h" #include "catalog/pg_type.h" #include "commands/alter.h" #include "commands/comment.h" @@ -62,6 +63,7 @@ #include "utils/builtins.h" #include "utils/conffiles.h" #include "utils/fmgroids.h" +#include "utils/inval.h" #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -141,7 +143,26 @@ typedef struct char *loc; } ExtensionLocation; +/* + * Cache structure for get_function_sibling_type (and maybe later, + * allied lookup functions). + */ +typedef struct ExtensionSiblingCache +{ + struct ExtensionSiblingCache *next; /* list link */ + /* lookup key: requesting function's OID and type name */ + Oid reqfuncoid; + const char *typname; + bool valid; /* is entry currently valid? */ + uint32 exthash; /* cache hash of owning extension's OID */ + Oid typeoid; /* OID associated with typname */ +} ExtensionSiblingCache; + +/* Head of linked list of ExtensionSiblingCache structs */ +static ExtensionSiblingCache *ext_sibling_list = NULL; + /* Local functions */ +static void ext_sibling_callback(Datum arg, int cacheid, uint32 hashvalue); static List *find_update_path(List *evi_list, ExtensionVersionInfo *evi_start, ExtensionVersionInfo *evi_target, @@ -263,6 +284,114 @@ get_extension_schema(Oid ext_oid) return result; } +/* + * get_function_sibling_type - find a type belonging to same extension as func + * + * Returns the type's OID, or InvalidOid if not found. + * + * This is useful in extensions, which won't have fixed object OIDs. + * We work from the calling function's own OID, which it can get from its + * FunctionCallInfo parameter, and look up the owning extension and thence + * a type belonging to the same extension. + * + * Notice that the type is specified by name only, without a schema. + * That's because this will typically be used by relocatable extensions + * which can't make a-priori assumptions about which schema their objects + * are in. 
As long as the extension only defines one type of this name, + * the answer is unique anyway. + * + * We might later add the ability to look up functions, operators, etc. + * + * This code is simply a frontend for some pg_depend lookups. Those lookups + * are fairly expensive, so we provide a simple cache facility. We assume + * that the passed typname is actually a C constant, or at least permanently + * allocated, so that we need not copy that string. + */ +Oid +get_function_sibling_type(Oid funcoid, const char *typname) +{ + ExtensionSiblingCache *cache_entry; + Oid extoid; + Oid typeoid; + + /* + * See if we have the answer cached. Someday there may be enough callers + * to justify a hash table, but for now, a simple linked list is fine. + */ + for (cache_entry = ext_sibling_list; cache_entry != NULL; + cache_entry = cache_entry->next) + { + if (funcoid == cache_entry->reqfuncoid && + strcmp(typname, cache_entry->typname) == 0) + break; + } + if (cache_entry && cache_entry->valid) + return cache_entry->typeoid; + + /* + * Nope, so do the expensive lookups. We do not expect failures, so we do + * not cache negative results. + */ + extoid = getExtensionOfObject(ProcedureRelationId, funcoid); + if (!OidIsValid(extoid)) + return InvalidOid; + typeoid = getExtensionType(extoid, typname); + if (!OidIsValid(typeoid)) + return InvalidOid; + + /* + * Build, or revalidate, cache entry. 
+ */ + if (cache_entry == NULL) + { + /* Register invalidation hook if this is first entry */ + if (ext_sibling_list == NULL) + CacheRegisterSyscacheCallback(EXTENSIONOID, + ext_sibling_callback, + (Datum) 0); + + /* Momentarily zero the space to ensure valid flag is false */ + cache_entry = (ExtensionSiblingCache *) + MemoryContextAllocZero(CacheMemoryContext, + sizeof(ExtensionSiblingCache)); + cache_entry->next = ext_sibling_list; + ext_sibling_list = cache_entry; + } + + cache_entry->reqfuncoid = funcoid; + cache_entry->typname = typname; + cache_entry->exthash = GetSysCacheHashValue1(EXTENSIONOID, + ObjectIdGetDatum(extoid)); + cache_entry->typeoid = typeoid; + /* Mark it valid only once it's fully populated */ + cache_entry->valid = true; + + return typeoid; +} + +/* + * ext_sibling_callback + * Syscache inval callback function for EXTENSIONOID cache + * + * It seems sufficient to invalidate ExtensionSiblingCache entries when + * the owning extension's pg_extension entry is modified or deleted. + * Neither a requesting function's OID, nor the OID of the object it's + * looking for, could change without an extension update or drop/recreate. 
+ */ +static void +ext_sibling_callback(Datum arg, int cacheid, uint32 hashvalue) +{ + ExtensionSiblingCache *cache_entry; + + for (cache_entry = ext_sibling_list; cache_entry != NULL; + cache_entry = cache_entry->next) + { + if (hashvalue == 0 || + cache_entry->exthash == hashvalue) + cache_entry->valid = false; + } +} + /* * Utility functions to check validity of extension and version names */ @@ -1191,7 +1320,7 @@ execute_extension_script(Oid extensionOid, ExtensionControlFile *control, (void) set_config_option("client_min_messages", "warning", PGC_USERSET, PGC_S_SESSION, GUC_ACTION_SAVE, true, 0, false); - if (log_min_messages < WARNING) + if (log_min_messages[MyBackendType] < WARNING) (void) set_config_option_ext("log_min_messages", "warning", PGC_SUSET, PGC_S_SESSION, BOOTSTRAP_SUPERUSERID, @@ -2557,9 +2686,9 @@ extension_file_exists(const char *extensionName) locations = get_extension_control_directories(); - foreach_ptr(char, location, locations) + foreach_ptr(ExtensionLocation, location, locations) { - dir = AllocateDir(location); + dir = AllocateDir(location->loc); /* * If the control directory doesn't exist, we want to silently return @@ -2571,7 +2700,7 @@ extension_file_exists(const char *extensionName) } else { - while ((de = ReadDir(dir, location)) != NULL) + while ((de = ReadDir(dir, location->loc)) != NULL) { char *extname; diff --git a/src/backend/commands/operatorcmds.c b/src/backend/commands/operatorcmds.c index 9f7e0ed17ce..3e7b09b3494 100644 --- a/src/backend/commands/operatorcmds.c +++ b/src/backend/commands/operatorcmds.c @@ -276,7 +276,6 @@ ValidateRestrictionEstimator(List *restrictionName) { Oid typeId[4]; Oid restrictionOid; - AclResult aclresult; typeId[0] = INTERNALOID; /* PlannerInfo */ typeId[1] = OIDOID; /* operator OID */ @@ -292,11 +291,33 @@ ValidateRestrictionEstimator(List *restrictionName) errmsg("restriction estimator function %s must return type %s", NameListToString(restrictionName), "float8"))); - /* Require EXECUTE rights 
for the estimator */ - aclresult = object_aclcheck(ProcedureRelationId, restrictionOid, GetUserId(), ACL_EXECUTE); - if (aclresult != ACLCHECK_OK) - aclcheck_error(aclresult, OBJECT_FUNCTION, - NameListToString(restrictionName)); + /* + * If the estimator is not a built-in function, require superuser + * privilege to install it. This protects against using something that is + * not a restriction estimator or has hard-wired assumptions about what + * data types it is working with. (Built-in estimators are required to + * defend themselves adequately against unexpected data type choices, but + * it seems impractical to expect that of extensions' estimators.) + * + * If it is built-in, only require EXECUTE rights. + */ + if (restrictionOid >= FirstGenbkiObjectId) + { + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to specify a non-built-in restriction estimator function"))); + } + else + { + AclResult aclresult; + + aclresult = object_aclcheck(ProcedureRelationId, restrictionOid, + GetUserId(), ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, + NameListToString(restrictionName)); + } return restrictionOid; } @@ -312,7 +333,6 @@ ValidateJoinEstimator(List *joinName) Oid typeId[5]; Oid joinOid; Oid joinOid2; - AclResult aclresult; typeId[0] = INTERNALOID; /* PlannerInfo */ typeId[1] = OIDOID; /* operator OID */ @@ -350,11 +370,24 @@ ValidateJoinEstimator(List *joinName) errmsg("join estimator function %s must return type %s", NameListToString(joinName), "float8"))); - /* Require EXECUTE rights for the estimator */ - aclresult = object_aclcheck(ProcedureRelationId, joinOid, GetUserId(), ACL_EXECUTE); - if (aclresult != ACLCHECK_OK) - aclcheck_error(aclresult, OBJECT_FUNCTION, - NameListToString(joinName)); + /* privilege checks are the same as in ValidateRestrictionEstimator */ + if (joinOid >= FirstGenbkiObjectId) + { + if (!superuser()) + ereport(ERROR, + 
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to specify a non-built-in join estimator function"))); + } + else + { + AclResult aclresult; + + aclresult = object_aclcheck(ProcedureRelationId, joinOid, + GetUserId(), ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, + NameListToString(joinName)); + } return joinOid; } diff --git a/src/backend/commands/seclabel.c b/src/backend/commands/seclabel.c index 4160f5b6855..5b80396723c 100644 --- a/src/backend/commands/seclabel.c +++ b/src/backend/commands/seclabel.c @@ -118,6 +118,7 @@ ExecSecLabelStmt(SecLabelStmt *stmt) ObjectAddress address; Relation relation; ListCell *lc; + bool missing_ok; /* * Find the named label provider, or if none specified, check whether @@ -159,6 +160,14 @@ ExecSecLabelStmt(SecLabelStmt *stmt) (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("security labels are not supported for this type of object"))); + /* + * During binary upgrade, allow nonexistent large objects so that we don't + * have to create them during schema restoration. pg_upgrade will + * transfer the contents of pg_largeobject_metadata via COPY or by + * copying/linking its files from the old cluster later on. + */ + missing_ok = IsBinaryUpgrade && stmt->objtype == OBJECT_LARGEOBJECT; + /* * Translate the parser representation which identifies this object into * an ObjectAddress. get_object_address() will throw an error if the @@ -166,7 +175,8 @@ ExecSecLabelStmt(SecLabelStmt *stmt) * guard against concurrent modifications. */ address = get_object_address(stmt->objtype, stmt->object, - &relation, ShareUpdateExclusiveLock, false); + &relation, ShareUpdateExclusiveLock, + missing_ok); /* Require ownership of the target object. 
*/ check_object_ownership(GetUserId(), stmt->objtype, address, diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c index 0b064891932..3511a4ec0fd 100644 --- a/src/backend/commands/tablespace.c +++ b/src/backend/commands/tablespace.c @@ -70,6 +70,7 @@ #include "miscadmin.h" #include "postmaster/bgwriter.h" #include "storage/fd.h" +#include "storage/procsignal.h" #include "storage/standby.h" #include "utils/acl.h" #include "utils/builtins.h" diff --git a/src/backend/executor/execExprInterp.c b/src/backend/executor/execExprInterp.c index a7a5ac1e83b..61ff5ddc74c 100644 --- a/src/backend/executor/execExprInterp.c +++ b/src/backend/executor/execExprInterp.c @@ -4032,6 +4032,7 @@ ExecEvalScalarArrayOp(ExprState *state, ExprEvalStep *op) int16 typlen; bool typbyval; char typalign; + uint8 typalignby; char *s; bits8 *bitmap; int bitmask; @@ -4086,6 +4087,7 @@ ExecEvalScalarArrayOp(ExprState *state, ExprEvalStep *op) typlen = op->d.scalararrayop.typlen; typbyval = op->d.scalararrayop.typbyval; typalign = op->d.scalararrayop.typalign; + typalignby = typalign_to_alignby(typalign); /* Initialize result appropriately depending on useOr */ result = BoolGetDatum(!useOr); @@ -4111,7 +4113,7 @@ ExecEvalScalarArrayOp(ExprState *state, ExprEvalStep *op) { elt = fetch_att(s, typbyval, typlen); s = att_addlength_pointer(s, typlen, s); - s = (char *) att_align_nominal(s, typalign); + s = (char *) att_nominal_alignby(s, typalignby); fcinfo->args[1].value = elt; fcinfo->args[1].isnull = false; } @@ -4255,6 +4257,7 @@ ExecEvalHashedScalarArrayOp(ExprState *state, ExprEvalStep *op, ExprContext *eco int16 typlen; bool typbyval; char typalign; + uint8 typalignby; int nitems; bool has_nulls = false; char *s; @@ -4272,6 +4275,7 @@ ExecEvalHashedScalarArrayOp(ExprState *state, ExprEvalStep *op, ExprContext *eco &typlen, &typbyval, &typalign); + typalignby = typalign_to_alignby(typalign); oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_query_memory); @@ 
-4318,7 +4322,7 @@ ExecEvalHashedScalarArrayOp(ExprState *state, ExprEvalStep *op, ExprContext *eco element = fetch_att(s, typbyval, typlen); s = att_addlength_pointer(s, typlen, s); - s = (char *) att_align_nominal(s, typalign); + s = (char *) att_nominal_alignby(s, typalignby); saophash_insert(elements_tab->hashtab, element, &hashfound); } diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index 772e81f3154..f87978c137e 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -106,7 +106,7 @@ struct SharedExecutorInstrumentation /* array of num_plan_nodes * num_workers Instrumentation objects follows */ }; #define GetInstrumentationArray(sei) \ - (AssertVariableIsOfTypeMacro(sei, SharedExecutorInstrumentation *), \ + (StaticAssertVariableIsOfTypeMacro(sei, SharedExecutorInstrumentation *), \ (Instrumentation *) (((char *) sei) + sei->instrument_offset)) /* Context object for ExecParallelEstimate. */ diff --git a/src/backend/jit/llvm/llvmjit_types.c b/src/backend/jit/llvm/llvmjit_types.c index 4636b90cd0f..c8a1f841293 100644 --- a/src/backend/jit/llvm/llvmjit_types.c +++ b/src/backend/jit/llvm/llvmjit_types.c @@ -81,7 +81,7 @@ extern Datum AttributeTemplate(PG_FUNCTION_ARGS); Datum AttributeTemplate(PG_FUNCTION_ARGS) { - AssertVariableIsOfType(&AttributeTemplate, PGFunction); + StaticAssertVariableIsOfType(&AttributeTemplate, PGFunction); PG_RETURN_NULL(); } @@ -99,8 +99,8 @@ ExecEvalSubroutineTemplate(ExprState *state, struct ExprEvalStep *op, ExprContext *econtext) { - AssertVariableIsOfType(&ExecEvalSubroutineTemplate, - ExecEvalSubroutine); + StaticAssertVariableIsOfType(&ExecEvalSubroutineTemplate, + ExecEvalSubroutine); } extern bool ExecEvalBoolSubroutineTemplate(ExprState *state, @@ -111,8 +111,8 @@ ExecEvalBoolSubroutineTemplate(ExprState *state, struct ExprEvalStep *op, ExprContext *econtext) { - AssertVariableIsOfType(&ExecEvalBoolSubroutineTemplate, - ExecEvalBoolSubroutine); + 
StaticAssertVariableIsOfType(&ExecEvalBoolSubroutineTemplate, + ExecEvalBoolSubroutine); return false; } diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index b4581e54d93..90275e25872 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -128,8 +128,10 @@ static Path *get_cheapest_parameterized_child_path(PlannerInfo *root, Relids required_outer); static void accumulate_append_subpath(Path *path, List **subpaths, - List **special_subpaths); -static Path *get_singleton_append_subpath(Path *path); + List **special_subpaths, + List **child_append_relid_sets); +static Path *get_singleton_append_subpath(Path *path, + List **child_append_relid_sets); static void set_dummy_rel_pathlist(RelOptInfo *rel); static void set_subquery_pathlist(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTblEntry *rte); @@ -1404,22 +1406,21 @@ void add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, List *live_childrels) { - List *subpaths = NIL; - bool subpaths_valid = true; - List *startup_subpaths = NIL; - bool startup_subpaths_valid = true; - List *partial_subpaths = NIL; - List *pa_partial_subpaths = NIL; - List *pa_nonpartial_subpaths = NIL; - bool partial_subpaths_valid = true; - bool pa_subpaths_valid; + AppendPathInput unparameterized = {0}; + AppendPathInput startup = {0}; + AppendPathInput partial_only = {0}; + AppendPathInput parallel_append = {0}; + bool unparameterized_valid = true; + bool startup_valid = true; + bool partial_only_valid = true; + bool parallel_append_valid = true; List *all_child_pathkeys = NIL; List *all_child_outers = NIL; ListCell *l; double partial_rows = -1; /* If appropriate, consider parallel append */ - pa_subpaths_valid = enable_parallel_append && rel->consider_parallel; + parallel_append_valid = enable_parallel_append && rel->consider_parallel; /* * For every non-dummy child, remember the cheapest path. 
Also, identify @@ -1443,9 +1444,9 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, if (childrel->pathlist != NIL && childrel->cheapest_total_path->param_info == NULL) accumulate_append_subpath(childrel->cheapest_total_path, - &subpaths, NULL); + &unparameterized.subpaths, NULL, &unparameterized.child_append_relid_sets); else - subpaths_valid = false; + unparameterized_valid = false; /* * When the planner is considering cheap startup plans, we'll also @@ -1471,11 +1472,12 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, /* cheapest_startup_path must not be a parameterized path. */ Assert(cheapest_path->param_info == NULL); accumulate_append_subpath(cheapest_path, - &startup_subpaths, - NULL); + &startup.subpaths, + NULL, + &startup.child_append_relid_sets); } else - startup_subpaths_valid = false; + startup_valid = false; /* Same idea, but for a partial plan. */ @@ -1483,16 +1485,17 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, { cheapest_partial_path = linitial(childrel->partial_pathlist); accumulate_append_subpath(cheapest_partial_path, - &partial_subpaths, NULL); + &partial_only.partial_subpaths, NULL, + &partial_only.child_append_relid_sets); } else - partial_subpaths_valid = false; + partial_only_valid = false; /* * Same idea, but for a parallel append mixing partial and non-partial * paths. */ - if (pa_subpaths_valid) + if (parallel_append_valid) { Path *nppath = NULL; @@ -1502,7 +1505,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, if (cheapest_partial_path == NULL && nppath == NULL) { /* Neither a partial nor a parallel-safe path? Forget it. */ - pa_subpaths_valid = false; + parallel_append_valid = false; } else if (nppath == NULL || (cheapest_partial_path != NULL && @@ -1511,8 +1514,9 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, /* Partial path is cheaper or the only option. 
*/ Assert(cheapest_partial_path != NULL); accumulate_append_subpath(cheapest_partial_path, - &pa_partial_subpaths, - &pa_nonpartial_subpaths); + ¶llel_append.partial_subpaths, + ¶llel_append.subpaths, + ¶llel_append.child_append_relid_sets); } else { @@ -1530,8 +1534,9 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, * figure that out. */ accumulate_append_subpath(nppath, - &pa_nonpartial_subpaths, - NULL); + ¶llel_append.subpaths, + NULL, + ¶llel_append.child_append_relid_sets); } } @@ -1605,28 +1610,28 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, * unparameterized Append path for the rel. (Note: this is correct even * if we have zero or one live subpath due to constraint exclusion.) */ - if (subpaths_valid) - add_path(rel, (Path *) create_append_path(root, rel, subpaths, NIL, + if (unparameterized_valid) + add_path(rel, (Path *) create_append_path(root, rel, unparameterized, NIL, NULL, 0, false, -1)); /* build an AppendPath for the cheap startup paths, if valid */ - if (startup_subpaths_valid) - add_path(rel, (Path *) create_append_path(root, rel, startup_subpaths, - NIL, NIL, NULL, 0, false, -1)); + if (startup_valid) + add_path(rel, (Path *) create_append_path(root, rel, startup, + NIL, NULL, 0, false, -1)); /* * Consider an append of unordered, unparameterized partial paths. Make * it parallel-aware if possible. */ - if (partial_subpaths_valid && partial_subpaths != NIL) + if (partial_only_valid && partial_only.partial_subpaths != NIL) { AppendPath *appendpath; ListCell *lc; int parallel_workers = 0; /* Find the highest number of workers requested for any subpath. */ - foreach(lc, partial_subpaths) + foreach(lc, partial_only.partial_subpaths) { Path *path = lfirst(lc); @@ -1653,7 +1658,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, Assert(parallel_workers > 0); /* Generate a partial append path. 
*/ - appendpath = create_append_path(root, rel, NIL, partial_subpaths, + appendpath = create_append_path(root, rel, partial_only, NIL, NULL, parallel_workers, enable_parallel_append, -1); @@ -1674,7 +1679,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, * a non-partial path that is substantially cheaper than any partial path; * otherwise, we should use the append path added in the previous step.) */ - if (pa_subpaths_valid && pa_nonpartial_subpaths != NIL) + if (parallel_append_valid && parallel_append.subpaths != NIL) { AppendPath *appendpath; ListCell *lc; @@ -1684,7 +1689,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, * Find the highest number of workers requested for any partial * subpath. */ - foreach(lc, pa_partial_subpaths) + foreach(lc, parallel_append.partial_subpaths) { Path *path = lfirst(lc); @@ -1702,8 +1707,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, max_parallel_workers_per_gather); Assert(parallel_workers > 0); - appendpath = create_append_path(root, rel, pa_nonpartial_subpaths, - pa_partial_subpaths, + appendpath = create_append_path(root, rel, parallel_append, NIL, NULL, parallel_workers, true, partial_rows); add_partial_path(rel, (Path *) appendpath); @@ -1713,7 +1717,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, * Also build unparameterized ordered append paths based on the collected * list of child pathkeys. 
*/ - if (subpaths_valid) + if (unparameterized_valid) generate_orderedappend_paths(root, rel, live_childrels, all_child_pathkeys); @@ -1734,10 +1738,10 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, { Relids required_outer = (Relids) lfirst(l); ListCell *lcr; + AppendPathInput parameterized = {0}; + bool parameterized_valid = true; /* Select the child paths for an Append with this parameterization */ - subpaths = NIL; - subpaths_valid = true; foreach(lcr, live_childrels) { RelOptInfo *childrel = (RelOptInfo *) lfirst(lcr); @@ -1746,7 +1750,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, if (childrel->pathlist == NIL) { /* failed to make a suitable path for this child */ - subpaths_valid = false; + parameterized_valid = false; break; } @@ -1756,15 +1760,16 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, if (subpath == NULL) { /* failed to make a suitable path for this child */ - subpaths_valid = false; + parameterized_valid = false; break; } - accumulate_append_subpath(subpath, &subpaths, NULL); + accumulate_append_subpath(subpath, ¶meterized.subpaths, NULL, + ¶meterized.child_append_relid_sets); } - if (subpaths_valid) + if (parameterized_valid) add_path(rel, (Path *) - create_append_path(root, rel, subpaths, NIL, + create_append_path(root, rel, parameterized, NIL, required_outer, 0, false, -1)); } @@ -1785,13 +1790,14 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, { Path *path = (Path *) lfirst(l); AppendPath *appendpath; + AppendPathInput append = {0}; /* skip paths with no pathkeys. 
*/ if (path->pathkeys == NIL) continue; - appendpath = create_append_path(root, rel, NIL, list_make1(path), - NIL, NULL, + append.partial_subpaths = list_make1(path); + appendpath = create_append_path(root, rel, append, NIL, NULL, path->parallel_workers, true, partial_rows); add_partial_path(rel, (Path *) appendpath); @@ -1873,9 +1879,9 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel, foreach(lcp, all_child_pathkeys) { List *pathkeys = (List *) lfirst(lcp); - List *startup_subpaths = NIL; - List *total_subpaths = NIL; - List *fractional_subpaths = NIL; + AppendPathInput startup = {0}; + AppendPathInput total = {0}; + AppendPathInput fractional = {0}; bool startup_neq_total = false; bool fraction_neq_total = false; bool match_partition_order; @@ -2038,16 +2044,23 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel, * just a single subpath (and hence aren't doing anything * useful). */ - cheapest_startup = get_singleton_append_subpath(cheapest_startup); - cheapest_total = get_singleton_append_subpath(cheapest_total); + cheapest_startup = + get_singleton_append_subpath(cheapest_startup, + &startup.child_append_relid_sets); + cheapest_total = + get_singleton_append_subpath(cheapest_total, + &total.child_append_relid_sets); - startup_subpaths = lappend(startup_subpaths, cheapest_startup); - total_subpaths = lappend(total_subpaths, cheapest_total); + startup.subpaths = lappend(startup.subpaths, cheapest_startup); + total.subpaths = lappend(total.subpaths, cheapest_total); if (cheapest_fractional) { - cheapest_fractional = get_singleton_append_subpath(cheapest_fractional); - fractional_subpaths = lappend(fractional_subpaths, cheapest_fractional); + cheapest_fractional = + get_singleton_append_subpath(cheapest_fractional, + &fractional.child_append_relid_sets); + fractional.subpaths = + lappend(fractional.subpaths, cheapest_fractional); } } else @@ -2057,13 +2070,16 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel, * 
child paths for the MergeAppend. */ accumulate_append_subpath(cheapest_startup, - &startup_subpaths, NULL); + &startup.subpaths, NULL, + &startup.child_append_relid_sets); accumulate_append_subpath(cheapest_total, - &total_subpaths, NULL); + &total.subpaths, NULL, + &total.child_append_relid_sets); if (cheapest_fractional) accumulate_append_subpath(cheapest_fractional, - &fractional_subpaths, NULL); + &fractional.subpaths, NULL, + &fractional.child_append_relid_sets); } } @@ -2073,8 +2089,7 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel, /* We only need Append */ add_path(rel, (Path *) create_append_path(root, rel, - startup_subpaths, - NIL, + startup, pathkeys, NULL, 0, @@ -2083,19 +2098,17 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel, if (startup_neq_total) add_path(rel, (Path *) create_append_path(root, rel, - total_subpaths, - NIL, + total, pathkeys, NULL, 0, false, -1)); - if (fractional_subpaths && fraction_neq_total) + if (fractional.subpaths && fraction_neq_total) add_path(rel, (Path *) create_append_path(root, rel, - fractional_subpaths, - NIL, + fractional, pathkeys, NULL, 0, @@ -2107,20 +2120,23 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel, /* We need MergeAppend */ add_path(rel, (Path *) create_merge_append_path(root, rel, - startup_subpaths, + startup.subpaths, + startup.child_append_relid_sets, pathkeys, NULL)); if (startup_neq_total) add_path(rel, (Path *) create_merge_append_path(root, rel, - total_subpaths, + total.subpaths, + total.child_append_relid_sets, pathkeys, NULL)); - if (fractional_subpaths && fraction_neq_total) + if (fractional.subpaths && fraction_neq_total) add_path(rel, (Path *) create_merge_append_path(root, rel, - fractional_subpaths, + fractional.subpaths, + fractional.child_append_relid_sets, pathkeys, NULL)); } @@ -2223,7 +2239,8 @@ get_cheapest_parameterized_child_path(PlannerInfo *root, RelOptInfo *rel, * paths). 
*/ static void -accumulate_append_subpath(Path *path, List **subpaths, List **special_subpaths) +accumulate_append_subpath(Path *path, List **subpaths, List **special_subpaths, + List **child_append_relid_sets) { if (IsA(path, AppendPath)) { @@ -2232,6 +2249,11 @@ accumulate_append_subpath(Path *path, List **subpaths, List **special_subpaths) if (!apath->path.parallel_aware || apath->first_partial_path == 0) { *subpaths = list_concat(*subpaths, apath->subpaths); + *child_append_relid_sets = + lappend(*child_append_relid_sets, path->parent->relids); + *child_append_relid_sets = + list_concat(*child_append_relid_sets, + apath->child_append_relid_sets); return; } else if (special_subpaths != NULL) @@ -2246,6 +2268,11 @@ accumulate_append_subpath(Path *path, List **subpaths, List **special_subpaths) apath->first_partial_path); *special_subpaths = list_concat(*special_subpaths, new_special_subpaths); + *child_append_relid_sets = + lappend(*child_append_relid_sets, path->parent->relids); + *child_append_relid_sets = + list_concat(*child_append_relid_sets, + apath->child_append_relid_sets); return; } } @@ -2254,6 +2281,11 @@ accumulate_append_subpath(Path *path, List **subpaths, List **special_subpaths) MergeAppendPath *mpath = (MergeAppendPath *) path; *subpaths = list_concat(*subpaths, mpath->subpaths); + *child_append_relid_sets = + lappend(*child_append_relid_sets, path->parent->relids); + *child_append_relid_sets = + list_concat(*child_append_relid_sets, + mpath->child_append_relid_sets); return; } @@ -2265,10 +2297,15 @@ accumulate_append_subpath(Path *path, List **subpaths, List **special_subpaths) * Returns the single subpath of an Append/MergeAppend, or just * return 'path' if it's not a single sub-path Append/MergeAppend. * + * As a side effect, whenever we return a single subpath rather than the + * original path, add the relid sets for the original path to + * child_append_relid_sets, so that those relids don't entirely disappear + * from the final plan. 
+ * * Note: 'path' must not be a parallel-aware path. */ static Path * -get_singleton_append_subpath(Path *path) +get_singleton_append_subpath(Path *path, List **child_append_relid_sets) { Assert(!path->parallel_aware); @@ -2277,14 +2314,28 @@ get_singleton_append_subpath(Path *path) AppendPath *apath = (AppendPath *) path; if (list_length(apath->subpaths) == 1) + { + *child_append_relid_sets = + lappend(*child_append_relid_sets, path->parent->relids); + *child_append_relid_sets = + list_concat(*child_append_relid_sets, + apath->child_append_relid_sets); return (Path *) linitial(apath->subpaths); + } } else if (IsA(path, MergeAppendPath)) { MergeAppendPath *mpath = (MergeAppendPath *) path; if (list_length(mpath->subpaths) == 1) + { + *child_append_relid_sets = + lappend(*child_append_relid_sets, path->parent->relids); + *child_append_relid_sets = + list_concat(*child_append_relid_sets, + mpath->child_append_relid_sets); return (Path *) linitial(mpath->subpaths); + } } return path; @@ -2304,6 +2355,8 @@ get_singleton_append_subpath(Path *path) static void set_dummy_rel_pathlist(RelOptInfo *rel) { + AppendPathInput in = {0}; + /* Set dummy size estimates --- we leave attr_widths[] as zeroes */ rel->rows = 0; rel->reltarget->width = 0; @@ -2313,7 +2366,7 @@ set_dummy_rel_pathlist(RelOptInfo *rel) rel->partial_pathlist = NIL; /* Set up the dummy path */ - add_path(rel, (Path *) create_append_path(NULL, rel, NIL, NIL, + add_path(rel, (Path *) create_append_path(NULL, rel, in, NIL, rel->lateral_relids, 0, false, -1)); diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 4da0b17f137..89ca4e08bf1 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -1461,7 +1461,6 @@ cost_tidrangescan(Path *path, PlannerInfo *root, enable_mask |= PGS_CONSIDER_NONPARTIAL; path->disabled_nodes = (baserel->pgs_mask & enable_mask) != enable_mask ? 
1 : 0; - path->disabled_nodes = 0; path->startup_cost = startup_cost; path->total_cost = startup_cost + cpu_run_cost + disk_run_cost; } @@ -2590,11 +2589,6 @@ cost_material(Path *path, double nbytes = relation_byte_size(tuples, width); double work_mem_bytes = work_mem * (Size) 1024; - if (path->parallel_workers == 0 && - path->parent != NULL && - (path->parent->pgs_mask & PGS_CONSIDER_NONPARTIAL) == 0) - enabled = false; - path->rows = tuples; /* diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c index 1e4246b49d5..044560da7bf 100644 --- a/src/backend/optimizer/path/joinpath.c +++ b/src/backend/optimizer/path/joinpath.c @@ -1048,6 +1048,7 @@ try_partial_nestloop_path(PlannerInfo *root, initial_cost_nestloop(root, &workspace, jointype, nestloop_subtype, outer_path, inner_path, extra); if (!add_partial_path_precheck(joinrel, workspace.disabled_nodes, + workspace.startup_cost, workspace.total_cost, pathkeys)) return; @@ -1237,6 +1238,7 @@ try_partial_mergejoin_path(PlannerInfo *root, extra); if (!add_partial_path_precheck(joinrel, workspace.disabled_nodes, + workspace.startup_cost, workspace.total_cost, pathkeys)) return; @@ -1369,6 +1371,7 @@ try_partial_hashjoin_path(PlannerInfo *root, initial_cost_hashjoin(root, &workspace, jointype, hashclauses, outer_path, inner_path, extra, parallel_hash); if (!add_partial_path_precheck(joinrel, workspace.disabled_nodes, + workspace.startup_cost, workspace.total_cost, NIL)) return; @@ -1895,8 +1898,17 @@ match_unsorted_outer(PlannerInfo *root, /* * Consider materializing the cheapest inner path, unless that is * disabled or the path in question materializes its output anyway. + * + * At present, we only consider materialization for non-partial outer + * paths, so it's correct to test PGS_CONSIDER_NONPARTIAL here. 
If we + * ever want to consider materialization for partial paths, we'll need + * to create matpath whenever PGS_NESTLOOP_MATERIALIZE is set, use it + * for partial paths either way, and use it for non-partial paths only + * when PGS_CONSIDER_NONPARTIAL is also set. */ - if ((extra->pgs_mask & PGS_NESTLOOP_MATERIALIZE) != 0 && + if ((extra->pgs_mask & + (PGS_NESTLOOP_MATERIALIZE | PGS_CONSIDER_NONPARTIAL)) == + (PGS_NESTLOOP_MATERIALIZE | PGS_CONSIDER_NONPARTIAL) && inner_cheapest_total != NULL && !ExecMaterializesOutput(inner_cheapest_total->pathtype)) matpath = (Path *) diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c index 2615651c073..443e2dca7c0 100644 --- a/src/backend/optimizer/path/joinrels.c +++ b/src/backend/optimizer/path/joinrels.c @@ -1513,6 +1513,7 @@ void mark_dummy_rel(RelOptInfo *rel) { MemoryContext oldcontext; + AppendPathInput in = {0}; /* Already marked? */ if (is_dummy_rel(rel)) @@ -1529,7 +1530,7 @@ mark_dummy_rel(RelOptInfo *rel) rel->partial_pathlist = NIL; /* Set up the dummy path */ - add_path(rel, (Path *) create_append_path(NULL, rel, NIL, NIL, + add_path(rel, (Path *) create_append_path(NULL, rel, in, NIL, rel->lateral_relids, 0, false, -1)); diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index c26e841f537..959df43c39e 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -1263,6 +1263,7 @@ create_append_plan(PlannerInfo *root, AppendPath *best_path, int flags) plan->plan.lefttree = NULL; plan->plan.righttree = NULL; plan->apprelids = rel->relids; + plan->child_append_relid_sets = best_path->child_append_relid_sets; if (pathkeys != NIL) { @@ -1475,6 +1476,7 @@ create_merge_append_plan(PlannerInfo *root, MergeAppendPath *best_path, plan->lefttree = NULL; plan->righttree = NULL; node->apprelids = rel->relids; + node->child_append_relid_sets = best_path->child_append_relid_sets; /* * Compute sort column 
info, and adjust MergeAppend's tlist as needed. @@ -6527,7 +6529,6 @@ materialize_finished_plan(Plan *subplan) subplan->total_cost -= initplan_cost; /* Set cost data */ - matpath.parent = NULL; cost_material(&matpath, enable_material, subplan->disabled_nodes, diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 757bdc7b1de..006b3281969 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -511,7 +511,8 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions, /* Allow plugins to take control after we've initialized "glob" */ if (planner_setup_hook) - (*planner_setup_hook) (glob, parse, query_string, &tuple_fraction, es); + (*planner_setup_hook) (glob, parse, query_string, cursorOptions, + &tuple_fraction, es); /* primary planning entry point (may recurse for subqueries) */ root = subquery_planner(glob, parse, NULL, NULL, false, tuple_fraction, @@ -654,6 +655,7 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions, result->unprunableRelids = bms_difference(glob->allRelids, glob->prunableRelids); result->permInfos = glob->finalrteperminfos; + result->subrtinfos = glob->subrtinfos; result->resultRelations = glob->resultRelations; result->appendRelations = glob->appendRelations; result->subplans = glob->subplans; @@ -664,6 +666,7 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions, result->paramExecTypes = glob->paramExecTypes; /* utilityStmt should be null, but we might as well copy it */ result->utilityStmt = parse->utilityStmt; + result->elidedNodes = glob->elidedNodes; result->stmt_location = parse->stmt_location; result->stmt_len = parse->stmt_len; @@ -4060,7 +4063,7 @@ create_degenerate_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, * might get between 0 and N output rows. Offhand I think that's * desired.) 
*/ - List *paths = NIL; + AppendPathInput append = {0}; while (--nrows >= 0) { @@ -4068,13 +4071,12 @@ create_degenerate_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, create_group_result_path(root, grouped_rel, grouped_rel->reltarget, (List *) parse->havingQual); - paths = lappend(paths, path); + append.subpaths = lappend(append.subpaths, path); } path = (Path *) create_append_path(root, grouped_rel, - paths, - NIL, + append, NIL, NULL, 0, diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c index 16d200cfb46..5ad6c13830b 100644 --- a/src/backend/optimizer/plan/setrefs.c +++ b/src/backend/optimizer/plan/setrefs.c @@ -211,6 +211,9 @@ static List *set_windowagg_runcondition_references(PlannerInfo *root, List *runcondition, Plan *plan); +static void record_elided_node(PlannerGlobal *glob, int plan_node_id, + NodeTag elided_type, Bitmapset *relids); + /***************************************************************************** * @@ -399,6 +402,26 @@ add_rtes_to_flat_rtable(PlannerInfo *root, bool recursing) Index rti; ListCell *lc; + /* + * Record enough information to make it possible for code that looks at + * the final range table to understand how it was constructed. (If + * finalrtable is still NIL, then this is the very topmost PlannerInfo, + * which will always have plan_name == NULL and rtoffset == 0; we omit the + * degenerate list entry.) + */ + if (root->glob->finalrtable != NIL) + { + SubPlanRTInfo *rtinfo = makeNode(SubPlanRTInfo); + + rtinfo->plan_name = root->plan_name; + rtinfo->rtoffset = list_length(root->glob->finalrtable); + + /* When recursing = true, it's an unplanned or dummy subquery. */ + rtinfo->dummy = recursing; + + root->glob->subrtinfos = lappend(root->glob->subrtinfos, rtinfo); + } + /* * Add the query's own RTEs to the flattened rangetable. 
* @@ -1440,10 +1463,17 @@ set_subqueryscan_references(PlannerInfo *root, if (trivial_subqueryscan(plan)) { + Index scanrelid; + /* * We can omit the SubqueryScan node and just pull up the subplan. */ result = clean_up_removed_plan_level((Plan *) plan, plan->subplan); + + /* Remember that we removed a SubqueryScan */ + scanrelid = plan->scan.scanrelid + rtoffset; + record_elided_node(root->glob, plan->subplan->plan_node_id, + T_SubqueryScan, bms_make_singleton(scanrelid)); } else { @@ -1871,7 +1901,17 @@ set_append_references(PlannerInfo *root, Plan *p = (Plan *) linitial(aplan->appendplans); if (p->parallel_aware == aplan->plan.parallel_aware) - return clean_up_removed_plan_level((Plan *) aplan, p); + { + Plan *result; + + result = clean_up_removed_plan_level((Plan *) aplan, p); + + /* Remember that we removed an Append */ + record_elided_node(root->glob, p->plan_node_id, T_Append, + offset_relid_set(aplan->apprelids, rtoffset)); + + return result; + } } /* @@ -1939,7 +1979,17 @@ set_mergeappend_references(PlannerInfo *root, Plan *p = (Plan *) linitial(mplan->mergeplans); if (p->parallel_aware == mplan->plan.parallel_aware) - return clean_up_removed_plan_level((Plan *) mplan, p); + { + Plan *result; + + result = clean_up_removed_plan_level((Plan *) mplan, p); + + /* Remember that we removed a MergeAppend */ + record_elided_node(root->glob, p->plan_node_id, T_MergeAppend, + offset_relid_set(mplan->apprelids, rtoffset)); + + return result; + } } /* @@ -3754,3 +3804,21 @@ extract_query_dependencies_walker(Node *node, PlannerInfo *context) return expression_tree_walker(node, extract_query_dependencies_walker, context); } + +/* + * Record some details about a node removed from the plan during setrefs + * processing, for the benefit of code trying to reconstruct planner decisions + * from examination of the final plan tree. 
+ */ +static void +record_elided_node(PlannerGlobal *glob, int plan_node_id, + NodeTag elided_type, Bitmapset *relids) +{ + ElidedNode *n = makeNode(ElidedNode); + + n->plan_node_id = plan_node_id; + n->elided_type = elided_type; + n->relids = relids; + + glob->elidedNodes = lappend(glob->elidedNodes, n); +} diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index 78c95c36dd5..f50c296e3d9 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -696,9 +696,9 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, ListCell *lc; ListCell *lc2; ListCell *lc3; - List *cheapest_pathlist = NIL; - List *ordered_pathlist = NIL; - List *partial_pathlist = NIL; + AppendPathInput cheapest = {0}; + AppendPathInput ordered = {0}; + AppendPathInput partial = {0}; bool partial_paths_valid = true; bool consider_parallel = true; List *rellist; @@ -783,7 +783,7 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, if (is_dummy_rel(rel)) continue; - cheapest_pathlist = lappend(cheapest_pathlist, + cheapest.subpaths = lappend(cheapest.subpaths, rel->cheapest_total_path); if (try_sorted) @@ -795,7 +795,7 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, false); if (ordered_path != NULL) - ordered_pathlist = lappend(ordered_pathlist, ordered_path); + ordered.subpaths = lappend(ordered.subpaths, ordered_path); else { /* @@ -818,20 +818,20 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, else if (rel->partial_pathlist == NIL) partial_paths_valid = false; else - partial_pathlist = lappend(partial_pathlist, - linitial(rel->partial_pathlist)); + partial.partial_subpaths = lappend(partial.partial_subpaths, + linitial(rel->partial_pathlist)); } } /* Build result relation. 
*/ result_rel = fetch_upper_rel(root, UPPERREL_SETOP, relids); result_rel->reltarget = create_setop_pathtarget(root, tlist, - cheapest_pathlist); + cheapest.subpaths); result_rel->consider_parallel = consider_parallel; result_rel->consider_startup = (root->tuple_fraction > 0); /* If all UNION children were dummy rels, make the resulting rel dummy */ - if (cheapest_pathlist == NIL) + if (cheapest.subpaths == NIL) { mark_dummy_rel(result_rel); @@ -842,8 +842,8 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, * Append the child results together using the cheapest paths from each * union child. */ - apath = (Path *) create_append_path(root, result_rel, cheapest_pathlist, - NIL, NIL, NULL, 0, false, -1); + apath = (Path *) create_append_path(root, result_rel, cheapest, + NIL, NULL, 0, false, -1); /* * Estimate number of groups. For now we just assume the output is unique @@ -862,7 +862,7 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, int parallel_workers = 0; /* Find the highest number of workers requested for any subpath. 
*/ - foreach(lc, partial_pathlist) + foreach(lc, partial.partial_subpaths) { Path *subpath = lfirst(lc); @@ -881,14 +881,14 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, if (enable_parallel_append) { parallel_workers = Max(parallel_workers, - pg_leftmost_one_pos32(list_length(partial_pathlist)) + 1); + pg_leftmost_one_pos32(list_length(partial.partial_subpaths)) + 1); parallel_workers = Min(parallel_workers, max_parallel_workers_per_gather); } Assert(parallel_workers > 0); papath = (Path *) - create_append_path(root, result_rel, NIL, partial_pathlist, + create_append_path(root, result_rel, partial, NIL, NULL, parallel_workers, enable_parallel_append, -1); gpath = (Path *) @@ -901,7 +901,7 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, double dNumGroups; bool can_sort = grouping_is_sortable(groupList); bool can_hash = grouping_is_hashable(groupList); - Path *first_path = linitial(cheapest_pathlist); + Path *first_path = linitial(cheapest.subpaths); /* * Estimate the number of UNION output rows. In the case when only a @@ -911,7 +911,7 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, * contain Vars with varno==0, which estimate_num_groups() wouldn't * like. 
*/ - if (list_length(cheapest_pathlist) == 1 && + if (list_length(cheapest.subpaths) == 1 && first_path->parent->reloptkind != RELOPT_UPPER_REL) { dNumGroups = estimate_num_groups(root, @@ -1017,7 +1017,8 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root, path = (Path *) create_merge_append_path(root, result_rel, - ordered_pathlist, + ordered.subpaths, + NIL, union_pathkeys, NULL); @@ -1216,6 +1217,9 @@ generate_nonunion_paths(SetOperationStmt *op, PlannerInfo *root, if (op->all) { Path *apath; + AppendPathInput append = {0}; + + append.subpaths = list_make1(lpath); /* * EXCEPT ALL: If the right-hand input is dummy then we can @@ -1224,8 +1228,9 @@ generate_nonunion_paths(SetOperationStmt *op, PlannerInfo *root, * between the set op targetlist and the targetlist of the * left input. The Append will be removed in setrefs.c. */ - apath = (Path *) create_append_path(root, result_rel, list_make1(lpath), - NIL, NIL, NULL, 0, false, -1); + apath = (Path *) create_append_path(root, result_rel, + append, NIL, NULL, 0, + false, -1); add_path(result_rel, apath); diff --git a/src/backend/optimizer/util/clauses.c b/src/backend/optimizer/util/clauses.c index 32204776c45..504a30d8836 100644 --- a/src/backend/optimizer/util/clauses.c +++ b/src/backend/optimizer/util/clauses.c @@ -2705,6 +2705,7 @@ eval_const_expressions_mutator(Node *node, bool has_null_input = false; bool all_null_input = true; bool has_nonconst_input = false; + bool has_nullable_nonconst = false; Expr *simple; DistinctExpr *newexpr; @@ -2721,7 +2722,8 @@ eval_const_expressions_mutator(Node *node, /* * We must do our own check for NULLs because DistinctExpr has * different results for NULL input than the underlying - * operator does. + * operator does. We also check if any non-constant input is + * potentially nullable. 
*/ foreach(arg, args) { @@ -2731,12 +2733,24 @@ eval_const_expressions_mutator(Node *node, all_null_input &= ((Const *) lfirst(arg))->constisnull; } else + { has_nonconst_input = true; + all_null_input = false; + + if (!has_nullable_nonconst && + !expr_is_nonnullable(context->root, + (Expr *) lfirst(arg), false)) + has_nullable_nonconst = true; + } } - /* all constants? then can optimize this out */ if (!has_nonconst_input) { + /* + * All inputs are constants. We can optimize this out + * completely. + */ + /* all nulls? then not distinct */ if (all_null_input) return makeBoolConst(false, false); @@ -2781,6 +2795,72 @@ eval_const_expressions_mutator(Node *node, return (Node *) csimple; } } + else if (!has_nullable_nonconst) + { + /* + * There are non-constant inputs, but since all of them + * are proven non-nullable, "IS DISTINCT FROM" semantics + * are much simpler. + */ + + OpExpr *eqexpr; + + /* + * If one input is an explicit NULL constant, and the + * other is a non-nullable expression, the result is + * always TRUE. + */ + if (has_null_input) + return makeBoolConst(true, false); + + /* + * Otherwise, both inputs are known non-nullable. In this + * case, "IS DISTINCT FROM" is equivalent to the standard + * inequality operator (usually "<>"). We convert this to + * an OpExpr, which is a more efficient representation for + * the planner. It can enable the use of partial indexes + * and constraint exclusion. Furthermore, if the clause + * is negated (ie, "IS NOT DISTINCT FROM"), the resulting + * "=" operator can allow the planner to use index scans, + * merge joins, hash joins, and EC-based qual deductions. 
+ */ + eqexpr = makeNode(OpExpr); + eqexpr->opno = expr->opno; + eqexpr->opfuncid = expr->opfuncid; + eqexpr->opresulttype = BOOLOID; + eqexpr->opretset = expr->opretset; + eqexpr->opcollid = expr->opcollid; + eqexpr->inputcollid = expr->inputcollid; + eqexpr->args = args; + eqexpr->location = expr->location; + + return eval_const_expressions_mutator(negate_clause((Node *) eqexpr), + context); + } + else if (has_null_input) + { + /* + * One input is a nullable non-constant expression, and + * the other is an explicit NULL constant. We can + * transform this to a NullTest with !argisrow, which is + * much more amenable to optimization. + */ + + NullTest *nt = makeNode(NullTest); + + nt->arg = (Expr *) (IsA(linitial(args), Const) ? + lsecond(args) : linitial(args)); + nt->nulltesttype = IS_NOT_NULL; + + /* + * argisrow = false is correct whether or not arg is + * composite + */ + nt->argisrow = false; + nt->location = expr->location; + + return eval_const_expressions_mutator((Node *) nt, context); + } /* * The expression cannot be simplified any further, so build @@ -3630,6 +3710,9 @@ eval_const_expressions_mutator(Node *node, context); if (arg && IsA(arg, Const)) { + /* + * If arg is Const, simplify to constant. + */ Const *carg = (Const *) arg; bool result; @@ -3666,6 +3749,34 @@ eval_const_expressions_mutator(Node *node, return makeBoolConst(result, false); } + if (arg && expr_is_nonnullable(context->root, (Expr *) arg, false)) + { + /* + * If arg is proven non-nullable, simplify to boolean + * expression or constant. 
+ */ + switch (btest->booltesttype) + { + case IS_TRUE: + case IS_NOT_FALSE: + return arg; + + case IS_FALSE: + case IS_NOT_TRUE: + return (Node *) make_notclause((Expr *) arg); + + case IS_UNKNOWN: + return makeBoolConst(false, false); + + case IS_NOT_UNKNOWN: + return makeBoolConst(true, false); + + default: + elog(ERROR, "unrecognized booltesttype: %d", + (int) btest->booltesttype); + break; + } + } newbtest = makeNode(BooleanTest); newbtest->arg = (Expr *) arg; diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 7295438ad20..d61f328707f 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -777,10 +777,9 @@ add_path_precheck(RelOptInfo *parent_rel, int disabled_nodes, * * Because we don't consider parameterized paths here, we also don't * need to consider the row counts as a measure of quality: every path will - * produce the same number of rows. Neither do we need to consider startup - * costs: parallelism is only used for plans that will be run to completion. - * Therefore, this routine is much simpler than add_path: it needs to - * consider only disabled nodes, pathkeys and total cost. + * produce the same number of rows. However, we do need to consider the + * startup costs: this partial path could be used beneath a Limit node, + * so a fast-start plan could be correct. * * As with add_path, we pfree paths that are found to be dominated by * another partial path; this requires that there be no other references to @@ -818,52 +817,36 @@ add_partial_path(RelOptInfo *parent_rel, Path *new_path) /* Compare pathkeys. */ keyscmp = compare_pathkeys(new_path->pathkeys, old_path->pathkeys); - /* Unless pathkeys are incompatible, keep just one of the two paths. */ + /* + * Unless pathkeys are incompatible, see if one of the paths dominates + * the other (both in startup and total cost). It may happen that one + * path has lower startup cost, the other has lower total cost. 
+ */ if (keyscmp != PATHKEYS_DIFFERENT) { - if (unlikely(new_path->disabled_nodes != old_path->disabled_nodes)) + PathCostComparison costcmp; + + /* + * Do a fuzzy cost comparison with standard fuzziness limit. + */ + costcmp = compare_path_costs_fuzzily(new_path, old_path, + STD_FUZZ_FACTOR); + if (costcmp == COSTS_BETTER1) { - if (new_path->disabled_nodes > old_path->disabled_nodes) - accept_new = false; - else + if (keyscmp == PATHKEYS_BETTER1) remove_old = true; } - else if (new_path->total_cost > old_path->total_cost - * STD_FUZZ_FACTOR) + else if (costcmp == COSTS_BETTER2) { - /* New path costs more; keep it only if pathkeys are better. */ - if (keyscmp != PATHKEYS_BETTER1) + if (keyscmp == PATHKEYS_BETTER2) accept_new = false; } - else if (old_path->total_cost > new_path->total_cost - * STD_FUZZ_FACTOR) + else if (costcmp == COSTS_EQUAL) { - /* Old path costs more; keep it only if pathkeys are better. */ - if (keyscmp != PATHKEYS_BETTER2) + if (keyscmp == PATHKEYS_BETTER1) remove_old = true; - } - else if (keyscmp == PATHKEYS_BETTER1) - { - /* Costs are about the same, new path has better pathkeys. */ - remove_old = true; - } - else if (keyscmp == PATHKEYS_BETTER2) - { - /* Costs are about the same, old path has better pathkeys. */ - accept_new = false; - } - else if (old_path->total_cost > new_path->total_cost * 1.0000000001) - { - /* Pathkeys are the same, and the old path costs more. */ - remove_old = true; - } - else - { - /* - * Pathkeys are the same, and new path isn't materially - * cheaper. 
- */ - accept_new = false; + else if (keyscmp == PATHKEYS_BETTER2) + accept_new = false; } } @@ -878,8 +861,13 @@ add_partial_path(RelOptInfo *parent_rel, Path *new_path) } else { - /* new belongs after this old path if it has cost >= old's */ - if (new_path->total_cost >= old_path->total_cost) + /* + * new belongs after this old path if it has more disabled nodes + * or if it has the same number of nodes but a greater total cost + */ + if (new_path->disabled_nodes > old_path->disabled_nodes || + (new_path->disabled_nodes == old_path->disabled_nodes && + new_path->total_cost >= old_path->total_cost)) insert_at = foreach_current_index(p1) + 1; } @@ -909,16 +897,16 @@ add_partial_path(RelOptInfo *parent_rel, Path *new_path) * add_partial_path_precheck * Check whether a proposed new partial path could possibly get accepted. * - * Unlike add_path_precheck, we can ignore startup cost and parameterization, - * since they don't matter for partial paths (see add_partial_path). But - * we do want to make sure we don't add a partial path if there's already - * a complete path that dominates it, since in that case the proposed path - * is surely a loser. + * Unlike add_path_precheck, we can ignore parameterization, since it doesn't + * matter for partial paths (see add_partial_path). But we do want to make + * sure we don't add a partial path if there's already a complete path that + * dominates it, since in that case the proposed path is surely a loser. */ bool add_partial_path_precheck(RelOptInfo *parent_rel, int disabled_nodes, - Cost total_cost, List *pathkeys) + Cost startup_cost, Cost total_cost, List *pathkeys) { + bool consider_startup = parent_rel->consider_startup; ListCell *p1; /* @@ -928,25 +916,80 @@ add_partial_path_precheck(RelOptInfo *parent_rel, int disabled_nodes, * is clearly superior to some existing partial path -- at least, modulo * final cost computations. If so, we definitely want to consider it. 
* - * Unlike add_path(), we always compare pathkeys here. This is because we - * expect partial_pathlist to be very short, and getting a definitive + * Unlike add_path(), we never try to exit this loop early. This is because + * we expect partial_pathlist to be very short, and getting a definitive * answer at this stage avoids the need to call add_path_precheck. */ foreach(p1, parent_rel->partial_pathlist) { Path *old_path = (Path *) lfirst(p1); + PathCostComparison costcmp; PathKeysComparison keyscmp; - keyscmp = compare_pathkeys(pathkeys, old_path->pathkeys); - if (keyscmp != PATHKEYS_DIFFERENT) + /* + * First, compare costs and disabled nodes. This logic should be + * identical to compare_path_costs_fuzzily, except that one of the + * paths hasn't been created yet, and the fuzz factor is always + * STD_FUZZ_FACTOR. + */ + if (unlikely(old_path->disabled_nodes != disabled_nodes)) { - if (total_cost > old_path->total_cost * STD_FUZZ_FACTOR && - keyscmp != PATHKEYS_BETTER1) - return false; - if (old_path->total_cost > total_cost * STD_FUZZ_FACTOR && - keyscmp != PATHKEYS_BETTER2) - return true; + if (disabled_nodes < old_path->disabled_nodes) + costcmp = COSTS_BETTER1; + else + costcmp = COSTS_BETTER2; + } + else if (total_cost > old_path->total_cost * STD_FUZZ_FACTOR) + { + if (consider_startup && + old_path->startup_cost > startup_cost * STD_FUZZ_FACTOR) + costcmp = COSTS_DIFFERENT; + else + costcmp = COSTS_BETTER2; } + else if (old_path->total_cost > total_cost * STD_FUZZ_FACTOR) + { + if (consider_startup && + startup_cost > old_path->startup_cost * STD_FUZZ_FACTOR) + costcmp = COSTS_DIFFERENT; + else + costcmp = COSTS_BETTER1; + } + else if (startup_cost > old_path->startup_cost * STD_FUZZ_FACTOR) + costcmp = COSTS_BETTER2; + else if (old_path->startup_cost > startup_cost * STD_FUZZ_FACTOR) + costcmp = COSTS_BETTER1; + else + costcmp = COSTS_EQUAL; + + /* + * If one path wins on startup cost and the other on total cost, we + * can't say for sure which is 
better. + */ + if (costcmp == COSTS_DIFFERENT) + continue; + + /* + * If the two paths have different pathkeys, we can't say for sure + * which is better. + */ + keyscmp = compare_pathkeys(pathkeys, old_path->pathkeys); + if (keyscmp == PATHKEYS_DIFFERENT) + continue; + + /* + * If the existing path is cheaper and the pathkeys are equal or worse, + * the new path is not interesting. + */ + if (costcmp == COSTS_BETTER2 && keyscmp != PATHKEYS_BETTER1) + return false; + + /* + * If the new path is cheaper and the pathkeys are equal or better, + * it is definitely interesting. + */ + if (costcmp == COSTS_BETTER1 && keyscmp != PATHKEYS_BETTER2) + return true; } /* @@ -954,14 +997,9 @@ add_partial_path_precheck(RelOptInfo *parent_rel, int disabled_nodes, * clearly good enough that it might replace one. Compare it to * non-parallel plans. If it loses even before accounting for the cost of * the Gather node, we should definitely reject it. - * - * Note that we pass the total_cost to add_path_precheck twice. This is - * because it's never advantageous to consider the startup cost of a - * partial path; the resulting plans, if run in parallel, will be run to - * completion. */ - if (!add_path_precheck(parent_rel, disabled_nodes, total_cost, total_cost, - pathkeys, NULL)) + if (!add_path_precheck(parent_rel, disabled_nodes, startup_cost, + total_cost, pathkeys, NULL)) return false; return true; @@ -1077,6 +1115,14 @@ create_index_path(PlannerInfo *root, cost_index(pathnode, root, loop_count, partial_path); + /* + * cost_index will set disabled_nodes to 1 if this rel is not allowed to + * use index scans in general, but it doesn't have the IndexOptInfo to + * know whether this specific index has been disabled. 
+ */ + if (index->disabled) + pathnode->path.disabled_nodes = 1; + return pathnode; } @@ -1298,7 +1344,7 @@ create_tidrangescan_path(PlannerInfo *root, RelOptInfo *rel, AppendPath * create_append_path(PlannerInfo *root, RelOptInfo *rel, - List *subpaths, List *partial_subpaths, + AppendPathInput input, List *pathkeys, Relids required_outer, int parallel_workers, bool parallel_aware, double rows) @@ -1308,6 +1354,7 @@ create_append_path(PlannerInfo *root, Assert(!parallel_aware || parallel_workers > 0); + pathnode->child_append_relid_sets = input.child_append_relid_sets; pathnode->path.pathtype = T_Append; pathnode->path.parent = rel; pathnode->path.pathtarget = rel->reltarget; @@ -1323,7 +1370,7 @@ create_append_path(PlannerInfo *root, * on the simpler get_appendrel_parampathinfo. There's no point in doing * the more expensive thing for a dummy path, either. */ - if (rel->reloptkind == RELOPT_BASEREL && root && subpaths != NIL) + if (rel->reloptkind == RELOPT_BASEREL && root && input.subpaths != NIL) pathnode->path.param_info = get_baserel_parampathinfo(root, rel, required_outer); @@ -1354,11 +1401,11 @@ create_append_path(PlannerInfo *root, */ Assert(pathkeys == NIL); - list_sort(subpaths, append_total_cost_compare); - list_sort(partial_subpaths, append_startup_cost_compare); + list_sort(input.subpaths, append_total_cost_compare); + list_sort(input.partial_subpaths, append_startup_cost_compare); } - pathnode->first_partial_path = list_length(subpaths); - pathnode->subpaths = list_concat(subpaths, partial_subpaths); + pathnode->first_partial_path = list_length(input.subpaths); + pathnode->subpaths = list_concat(input.subpaths, input.partial_subpaths); /* * Apply query-wide LIMIT if known and path is for sole base relation. 
@@ -1470,6 +1517,7 @@ MergeAppendPath * create_merge_append_path(PlannerInfo *root, RelOptInfo *rel, List *subpaths, + List *child_append_relid_sets, List *pathkeys, Relids required_outer) { @@ -1485,6 +1533,7 @@ create_merge_append_path(PlannerInfo *root, */ Assert(bms_is_empty(rel->lateral_relids) && bms_is_empty(required_outer)); + pathnode->child_append_relid_sets = child_append_relid_sets; pathnode->path.pathtype = T_MergeAppend; pathnode->path.parent = rel; pathnode->path.pathtarget = rel->reltarget; @@ -3932,11 +3981,12 @@ reparameterize_path(PlannerInfo *root, Path *path, case T_Append: { AppendPath *apath = (AppendPath *) path; - List *childpaths = NIL; - List *partialpaths = NIL; + AppendPathInput new_append = {0}; int i; ListCell *lc; + new_append.child_append_relid_sets = apath->child_append_relid_sets; + /* Reparameterize the children */ i = 0; foreach(lc, apath->subpaths) @@ -3950,13 +4000,13 @@ reparameterize_path(PlannerInfo *root, Path *path, return NULL; /* We have to re-split the regular and partial paths */ if (i < apath->first_partial_path) - childpaths = lappend(childpaths, spath); + new_append.subpaths = lappend(new_append.subpaths, spath); else - partialpaths = lappend(partialpaths, spath); + new_append.partial_subpaths = lappend(new_append.partial_subpaths, spath); i++; } return (Path *) - create_append_path(root, rel, childpaths, partialpaths, + create_append_path(root, rel, new_append, apath->path.pathkeys, required_outer, apath->path.parallel_workers, apath->path.parallel_aware, @@ -3971,10 +4021,10 @@ reparameterize_path(PlannerInfo *root, Path *path, spath = reparameterize_path(root, spath, required_outer, loop_count); - enabled = - (mpath->path.disabled_nodes <= spath->disabled_nodes); if (spath == NULL) return NULL; + enabled = + (mpath->path.disabled_nodes <= spath->disabled_nodes); return (Path *) create_material_path(rel, spath, enabled); } case T_Memoize: diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c 
index 3cd3544fa2b..2e3886cf9fe 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -855,7 +855,7 @@ PGSharedMemoryCreate(Size size, * Initialize space allocation status for segment. */ hdr->totalsize = size; - hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader)); + hdr->content_offset = MAXALIGN(sizeof(PGShmemHeader)); *shim = hdr; /* Save info for possible future use */ diff --git a/src/backend/port/win32_shmem.c b/src/backend/port/win32_shmem.c index 7cb8b4c9b60..794e4fcb2ad 100644 --- a/src/backend/port/win32_shmem.c +++ b/src/backend/port/win32_shmem.c @@ -389,7 +389,7 @@ PGSharedMemoryCreate(Size size, * Initialize space allocation status for segment. */ hdr->totalsize = size; - hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader)); + hdr->content_offset = MAXALIGN(sizeof(PGShmemHeader)); hdr->dsm_control = 0; /* Save info for possible future use */ diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 22379de1e31..6fde740465f 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -385,7 +385,6 @@ AutoVacLauncherMain(const void *startup_data, size_t startup_data_len) PostmasterContext = NULL; } - MyBackendType = B_AUTOVAC_LAUNCHER; init_ps_display(NULL); ereport(DEBUG1, @@ -1398,7 +1397,6 @@ AutoVacWorkerMain(const void *startup_data, size_t startup_data_len) PostmasterContext = NULL; } - MyBackendType = B_AUTOVAC_WORKER; init_ps_display(NULL); Assert(GetProcessingMode() == InitProcessing); diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index 65deabe91a7..261ccd3f59c 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -120,22 +120,28 @@ static const struct { { - "ParallelWorkerMain", ParallelWorkerMain + .fn_name = "ParallelWorkerMain", + .fn_addr = ParallelWorkerMain }, { - "ApplyLauncherMain", ApplyLauncherMain + .fn_name = "ApplyLauncherMain", + .fn_addr = ApplyLauncherMain }, { - 
"ApplyWorkerMain", ApplyWorkerMain + .fn_name = "ApplyWorkerMain", + .fn_addr = ApplyWorkerMain }, { - "ParallelApplyWorkerMain", ParallelApplyWorkerMain + .fn_name = "ParallelApplyWorkerMain", + .fn_addr = ParallelApplyWorkerMain }, { - "TableSyncWorkerMain", TableSyncWorkerMain + .fn_name = "TableSyncWorkerMain", + .fn_addr = TableSyncWorkerMain }, { - "SequenceSyncWorkerMain", SequenceSyncWorkerMain + .fn_name = "SequenceSyncWorkerMain", + .fn_addr = SequenceSyncWorkerMain } }; @@ -753,7 +759,6 @@ BackgroundWorkerMain(const void *startup_data, size_t startup_data_len) } MyBgworkerEntry = worker; - MyBackendType = B_BG_WORKER; init_ps_display(worker->bgw_name); Assert(GetProcessingMode() == InitProcessing); diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index 80e3088fc7e..0956bd39a85 100644 --- a/src/backend/postmaster/bgwriter.c +++ b/src/backend/postmaster/bgwriter.c @@ -94,7 +94,6 @@ BackgroundWriterMain(const void *startup_data, size_t startup_data_len) Assert(startup_data_len == 0); - MyBackendType = B_BG_WRITER; AuxiliaryProcessMainCommon(); /* diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index 6482c21b8f9..e03c19123bc 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -199,7 +199,6 @@ CheckpointerMain(const void *startup_data, size_t startup_data_len) Assert(startup_data_len == 0); - MyBackendType = B_CHECKPOINTER; AuxiliaryProcessMainCommon(); CheckpointerShmem->checkpointer_pid = MyProcPid; diff --git a/src/backend/postmaster/launch_backend.c b/src/backend/postmaster/launch_backend.c index cea229ad6a4..05b1feef3cf 100644 --- a/src/backend/postmaster/launch_backend.c +++ b/src/backend/postmaster/launch_backend.c @@ -96,7 +96,6 @@ typedef struct HANDLE UsedShmemSegID; #endif void *UsedShmemSegAddr; - slock_t *ShmemLock; #ifdef USE_INJECTION_POINTS struct InjectionPointsCtl *ActiveInjectionPoints; #endif @@ -179,7 +178,7 @@ 
typedef struct } child_process_kind; static child_process_kind child_process_kinds[] = { -#define PG_PROCTYPE(bktype, description, main_func, shmem_attach) \ +#define PG_PROCTYPE(bktype, bkcategory, description, main_func, shmem_attach) \ [bktype] = {description, main_func, shmem_attach}, #include "postmaster/proctypelist.h" #undef PG_PROCTYPE @@ -224,6 +223,8 @@ postmaster_child_launch(BackendType child_type, int child_slot, pid = fork_process(); if (pid == 0) /* child */ { + MyBackendType = child_type; + /* Capture and transfer timings that may be needed for logging */ if (IsExternalConnectionBackend(child_type)) { @@ -608,6 +609,7 @@ SubPostmasterMain(int argc, char *argv[]) child_type = (BackendType) atoi(child_kind); if (child_type <= B_INVALID || child_type > BACKEND_NUM_TYPES - 1) elog(ERROR, "unknown child kind %s", child_kind); + MyBackendType = child_type; /* Read in the variables file */ read_backend_variables(argv[2], &startup_data, &startup_data_len); @@ -676,7 +678,7 @@ SubPostmasterMain(int argc, char *argv[]) /* Restore basic shared memory pointers */ if (UsedShmemSegAddr != NULL) - InitShmemAccess(UsedShmemSegAddr); + InitShmemAllocator(UsedShmemSegAddr); /* * Run the appropriate Main function @@ -724,8 +726,6 @@ save_backend_variables(BackendParameters *param, param->UsedShmemSegID = UsedShmemSegID; param->UsedShmemSegAddr = UsedShmemSegAddr; - param->ShmemLock = ShmemLock; - #ifdef USE_INJECTION_POINTS param->ActiveInjectionPoints = ActiveInjectionPoints; #endif @@ -986,8 +986,6 @@ restore_backend_variables(BackendParameters *param) UsedShmemSegID = param->UsedShmemSegID; UsedShmemSegAddr = param->UsedShmemSegAddr; - ShmemLock = param->ShmemLock; - #ifdef USE_INJECTION_POINTS ActiveInjectionPoints = param->ActiveInjectionPoints; #endif diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c index 1a20387c4bd..82731e452fc 100644 --- a/src/backend/postmaster/pgarch.c +++ b/src/backend/postmaster/pgarch.c @@ -222,7 +222,6 @@ 
PgArchiverMain(const void *startup_data, size_t startup_data_len) { Assert(startup_data_len == 0); - MyBackendType = B_ARCHIVER; AuxiliaryProcessMainCommon(); /* diff --git a/src/backend/postmaster/startup.c b/src/backend/postmaster/startup.c index a1a4f65f9a9..cdbe53dd262 100644 --- a/src/backend/postmaster/startup.c +++ b/src/backend/postmaster/startup.c @@ -217,7 +217,6 @@ StartupProcessMain(const void *startup_data, size_t startup_data_len) { Assert(startup_data_len == 0); - MyBackendType = B_STARTUP; AuxiliaryProcessMainCommon(); /* Arrange to clean up at startup process exit */ diff --git a/src/backend/postmaster/syslogger.c b/src/backend/postmaster/syslogger.c index 1c443b3d126..86c5e376b40 100644 --- a/src/backend/postmaster/syslogger.c +++ b/src/backend/postmaster/syslogger.c @@ -206,7 +206,6 @@ SysLoggerMain(const void *startup_data, size_t startup_data_len) now = MyStartTime; - MyBackendType = B_LOGGER; init_ps_display(NULL); /* diff --git a/src/backend/postmaster/walsummarizer.c b/src/backend/postmaster/walsummarizer.c index c3d56c866d3..2d8f57099fd 100644 --- a/src/backend/postmaster/walsummarizer.c +++ b/src/backend/postmaster/walsummarizer.c @@ -234,7 +234,6 @@ WalSummarizerMain(const void *startup_data, size_t startup_data_len) Assert(startup_data_len == 0); - MyBackendType = B_WAL_SUMMARIZER; AuxiliaryProcessMainCommon(); ereport(DEBUG1, diff --git a/src/backend/postmaster/walwriter.c b/src/backend/postmaster/walwriter.c index 38ec8a4c8c7..23e79a32345 100644 --- a/src/backend/postmaster/walwriter.c +++ b/src/backend/postmaster/walwriter.c @@ -94,7 +94,6 @@ WalWriterMain(const void *startup_data, size_t startup_data_len) Assert(startup_data_len == 0); - MyBackendType = B_WAL_WRITER; AuxiliaryProcessMainCommon(); /* diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c index 85060d19a49..603a2b94d05 100644 --- a/src/backend/replication/logical/logical.c +++ b/src/backend/replication/logical/logical.c @@ 
-1986,16 +1986,22 @@ UpdateDecodingStats(LogicalDecodingContext *ctx) } /* - * Read up to the end of WAL starting from the decoding slot's restart_lsn. - * Return true if any meaningful/decodable WAL records are encountered, - * otherwise false. + * Read up to the end of WAL starting from the decoding slot's restart_lsn + * to end_of_wal in order to check if any meaningful/decodable WAL records + * are encountered. scan_cutoff_lsn is the LSN, where we can terminate the + * WAL scan early if we find a decodable WAL record after this LSN. + * + * Returns the last LSN decodable WAL record's LSN if found, otherwise + * returns InvalidXLogRecPtr. */ -bool -LogicalReplicationSlotHasPendingWal(XLogRecPtr end_of_wal) +XLogRecPtr +LogicalReplicationSlotCheckPendingWal(XLogRecPtr end_of_wal, + XLogRecPtr scan_cutoff_lsn) { - bool has_pending_wal = false; + XLogRecPtr last_pending_wal = InvalidXLogRecPtr; Assert(MyReplicationSlot); + Assert(end_of_wal >= scan_cutoff_lsn); PG_TRY(); { @@ -2023,8 +2029,7 @@ LogicalReplicationSlotHasPendingWal(XLogRecPtr end_of_wal) /* Invalidate non-timetravel entries */ InvalidateSystemCaches(); - /* Loop until the end of WAL or some changes are processed */ - while (!has_pending_wal && ctx->reader->EndRecPtr < end_of_wal) + while (ctx->reader->EndRecPtr < end_of_wal) { XLogRecord *record; char *errm = NULL; @@ -2037,7 +2042,20 @@ LogicalReplicationSlotHasPendingWal(XLogRecPtr end_of_wal) if (record != NULL) LogicalDecodingProcessRecord(ctx, ctx->reader); - has_pending_wal = ctx->processing_required; + if (ctx->processing_required) + { + last_pending_wal = ctx->reader->ReadRecPtr; + + /* + * If we find a decodable WAL after the scan_cutoff_lsn point, + * we can terminate the scan early. 
+ */ + if (last_pending_wal >= scan_cutoff_lsn) + break; + + /* Reset the flag and continue checking */ + ctx->processing_required = false; + } CHECK_FOR_INTERRUPTS(); } @@ -2055,7 +2073,7 @@ LogicalReplicationSlotHasPendingWal(XLogRecPtr end_of_wal) } PG_END_TRY(); - return has_pending_wal; + return last_pending_wal; } /* diff --git a/src/backend/replication/logical/logicalctl.c b/src/backend/replication/logical/logicalctl.c index 9f787f3dc51..4e292951201 100644 --- a/src/backend/replication/logical/logicalctl.c +++ b/src/backend/replication/logical/logicalctl.c @@ -71,6 +71,7 @@ #include "storage/lmgr.h" #include "storage/proc.h" #include "storage/procarray.h" +#include "storage/procsignal.h" #include "utils/injection_point.h" /* diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index d84fa120b9f..2d2a6d5e9e7 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -5361,7 +5361,7 @@ DisplayMapping(HTAB *tuplecid_data) * transaction c) applied in LSN order. 
*/ static void -ApplyLogicalMappingFile(HTAB *tuplecid_data, Oid relid, const char *fname) +ApplyLogicalMappingFile(HTAB *tuplecid_data, const char *fname) { char path[MAXPGPATH]; int fd; @@ -5544,7 +5544,7 @@ UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, Snapshot snapshot) elog(DEBUG1, "applying mapping: \"%s\" in %u", f->fname, snapshot->subxip[0]); - ApplyLogicalMappingFile(tuplecid_data, relid, f->fname); + ApplyLogicalMappingFile(tuplecid_data, f->fname); pfree(f); } } diff --git a/src/backend/replication/logical/slotsync.c b/src/backend/replication/logical/slotsync.c index 1c343d03d21..d02d44d26a0 100644 --- a/src/backend/replication/logical/slotsync.c +++ b/src/backend/replication/logical/slotsync.c @@ -1541,8 +1541,6 @@ ReplSlotSyncWorkerMain(const void *startup_data, size_t startup_data_len) Assert(startup_data_len == 0); - MyBackendType = B_SLOTSYNC_WORKER; - init_ps_display(NULL); Assert(GetProcessingMode() == InitProcessing); @@ -1759,7 +1757,7 @@ update_synced_slots_inactive_since(void) Assert(SlotIsLogical(s)); /* The slot must not be acquired by any process */ - Assert(s->active_pid == 0); + Assert(s->active_proc == INVALID_PROC_NUMBER); /* Use the same inactive_since time for all the slots. */ if (now == 0) diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index 4c47261c7f9..28c7019402b 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -226,6 +226,7 @@ ReplicationSlotsShmemInit(void) ReplicationSlot *slot = &ReplicationSlotCtl->replication_slots[i]; /* everything else is zeroed by the memset above */ + slot->active_proc = INVALID_PROC_NUMBER; SpinLockInit(&slot->mutex); LWLockInitialize(&slot->io_in_progress_lock, LWTRANCHE_REPLICATION_SLOT_IO); @@ -461,7 +462,7 @@ ReplicationSlotCreate(const char *name, bool db_specific, * be doing that. So it's safe to initialize the slot. 
*/ Assert(!slot->in_use); - Assert(slot->active_pid == 0); + Assert(slot->active_proc == INVALID_PROC_NUMBER); /* first initialize persistent data */ memset(&slot->data, 0, sizeof(ReplicationSlotPersistentData)); @@ -505,8 +506,8 @@ ReplicationSlotCreate(const char *name, bool db_specific, /* We can now mark the slot active, and that makes it our slot. */ SpinLockAcquire(&slot->mutex); - Assert(slot->active_pid == 0); - slot->active_pid = MyProcPid; + Assert(slot->active_proc == INVALID_PROC_NUMBER); + slot->active_proc = MyProcNumber; SpinLockRelease(&slot->mutex); MyReplicationSlot = slot; @@ -620,6 +621,7 @@ void ReplicationSlotAcquire(const char *name, bool nowait, bool error_if_invalid) { ReplicationSlot *s; + ProcNumber active_proc; int active_pid; Assert(name != NULL); @@ -672,17 +674,18 @@ ReplicationSlotAcquire(const char *name, bool nowait, bool error_if_invalid) * to inactive_since in InvalidatePossiblyObsoleteSlot. */ SpinLockAcquire(&s->mutex); - if (s->active_pid == 0) - s->active_pid = MyProcPid; - active_pid = s->active_pid; + if (s->active_proc == INVALID_PROC_NUMBER) + s->active_proc = MyProcNumber; + active_proc = s->active_proc; ReplicationSlotSetInactiveSince(s, 0, false); SpinLockRelease(&s->mutex); } else { - s->active_pid = active_pid = MyProcPid; + s->active_proc = active_proc = MyProcNumber; ReplicationSlotSetInactiveSince(s, 0, true); } + active_pid = GetPGProcByNumber(active_proc)->pid; LWLockRelease(ReplicationSlotControlLock); /* @@ -690,7 +693,7 @@ ReplicationSlotAcquire(const char *name, bool nowait, bool error_if_invalid) * wait until the owning process signals us that it's been released, or * error out. 
*/ - if (active_pid != MyProcPid) + if (active_proc != MyProcNumber) { if (!nowait) { @@ -762,7 +765,7 @@ ReplicationSlotRelease(void) bool is_logical; TimestampTz now = 0; - Assert(slot != NULL && slot->active_pid != 0); + Assert(slot != NULL && slot->active_proc != INVALID_PROC_NUMBER); is_logical = SlotIsLogical(slot); @@ -815,7 +818,7 @@ ReplicationSlotRelease(void) * disconnecting, but wake up others that may be waiting for it. */ SpinLockAcquire(&slot->mutex); - slot->active_pid = 0; + slot->active_proc = INVALID_PROC_NUMBER; ReplicationSlotSetInactiveSince(slot, now, false); SpinLockRelease(&slot->mutex); ConditionVariableBroadcast(&slot->active_cv); @@ -877,7 +880,7 @@ ReplicationSlotCleanup(bool synced_only) found_valid_logicalslot |= (SlotIsLogical(s) && s->data.invalidated == RS_INVAL_NONE); - if ((s->active_pid == MyProcPid && + if ((s->active_proc == MyProcNumber && (!synced_only || s->data.synced))) { Assert(s->data.persistency == RS_TEMPORARY); @@ -1088,7 +1091,7 @@ ReplicationSlotDropPtr(ReplicationSlot *slot) bool fail_softly = slot->data.persistency != RS_PERSISTENT; SpinLockAcquire(&slot->mutex); - slot->active_pid = 0; + slot->active_proc = INVALID_PROC_NUMBER; SpinLockRelease(&slot->mutex); /* wake up anyone waiting on this slot */ @@ -1110,7 +1113,7 @@ ReplicationSlotDropPtr(ReplicationSlot *slot) * Also wake up processes waiting for it. 
*/ LWLockAcquire(ReplicationSlotControlLock, LW_EXCLUSIVE); - slot->active_pid = 0; + slot->active_proc = INVALID_PROC_NUMBER; slot->in_use = false; LWLockRelease(ReplicationSlotControlLock); ConditionVariableBroadcast(&slot->active_cv); @@ -1476,7 +1479,7 @@ ReplicationSlotsCountDBSlots(Oid dboid, int *nslots, int *nactive) /* count slots with spinlock held */ SpinLockAcquire(&s->mutex); (*nslots)++; - if (s->active_pid != 0) + if (s->active_proc != INVALID_PROC_NUMBER) (*nactive)++; SpinLockRelease(&s->mutex); } @@ -1520,7 +1523,7 @@ ReplicationSlotsDropDBSlots(Oid dboid) { ReplicationSlot *s; char *slotname; - int active_pid; + ProcNumber active_proc; s = &ReplicationSlotCtl->replication_slots[i]; @@ -1550,11 +1553,11 @@ ReplicationSlotsDropDBSlots(Oid dboid) SpinLockAcquire(&s->mutex); /* can't change while ReplicationSlotControlLock is held */ slotname = NameStr(s->data.name); - active_pid = s->active_pid; - if (active_pid == 0) + active_proc = s->active_proc; + if (active_proc == INVALID_PROC_NUMBER) { MyReplicationSlot = s; - s->active_pid = MyProcPid; + s->active_proc = MyProcNumber; } SpinLockRelease(&s->mutex); @@ -1579,11 +1582,11 @@ ReplicationSlotsDropDBSlots(Oid dboid) * XXX: We can consider shutting down the slot sync worker before * trying to drop synced temporary slots here. 
*/ - if (active_pid) + if (active_proc != INVALID_PROC_NUMBER) ereport(ERROR, (errcode(ERRCODE_OBJECT_IN_USE), errmsg("replication slot \"%s\" is active for PID %d", - slotname, active_pid))); + slotname, GetPGProcByNumber(active_proc)->pid))); /* * To avoid duplicating ReplicationSlotDropAcquired() and to avoid @@ -1974,6 +1977,7 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes, { XLogRecPtr restart_lsn; NameData slotname; + ProcNumber active_proc; int active_pid = 0; ReplicationSlotInvalidationCause invalidation_cause = RS_INVAL_NONE; TimestampTz now = 0; @@ -2027,7 +2031,7 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes, } slotname = s->data.name; - active_pid = s->active_pid; + active_proc = s->active_proc; /* * If the slot can be acquired, do so and mark it invalidated @@ -2039,10 +2043,10 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes, * is terminated. So, the inactive slot can only be invalidated * immediately without being terminated. */ - if (active_pid == 0) + if (active_proc == INVALID_PROC_NUMBER) { MyReplicationSlot = s; - s->active_pid = MyProcPid; + s->active_proc = MyProcNumber; s->data.invalidated = invalidation_cause; /* @@ -2058,6 +2062,11 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes, /* Let caller know */ invalidated = true; } + else + { + active_pid = GetPGProcByNumber(active_proc)->pid; + Assert(active_pid != 0); + } SpinLockRelease(&s->mutex); @@ -2073,7 +2082,7 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes, &slot_idle_usecs); } - if (active_pid != 0) + if (active_proc != INVALID_PROC_NUMBER) { /* * Prepare the sleep on the slot's condition variable before @@ -2105,9 +2114,9 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes, slot_idle_secs); if (MyBackendType == B_STARTUP) - (void) SendProcSignal(active_pid, - PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT, - INVALID_PROC_NUMBER); + (void) SignalRecoveryConflict(GetPGProcByNumber(active_proc), + active_pid, + RECOVERY_CONFLICT_LOGICALSLOT); else 
(void) kill(active_pid, SIGTERM); @@ -2875,7 +2884,7 @@ RestoreSlotFromDisk(const char *name) slot->candidate_restart_valid = InvalidXLogRecPtr; slot->in_use = true; - slot->active_pid = 0; + slot->active_proc = INVALID_PROC_NUMBER; /* * Set the time since the slot has become inactive after loading the @@ -3158,7 +3167,7 @@ StandbySlotsHaveCaughtup(XLogRecPtr wait_for_lsn, int elevel) SpinLockAcquire(&slot->mutex); restart_lsn = slot->data.restart_lsn; invalidated = slot->data.invalidated != RS_INVAL_NONE; - inactive = slot->active_pid == 0; + inactive = slot->active_proc == INVALID_PROC_NUMBER; SpinLockRelease(&slot->mutex); if (invalidated) diff --git a/src/backend/replication/slotfuncs.c b/src/backend/replication/slotfuncs.c index 1ed2d80c2d2..9f5e4f998fe 100644 --- a/src/backend/replication/slotfuncs.c +++ b/src/backend/replication/slotfuncs.c @@ -20,6 +20,7 @@ #include "replication/logical.h" #include "replication/slot.h" #include "replication/slotsync.h" +#include "storage/proc.h" #include "utils/builtins.h" #include "utils/guc.h" #include "utils/pg_lsn.h" @@ -309,10 +310,10 @@ pg_get_replication_slots(PG_FUNCTION_ARGS) values[i++] = ObjectIdGetDatum(slot_contents.data.database); values[i++] = BoolGetDatum(slot_contents.data.persistency == RS_TEMPORARY); - values[i++] = BoolGetDatum(slot_contents.active_pid != 0); + values[i++] = BoolGetDatum(slot_contents.active_proc != INVALID_PROC_NUMBER); - if (slot_contents.active_pid != 0) - values[i++] = Int32GetDatum(slot_contents.active_pid); + if (slot_contents.active_proc != INVALID_PROC_NUMBER) + values[i++] = Int32GetDatum(GetPGProcByNumber(slot_contents.active_proc)->pid); else nulls[i++] = true; @@ -377,13 +378,13 @@ pg_get_replication_slots(PG_FUNCTION_ARGS) */ if (XLogRecPtrIsValid(slot_contents.data.restart_lsn)) { - int pid; + ProcNumber procno; SpinLockAcquire(&slot->mutex); - pid = slot->active_pid; + procno = slot->active_proc; slot_contents.data.restart_lsn = slot->data.restart_lsn; 
SpinLockRelease(&slot->mutex); - if (pid != 0) + if (procno != INVALID_PROC_NUMBER) { values[i++] = CStringGetTextDatum("unreserved"); walstate = WALAVAIL_UNRESERVED; diff --git a/src/backend/replication/syncrep.c b/src/backend/replication/syncrep.c index e7bee777532..7ea6001e9ad 100644 --- a/src/backend/replication/syncrep.c +++ b/src/backend/replication/syncrep.c @@ -355,7 +355,7 @@ SyncRepWaitForLSN(XLogRecPtr lsn, bool commit) pg_read_barrier(); Assert(dlist_node_is_detached(&MyProc->syncRepLinks)); MyProc->syncRepState = SYNC_REP_NOT_WAITING; - MyProc->waitLSN = 0; + MyProc->waitLSN = InvalidXLogRecPtr; /* reset ps display to remove the suffix */ if (update_process_title) @@ -1027,7 +1027,7 @@ SyncRepQueueIsOrderedByLSN(int mode) Assert(mode >= 0 && mode < NUM_SYNC_REP_WAIT_MODE); - lastLSN = 0; + lastLSN = InvalidXLogRecPtr; dlist_foreach(iter, &WalSndCtl->SyncRepQueue[mode]) { diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index 6970af3f3ff..10e64a7d1f4 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -169,7 +169,6 @@ WalReceiverMain(const void *startup_data, size_t startup_data_len) Assert(startup_data_len == 0); - MyBackendType = B_WAL_RECEIVER; AuxiliaryProcessMainCommon(); /* @@ -1122,8 +1121,8 @@ XLogWalRcvClose(XLogRecPtr recptr, TimeLineID tli) static void XLogWalRcvSendReply(bool force, bool requestReply) { - static XLogRecPtr writePtr = 0; - static XLogRecPtr flushPtr = 0; + static XLogRecPtr writePtr = InvalidXLogRecPtr; + static XLogRecPtr flushPtr = InvalidXLogRecPtr; XLogRecPtr applyPtr; TimestampTz now; diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index a0e6a3d200c..2cde8ebc729 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -1611,6 +1611,32 @@ WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, ProcessPendingWrites(); } +/* + * Handle 
configuration reload. + * + * Process the pending configuration file reload and reinitializes synchronous + * replication settings. Also releases any waiters that may now be satisfied due + * to changes in synchronous replication requirements. + */ +static void +WalSndHandleConfigReload(void) +{ + if (!ConfigReloadPending) + return; + + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + SyncRepInitConfig(); + + /* + * Recheck and release any now-satisfied waiters after config reload + * changes synchronous replication requirements (e.g., reducing the number + * of sync standbys or changing the standby names). + */ + if (!am_cascading_walsender) + SyncRepReleaseWaiters(); +} + /* * Wait until there is no pending write. Also process replies from the other * side and check timeouts during that. @@ -1646,12 +1672,7 @@ ProcessPendingWrites(void) CHECK_FOR_INTERRUPTS(); /* Process any requests or signals received recently */ - if (ConfigReloadPending) - { - ConfigReloadPending = false; - ProcessConfigFile(PGC_SIGHUP); - SyncRepInitConfig(); - } + WalSndHandleConfigReload(); /* Try to flush pending output to the client */ if (pq_flush_if_writable() != 0) @@ -1854,12 +1875,7 @@ WalSndWaitForWal(XLogRecPtr loc) CHECK_FOR_INTERRUPTS(); /* Process any requests or signals received recently */ - if (ConfigReloadPending) - { - ConfigReloadPending = false; - ProcessConfigFile(PGC_SIGHUP); - SyncRepInitConfig(); - } + WalSndHandleConfigReload(); /* Check for input from the client */ ProcessRepliesIfAny(); @@ -2899,12 +2915,7 @@ WalSndLoop(WalSndSendDataCallback send_data) CHECK_FOR_INTERRUPTS(); /* Process any requests or signals received recently */ - if (ConfigReloadPending) - { - ConfigReloadPending = false; - ProcessConfigFile(PGC_SIGHUP); - SyncRepInitConfig(); - } + WalSndHandleConfigReload(); /* Check for input from the client */ ProcessRepliesIfAny(); diff --git a/src/backend/statistics/extended_stats_funcs.c b/src/backend/statistics/extended_stats_funcs.c 
index db107684607..479f74652be 100644 --- a/src/backend/statistics/extended_stats_funcs.c +++ b/src/backend/statistics/extended_stats_funcs.c @@ -347,9 +347,8 @@ extended_statistics_update(FunctionCallInfo fcinfo) { ereport(WARNING, errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("could not find extended statistics object \"%s\".\"%s\"", - quote_identifier(nspname), - quote_identifier(stxname))); + errmsg("could not find extended statistics object \"%s.%s\"", + nspname, stxname)); success = false; goto cleanup; } @@ -364,11 +363,9 @@ extended_statistics_update(FunctionCallInfo fcinfo) { ereport(WARNING, errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("could not restore extended statistics object \"%s\".\"%s\": incorrect relation \"%s\".\"%s\" specified", - quote_identifier(nspname), - quote_identifier(stxname), - quote_identifier(relnspname), - quote_identifier(relname))); + errmsg("could not restore extended statistics object \"%s.%s\": incorrect relation \"%s.%s\" specified", + nspname, stxname, + relnspname, relname)); success = false; goto cleanup; @@ -420,9 +417,8 @@ extended_statistics_update(FunctionCallInfo fcinfo) errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("cannot specify parameter \"%s\"", extarginfo[NDISTINCT_ARG].argname), - errhint("Extended statistics object \"%s\".\"%s\" does not support statistics of this type.", - quote_identifier(nspname), - quote_identifier(stxname))); + errhint("Extended statistics object \"%s.%s\" does not support statistics of this type.", + nspname, stxname)); has.ndistinct = false; success = false; @@ -438,9 +434,8 @@ extended_statistics_update(FunctionCallInfo fcinfo) errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("cannot specify parameter \"%s\"", extarginfo[DEPENDENCIES_ARG].argname), - errhint("Extended statistics object \"%s\".\"%s\" does not support statistics of this type.", - quote_identifier(nspname), - quote_identifier(stxname))); + errhint("Extended statistics object \"%s.%s\" does not support statistics of 
this type.", + nspname, stxname)); has.dependencies = false; success = false; } @@ -463,9 +458,8 @@ extended_statistics_update(FunctionCallInfo fcinfo) extarginfo[MOST_COMMON_VALS_ARG].argname, extarginfo[MOST_COMMON_FREQS_ARG].argname, extarginfo[MOST_COMMON_BASE_FREQS_ARG].argname), - errhint("Extended statistics object \"%s\".\"%s\" does not support statistics of this type.", - quote_identifier(nspname), - quote_identifier(stxname))); + errhint("Extended statistics object \"%s.%s\" does not support statistics of this type.", + nspname, stxname)); has.mcv = false; success = false; @@ -539,7 +533,7 @@ extended_statistics_update(FunctionCallInfo fcinfo) /* * After all the positive number attnums in stxkeys come the negative * numbers (if any) which represent expressions in the order that they - * appear in stxdexprs. Because the expressions are always + * appear in stxdexpr. Because the expressions are always * monotonically decreasing from -1, there is no point in looking at * the values in stxkeys, it's enough to know how many of them there * are. 
@@ -888,7 +882,7 @@ pg_clear_extended_stats(PG_FUNCTION_ARGS) table_close(pg_stext, RowExclusiveLock); ereport(WARNING, errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("could not find extended statistics object \"%s\".\"%s\"", + errmsg("could not find extended statistics object \"%s.%s\"", nspname, stxname)); PG_RETURN_VOID(); } @@ -904,7 +898,7 @@ pg_clear_extended_stats(PG_FUNCTION_ARGS) table_close(pg_stext, RowExclusiveLock); ereport(WARNING, errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("could not clear extended statistics object \"%s\".\"%s\": incorrect relation \"%s\".\"%s\" specified", + errmsg("could not clear extended statistics object \"%s.%s\": incorrect relation \"%s.%s\" specified", get_namespace_name(nspoid), stxname, relnspname, relname)); PG_RETURN_VOID(); diff --git a/src/backend/storage/aio/method_worker.c b/src/backend/storage/aio/method_worker.c index d7c144cd8f7..d9617c20e76 100644 --- a/src/backend/storage/aio/method_worker.c +++ b/src/backend/storage/aio/method_worker.c @@ -390,7 +390,6 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len) volatile int error_errno = 0; char cmd[128]; - MyBackendType = B_IO_WORKER; AuxiliaryProcessMainCommon(); pqsignal(SIGHUP, SignalHandlerForConfigReload); diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 6f935648ae9..d1babaff023 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -59,6 +59,7 @@ #include "storage/lmgr.h" #include "storage/proc.h" #include "storage/proclist.h" +#include "storage/procsignal.h" #include "storage/read_stream.h" #include "storage/smgr.h" #include "storage/standby.h" @@ -5895,6 +5896,13 @@ BufferLockUnlock(Buffer buffer, BufferDesc *buf_hdr) /* * Acquire the content lock for the buffer, but only if we don't have to wait. 
+ * + * It is allowed to try to conditionally acquire a lock on a buffer that this + * backend has already locked, but the lock acquisition will always fail, even + * if the new lock acquisition does not conflict with an already held lock + * (e.g. two share locks). This is because we currently do not have space to + * track multiple lock ownerships of the same buffer within one backend. That + * is ok for the current uses of BufferLockConditional(). */ static bool BufferLockConditional(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode) @@ -5903,9 +5911,12 @@ BufferLockConditional(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode) bool mustwait; /* - * We better not already hold a lock on the buffer. + * As described above, if we're trying to lock a buffer this backend + * already has locked, return false, independent of the existing and + * desired lock level. */ - Assert(entry->data.lockmode == BUFFER_LOCK_UNLOCK); + if (entry->data.lockmode != BUFFER_LOCK_UNLOCK) + return false; /* * Lock out cancel/die interrupts until we exit the code section protected @@ -6560,7 +6571,7 @@ LockBufferForCleanup(Buffer buffer) * deadlock_timeout for it. */ if (logged_recovery_conflict) - LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN, + LogRecoveryConflict(RECOVERY_CONFLICT_BUFFERPIN, waitStart, GetCurrentTimestamp(), NULL, false); @@ -6611,7 +6622,7 @@ LockBufferForCleanup(Buffer buffer) if (TimestampDifferenceExceeds(waitStart, now, DeadlockTimeout)) { - LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN, + LogRecoveryConflict(RECOVERY_CONFLICT_BUFFERPIN, waitStart, now, NULL, true); logged_recovery_conflict = true; } diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 0f8083651de..5d07b64a1ef 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -164,6 +164,9 @@ bool data_sync_retry = false; /* How SyncDataDirectory() should do its job. 
*/ int recovery_init_sync_method = DATA_DIR_SYNC_METHOD_FSYNC; +/* How data files should be bulk-extended with zeros. */ +int file_extend_method = DEFAULT_FILE_EXTEND_METHOD; + /* Which kinds of files should be opened with PG_O_DIRECT. */ int io_direct_flags; diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 2a3dfedf7e9..1f7e933d500 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -212,12 +212,10 @@ CreateSharedMemoryAndSemaphores(void) Assert(strcmp("unknown", GetConfigOption("huge_pages_status", false, false)) != 0); - InitShmemAccess(seghdr); - /* * Set up shared memory allocation mechanism */ - InitShmemAllocation(); + InitShmemAllocator(seghdr); /* Initialize subsystems */ CreateOrAttachShmemStructs(); diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 6be565155ab..40312df2cac 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -60,6 +60,7 @@ #include "port/pg_lfind.h" #include "storage/proc.h" #include "storage/procarray.h" +#include "storage/procsignal.h" #include "utils/acl.h" #include "utils/builtins.h" #include "utils/injection_point.h" @@ -708,7 +709,7 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) /* be sure this is cleared in abort */ proc->delayChkptFlags = 0; - proc->recoveryConflictPending = false; + pg_atomic_write_u32(&proc->pendingRecoveryConflicts, 0); /* must be cleared with xid/xmin: */ /* avoid unnecessarily dirtying shared cachelines */ @@ -750,7 +751,7 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid) /* be sure this is cleared in abort */ proc->delayChkptFlags = 0; - proc->recoveryConflictPending = false; + pg_atomic_write_u32(&proc->pendingRecoveryConflicts, 0); /* must be cleared with xid/xmin: */ /* avoid unnecessarily dirtying shared cachelines */ @@ -933,7 +934,7 @@ ProcArrayClearTransaction(PGPROC *proc) proc->vxid.lxid = 
InvalidLocalTransactionId; proc->xmin = InvalidTransactionId; - proc->recoveryConflictPending = false; + pg_atomic_write_u32(&proc->pendingRecoveryConflicts, 0); Assert(!(proc->statusFlags & PROC_VACUUM_STATE_MASK)); Assert(!proc->delayChkptFlags); @@ -3445,19 +3446,46 @@ GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid) } /* - * CancelVirtualTransaction - used in recovery conflict processing + * SignalRecoveryConflict -- signal that a process is blocking recovery * - * Returns pid of the process signaled, or 0 if not found. + * The 'pid' is redundant with 'proc', but it acts as a cross-check to + * detect process had exited and the PGPROC entry was reused for a different + * process. + * + * Returns true if the process was signaled, or false if not found. */ -pid_t -CancelVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode) +bool +SignalRecoveryConflict(PGPROC *proc, pid_t pid, RecoveryConflictReason reason) { - return SignalVirtualTransaction(vxid, sigmode, true); + bool found = false; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + /* + * Kill the pid if it's still here. If not, that's what we wanted so + * ignore any errors. 
+ */ + if (proc->pid == pid) + { + (void) pg_atomic_fetch_or_u32(&proc->pendingRecoveryConflicts, (1 << reason)); + + /* wake up the process */ + (void) SendProcSignal(pid, PROCSIG_RECOVERY_CONFLICT, GetNumberFromPGProc(proc)); + found = true; + } + + LWLockRelease(ProcArrayLock); + + return found; } -pid_t -SignalVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode, - bool conflictPending) +/* + * SignalRecoveryConflictWithVirtualXID -- signal that a VXID is blocking recovery + * + * Like SignalRecoveryConflict, but the target is identified by VXID + */ +bool +SignalRecoveryConflictWithVirtualXID(VirtualTransactionId vxid, RecoveryConflictReason reason) { ProcArrayStruct *arrayP = procArray; int index; @@ -3476,15 +3504,16 @@ SignalVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode, if (procvxid.procNumber == vxid.procNumber && procvxid.localTransactionId == vxid.localTransactionId) { - proc->recoveryConflictPending = conflictPending; pid = proc->pid; if (pid != 0) { + (void) pg_atomic_fetch_or_u32(&proc->pendingRecoveryConflicts, (1 << reason)); + /* * Kill the pid if it's still here. If not, that's what we * wanted so ignore any errors. */ - (void) SendProcSignal(pid, sigmode, vxid.procNumber); + (void) SendProcSignal(pid, PROCSIG_RECOVERY_CONFLICT, vxid.procNumber); } break; } @@ -3492,7 +3521,50 @@ SignalVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode, LWLockRelease(ProcArrayLock); - return pid; + return pid != 0; +} + +/* + * SignalRecoveryConflictWithDatabase --- signal all backends specified database + * + * Like SignalRecoveryConflict, but signals all backends using the database. 
+ */ +void +SignalRecoveryConflictWithDatabase(Oid databaseid, RecoveryConflictReason reason) +{ + ProcArrayStruct *arrayP = procArray; + int index; + + /* tell all backends to die */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + + if (databaseid == InvalidOid || proc->databaseId == databaseid) + { + VirtualTransactionId procvxid; + pid_t pid; + + GET_VXID_FROM_PGPROC(procvxid, *proc); + + pid = proc->pid; + if (pid != 0) + { + (void) pg_atomic_fetch_or_u32(&proc->pendingRecoveryConflicts, (1 << reason)); + + /* + * Kill the pid if it's still here. If not, that's what we + * wanted so ignore any errors. + */ + (void) SendProcSignal(pid, PROCSIG_RECOVERY_CONFLICT, procvxid.procNumber); + } + } + } + + LWLockRelease(ProcArrayLock); } /* @@ -3602,7 +3674,7 @@ CountDBConnections(Oid databaseid) if (proc->pid == 0) continue; /* do not count prepared xacts */ - if (!proc->isRegularBackend) + if (proc->backendType != B_BACKEND) continue; /* count only regular backend processes */ if (!OidIsValid(databaseid) || proc->databaseId == databaseid) @@ -3614,46 +3686,6 @@ CountDBConnections(Oid databaseid) return count; } -/* - * CancelDBBackends --- cancel backends that are using specified database - */ -void -CancelDBBackends(Oid databaseid, ProcSignalReason sigmode, bool conflictPending) -{ - ProcArrayStruct *arrayP = procArray; - int index; - - /* tell all backends to die */ - LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - - for (index = 0; index < arrayP->numProcs; index++) - { - int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; - - if (databaseid == InvalidOid || proc->databaseId == databaseid) - { - VirtualTransactionId procvxid; - pid_t pid; - - GET_VXID_FROM_PGPROC(procvxid, *proc); - - proc->recoveryConflictPending = conflictPending; - pid = proc->pid; - if (pid != 0) - { - /* - * Kill the pid if it's 
still here. If not, that's what we - * wanted so ignore any errors. - */ - (void) SendProcSignal(pid, sigmode, procvxid.procNumber); - } - } - } - - LWLockRelease(ProcArrayLock); -} - /* * CountUserBackends --- count backends that are used by specified user * (only regular backends, not any type of background worker) @@ -3674,7 +3706,7 @@ CountUserBackends(Oid roleid) if (proc->pid == 0) continue; /* do not count prepared xacts */ - if (!proc->isRegularBackend) + if (proc->backendType != B_BACKEND) continue; /* count only regular backend processes */ if (proc->roleId == roleid) count++; diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index 8e56922dcea..5d33559926a 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -697,26 +697,8 @@ procsignal_sigusr1_handler(SIGNAL_ARGS) if (CheckProcSignal(PROCSIG_PARALLEL_APPLY_MESSAGE)) HandleParallelApplyMessageInterrupt(); - if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_DATABASE)) - HandleRecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_DATABASE); - - if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_TABLESPACE)) - HandleRecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_TABLESPACE); - - if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_LOCK)) - HandleRecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_LOCK); - - if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_SNAPSHOT)) - HandleRecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_SNAPSHOT); - - if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT)) - HandleRecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT); - - if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK)) - HandleRecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK); - - if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN)) - HandleRecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN); + if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT)) + HandleRecoveryConflictInterrupt(); 
SetLatch(MyLatch); } diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c index 1b536363152..9f362ce8641 100644 --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -76,20 +76,33 @@ #include "storage/spin.h" #include "utils/builtins.h" +/* + * This is the first data structure stored in the shared memory segment, at + * the offset that PGShmemHeader->content_offset points to. Allocations by + * ShmemAlloc() are carved out of the space after this. + * + * For the base pointer and the total size of the shmem segment, we rely on + * the PGShmemHeader. + */ +typedef struct ShmemAllocatorData +{ + Size free_offset; /* offset to first free space from ShmemBase */ + HTAB *index; /* copy of ShmemIndex */ + + /* protects shared memory and LWLock allocation */ + slock_t shmem_lock; +} ShmemAllocatorData; + static void *ShmemAllocRaw(Size size, Size *allocated_size); -static void *ShmemAllocUnlocked(Size size); /* shared memory global variables */ static PGShmemHeader *ShmemSegHdr; /* shared mem segment header */ - static void *ShmemBase; /* start address of shared memory */ - static void *ShmemEnd; /* end+1 address of shared memory */ -slock_t *ShmemLock; /* spinlock for shared memory and LWLock - * allocation */ - +static ShmemAllocatorData *ShmemAllocator; +slock_t *ShmemLock; /* points to ShmemAllocator->shmem_lock */ static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */ /* To get reliable results for NUMA inquiry we need to "touch pages" once */ @@ -98,49 +111,64 @@ static bool firstNumaTouch = true; Datum pg_numa_available(PG_FUNCTION_ARGS); /* - * InitShmemAccess() --- set up basic pointers to shared memory. + * InitShmemAllocator() --- set up basic pointers to shared memory. + * + * Called at postmaster or stand-alone backend startup, to initialize the + * allocator's data structure in the shared memory segment. 
In EXEC_BACKEND, + * this is also called at backend startup, to set up pointers to the shared + * memory areas. */ void -InitShmemAccess(PGShmemHeader *seghdr) +InitShmemAllocator(PGShmemHeader *seghdr) { + Assert(seghdr != NULL); + + /* + * We assume the pointer and offset are MAXALIGN. Not a hard requirement, + * but it's true today and keeps the math below simpler. + */ + Assert(seghdr == (void *) MAXALIGN(seghdr)); + Assert(seghdr->content_offset == MAXALIGN(seghdr->content_offset)); + ShmemSegHdr = seghdr; ShmemBase = seghdr; ShmemEnd = (char *) ShmemBase + seghdr->totalsize; -} -/* - * InitShmemAllocation() --- set up shared-memory space allocation. - * - * This should be called only in the postmaster or a standalone backend. - */ -void -InitShmemAllocation(void) -{ - PGShmemHeader *shmhdr = ShmemSegHdr; - char *aligned; +#ifndef EXEC_BACKEND + Assert(!IsUnderPostmaster); +#endif + if (IsUnderPostmaster) + { + PGShmemHeader *shmhdr = ShmemSegHdr; - Assert(shmhdr != NULL); + ShmemAllocator = (ShmemAllocatorData *) ((char *) shmhdr + shmhdr->content_offset); + ShmemLock = &ShmemAllocator->shmem_lock; + } + else + { + Size offset; - /* - * Initialize the spinlock used by ShmemAlloc. We must use - * ShmemAllocUnlocked, since obviously ShmemAlloc can't be called yet. - */ - ShmemLock = (slock_t *) ShmemAllocUnlocked(sizeof(slock_t)); + /* + * Allocations after this point should go through ShmemAlloc, which + * expects to allocate everything on cache line boundaries. Make sure + * the first allocation begins on a cache line boundary. 
+ */ + offset = CACHELINEALIGN(seghdr->content_offset + sizeof(ShmemAllocatorData)); + if (offset > seghdr->totalsize) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory (%zu bytes requested)", + offset))); - SpinLockInit(ShmemLock); + ShmemAllocator = (ShmemAllocatorData *) ((char *) seghdr + seghdr->content_offset); - /* - * Allocations after this point should go through ShmemAlloc, which - * expects to allocate everything on cache line boundaries. Make sure the - * first allocation begins on a cache line boundary. - */ - aligned = (char *) - (CACHELINEALIGN((((char *) shmhdr) + shmhdr->freeoffset))); - shmhdr->freeoffset = aligned - (char *) shmhdr; - - /* ShmemIndex can't be set up yet (need LWLocks first) */ - shmhdr->index = NULL; - ShmemIndex = (HTAB *) NULL; + SpinLockInit(&ShmemAllocator->shmem_lock); + ShmemLock = &ShmemAllocator->shmem_lock; + ShmemAllocator->free_offset = offset; + /* ShmemIndex can't be set up yet (need LWLocks first) */ + ShmemAllocator->index = NULL; + ShmemIndex = (HTAB *) NULL; + } } /* @@ -209,13 +237,13 @@ ShmemAllocRaw(Size size, Size *allocated_size) SpinLockAcquire(ShmemLock); - newStart = ShmemSegHdr->freeoffset; + newStart = ShmemAllocator->free_offset; newFree = newStart + size; if (newFree <= ShmemSegHdr->totalsize) { newSpace = (char *) ShmemBase + newStart; - ShmemSegHdr->freeoffset = newFree; + ShmemAllocator->free_offset = newFree; } else newSpace = NULL; @@ -228,45 +256,6 @@ ShmemAllocRaw(Size size, Size *allocated_size) return newSpace; } -/* - * ShmemAllocUnlocked -- allocate max-aligned chunk from shared memory - * - * Allocate space without locking ShmemLock. This should be used for, - * and only for, allocations that must happen before ShmemLock is ready. - * - * We consider maxalign, rather than cachealign, sufficient here. 
- */ -static void * -ShmemAllocUnlocked(Size size) -{ - Size newStart; - Size newFree; - void *newSpace; - - /* - * Ensure allocated space is adequately aligned. - */ - size = MAXALIGN(size); - - Assert(ShmemSegHdr != NULL); - - newStart = ShmemSegHdr->freeoffset; - - newFree = newStart + size; - if (newFree > ShmemSegHdr->totalsize) - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of shared memory (%zu bytes requested)", - size))); - ShmemSegHdr->freeoffset = newFree; - - newSpace = (char *) ShmemBase + newStart; - - Assert(newSpace == (void *) MAXALIGN(newSpace)); - - return newSpace; -} - /* * ShmemAddrIsValid -- test if an address refers to shared memory * @@ -395,16 +384,14 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr) if (!ShmemIndex) { - PGShmemHeader *shmemseghdr = ShmemSegHdr; - /* Must be trying to create/attach to ShmemIndex itself */ Assert(strcmp(name, "ShmemIndex") == 0); if (IsUnderPostmaster) { /* Must be initializing a (non-standalone) backend */ - Assert(shmemseghdr->index != NULL); - structPtr = shmemseghdr->index; + Assert(ShmemAllocator->index != NULL); + structPtr = ShmemAllocator->index; *foundPtr = true; } else @@ -417,9 +404,9 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr) * index has been initialized. This should be OK because no other * process can be accessing shared memory yet. 
*/ - Assert(shmemseghdr->index == NULL); + Assert(ShmemAllocator->index == NULL); structPtr = ShmemAlloc(size); - shmemseghdr->index = structPtr; + ShmemAllocator->index = structPtr; *foundPtr = false; } LWLockRelease(ShmemIndexLock); @@ -553,15 +540,15 @@ pg_get_shmem_allocations(PG_FUNCTION_ARGS) /* output shared memory allocated but not counted via the shmem index */ values[0] = CStringGetTextDatum(""); nulls[1] = true; - values[2] = Int64GetDatum(ShmemSegHdr->freeoffset - named_allocated); + values[2] = Int64GetDatum(ShmemAllocator->free_offset - named_allocated); values[3] = values[2]; tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); /* output as-of-yet unused shared memory */ nulls[0] = true; - values[1] = Int64GetDatum(ShmemSegHdr->freeoffset); + values[1] = Int64GetDatum(ShmemAllocator->free_offset); nulls[1] = false; - values[2] = Int64GetDatum(ShmemSegHdr->totalsize - ShmemSegHdr->freeoffset); + values[2] = Int64GetDatum(ShmemSegHdr->totalsize - ShmemAllocator->free_offset); values[3] = values[2]; tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); diff --git a/src/backend/storage/ipc/signalfuncs.c b/src/backend/storage/ipc/signalfuncs.c index 6f7759cd720..d48b4fe3799 100644 --- a/src/backend/storage/ipc/signalfuncs.c +++ b/src/backend/storage/ipc/signalfuncs.c @@ -87,10 +87,7 @@ pg_signal_backend(int pid, int sig) */ if (!OidIsValid(proc->roleId) || superuser_arg(proc->roleId)) { - ProcNumber procNumber = GetNumberFromPGProc(proc); - BackendType backendType = pgstat_get_backend_type_by_proc_number(procNumber); - - if (backendType == B_AUTOVAC_WORKER) + if (proc->backendType == B_AUTOVAC_WORKER) { if (!has_privs_of_role(GetUserId(), ROLE_PG_SIGNAL_AUTOVACUUM_WORKER)) return SIGNAL_BACKEND_NOAUTOVAC; diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index afffab77106..d83afbfb9d6 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -71,13 
+71,13 @@ static volatile sig_atomic_t got_standby_delay_timeout = false; static volatile sig_atomic_t got_standby_lock_timeout = false; static void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist, - ProcSignalReason reason, + RecoveryConflictReason reason, uint32 wait_event_info, bool report_waiting); -static void SendRecoveryConflictWithBufferPin(ProcSignalReason reason); +static void SendRecoveryConflictWithBufferPin(RecoveryConflictReason reason); static XLogRecPtr LogCurrentRunningXacts(RunningTransactions CurrRunningXacts); static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks); -static const char *get_recovery_conflict_desc(ProcSignalReason reason); +static const char *get_recovery_conflict_desc(RecoveryConflictReason reason); /* * InitRecoveryTransactionEnvironment @@ -271,7 +271,7 @@ WaitExceedsMaxStandbyDelay(uint32 wait_event_info) * to be resolved or not. */ void -LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start, +LogRecoveryConflict(RecoveryConflictReason reason, TimestampTz wait_start, TimestampTz now, VirtualTransactionId *wait_list, bool still_waiting) { @@ -358,7 +358,8 @@ LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start, */ static void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist, - ProcSignalReason reason, uint32 wait_event_info, + RecoveryConflictReason reason, + uint32 wait_event_info, bool report_waiting) { TimestampTz waitStart = 0; @@ -384,19 +385,19 @@ ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist, /* Is it time to kill it? */ if (WaitExceedsMaxStandbyDelay(wait_event_info)) { - pid_t pid; + bool signaled; /* * Now find out who to throw out of the balloon. 
*/ Assert(VirtualTransactionIdIsValid(*waitlist)); - pid = CancelVirtualTransaction(*waitlist, reason); + signaled = SignalRecoveryConflictWithVirtualXID(*waitlist, reason); /* * Wait a little bit for it to die so that we avoid flooding * an unresponsive backend when system is heavily loaded. */ - if (pid != 0) + if (signaled) pg_usleep(5000L); } @@ -489,7 +490,7 @@ ResolveRecoveryConflictWithSnapshot(TransactionId snapshotConflictHorizon, backends = GetConflictingVirtualXIDs(snapshotConflictHorizon, locator.dbOid); ResolveRecoveryConflictWithVirtualXIDs(backends, - PROCSIG_RECOVERY_CONFLICT_SNAPSHOT, + RECOVERY_CONFLICT_SNAPSHOT, WAIT_EVENT_RECOVERY_CONFLICT_SNAPSHOT, true); @@ -560,7 +561,7 @@ ResolveRecoveryConflictWithTablespace(Oid tsid) temp_file_users = GetConflictingVirtualXIDs(InvalidTransactionId, InvalidOid); ResolveRecoveryConflictWithVirtualXIDs(temp_file_users, - PROCSIG_RECOVERY_CONFLICT_TABLESPACE, + RECOVERY_CONFLICT_TABLESPACE, WAIT_EVENT_RECOVERY_CONFLICT_TABLESPACE, true); } @@ -581,7 +582,7 @@ ResolveRecoveryConflictWithDatabase(Oid dbid) */ while (CountDBBackends(dbid) > 0) { - CancelDBBackends(dbid, PROCSIG_RECOVERY_CONFLICT_DATABASE, true); + SignalRecoveryConflictWithDatabase(dbid, RECOVERY_CONFLICT_DATABASE); /* * Wait awhile for them to die so that we avoid flooding an @@ -665,7 +666,7 @@ ResolveRecoveryConflictWithLock(LOCKTAG locktag, bool logging_conflict) * because the caller, WaitOnLock(), has already reported that. 
*/ ResolveRecoveryConflictWithVirtualXIDs(backends, - PROCSIG_RECOVERY_CONFLICT_LOCK, + RECOVERY_CONFLICT_LOCK, PG_WAIT_LOCK | locktag.locktag_type, false); } @@ -723,9 +724,8 @@ ResolveRecoveryConflictWithLock(LOCKTAG locktag, bool logging_conflict) */ while (VirtualTransactionIdIsValid(*backends)) { - SignalVirtualTransaction(*backends, - PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK, - false); + (void) SignalRecoveryConflictWithVirtualXID(*backends, + RECOVERY_CONFLICT_STARTUP_DEADLOCK); backends++; } @@ -803,7 +803,7 @@ ResolveRecoveryConflictWithBufferPin(void) /* * We're already behind, so clear a path as quickly as possible. */ - SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN); + SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_BUFFERPIN); } else { @@ -843,7 +843,7 @@ ResolveRecoveryConflictWithBufferPin(void) ProcWaitForSignal(WAIT_EVENT_BUFFER_CLEANUP); if (got_standby_delay_timeout) - SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN); + SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_BUFFERPIN); else if (got_standby_deadlock_timeout) { /* @@ -859,7 +859,7 @@ ResolveRecoveryConflictWithBufferPin(void) * not be so harmful because the period that the buffer is kept pinned * is basically no so long. But we should fix this? 
*/ - SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK); + SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK); } /* @@ -874,18 +874,18 @@ ResolveRecoveryConflictWithBufferPin(void) } static void -SendRecoveryConflictWithBufferPin(ProcSignalReason reason) +SendRecoveryConflictWithBufferPin(RecoveryConflictReason reason) { - Assert(reason == PROCSIG_RECOVERY_CONFLICT_BUFFERPIN || - reason == PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK); + Assert(reason == RECOVERY_CONFLICT_BUFFERPIN || + reason == RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK); /* * We send signal to all backends to ask them if they are holding the - * buffer pin which is delaying the Startup process. We must not set the - * conflict flag yet, since most backends will be innocent. Let the - * SIGUSR1 handling in each backend decide their own fate. + * buffer pin which is delaying the Startup process. Most of them will be + * innocent, but we let the SIGUSR1 handling in each backend decide their + * own fate. 
*/ - CancelDBBackends(InvalidOid, reason, false); + SignalRecoveryConflictWithDatabase(InvalidOid, reason); } /* @@ -1490,35 +1490,36 @@ LogStandbyInvalidations(int nmsgs, SharedInvalidationMessage *msgs, /* Return the description of recovery conflict */ static const char * -get_recovery_conflict_desc(ProcSignalReason reason) +get_recovery_conflict_desc(RecoveryConflictReason reason) { const char *reasonDesc = _("unknown reason"); switch (reason) { - case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN: + case RECOVERY_CONFLICT_BUFFERPIN: reasonDesc = _("recovery conflict on buffer pin"); break; - case PROCSIG_RECOVERY_CONFLICT_LOCK: + case RECOVERY_CONFLICT_LOCK: reasonDesc = _("recovery conflict on lock"); break; - case PROCSIG_RECOVERY_CONFLICT_TABLESPACE: + case RECOVERY_CONFLICT_TABLESPACE: reasonDesc = _("recovery conflict on tablespace"); break; - case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT: + case RECOVERY_CONFLICT_SNAPSHOT: reasonDesc = _("recovery conflict on snapshot"); break; - case PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT: + case RECOVERY_CONFLICT_LOGICALSLOT: reasonDesc = _("recovery conflict on replication slot"); break; - case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK: + case RECOVERY_CONFLICT_STARTUP_DEADLOCK: + reasonDesc = _("recovery conflict on deadlock"); + break; + case RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK: reasonDesc = _("recovery conflict on buffer deadlock"); break; - case PROCSIG_RECOVERY_CONFLICT_DATABASE: + case RECOVERY_CONFLICT_DATABASE: reasonDesc = _("recovery conflict on database"); break; - default: - break; } return reasonDesc; diff --git a/src/backend/storage/lmgr/deadlock.c b/src/backend/storage/lmgr/deadlock.c index 8334a887618..0a8dd5eb7c2 100644 --- a/src/backend/storage/lmgr/deadlock.c +++ b/src/backend/storage/lmgr/deadlock.c @@ -135,10 +135,9 @@ static PGPROC *blocking_autovacuum_proc = NULL; * This does per-backend initialization of the deadlock checker; primarily, * allocation of working memory for DeadLockCheck. 
We do this per-backend * since there's no percentage in making the kernel do copy-on-write - * inheritance of workspace from the postmaster. We want to allocate the - * space at startup because (a) the deadlock checker might be invoked when - * there's no free memory left, and (b) the checker is normally run inside a - * signal handler, which is a very dangerous place to invoke palloc from. + * inheritance of workspace from the postmaster. We allocate the space at + * startup because the deadlock checker is run with all the partitions of the + * lock table locked, and we want to keep that section as short as possible. */ void InitDeadLockChecking(void) @@ -213,8 +212,7 @@ InitDeadLockChecking(void) * * On failure, deadlock details are recorded in deadlockDetails[] for * subsequent printing by DeadLockReport(). That activity is separate - * because (a) we don't want to do it while holding all those LWLocks, - * and (b) we are typically invoked inside a signal handler. + * because we don't want to do it while holding all those LWLocks. */ DeadLockState DeadLockCheck(PGPROC *proc) diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 063826ae576..31ccdb1ef89 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -80,15 +80,13 @@ PROC_HDR *ProcGlobal = NULL; NON_EXEC_STATIC PGPROC *AuxiliaryProcs = NULL; PGPROC *PreparedXactProcs = NULL; -static DeadLockState deadlock_state = DS_NOT_YET_CHECKED; - /* Is a deadlock check pending? 
*/ static volatile sig_atomic_t got_deadlock_timeout; static void RemoveProcFromArray(int code, Datum arg); static void ProcKill(int code, Datum arg); static void AuxiliaryProcKill(int code, Datum arg); -static void CheckDeadLock(void); +static DeadLockState CheckDeadLock(void); /* @@ -486,7 +484,7 @@ InitProcess(void) MyProc->databaseId = InvalidOid; MyProc->roleId = InvalidOid; MyProc->tempNamespaceId = InvalidOid; - MyProc->isRegularBackend = AmRegularBackendProcess(); + MyProc->backendType = MyBackendType; MyProc->delayChkptFlags = 0; MyProc->statusFlags = 0; /* NB -- autovac launcher intentionally does not set IS_AUTOVACUUM */ @@ -506,10 +504,10 @@ InitProcess(void) Assert(dlist_is_empty(&(MyProc->myProcLocks[i]))); } #endif - MyProc->recoveryConflictPending = false; + pg_atomic_write_u32(&MyProc->pendingRecoveryConflicts, 0); /* Initialize fields for sync rep */ - MyProc->waitLSN = 0; + MyProc->waitLSN = InvalidXLogRecPtr; MyProc->syncRepState = SYNC_REP_NOT_WAITING; dlist_node_init(&MyProc->syncRepLinks); @@ -685,7 +683,7 @@ InitAuxiliaryProcess(void) MyProc->databaseId = InvalidOid; MyProc->roleId = InvalidOid; MyProc->tempNamespaceId = InvalidOid; - MyProc->isRegularBackend = false; + MyProc->backendType = MyBackendType; MyProc->delayChkptFlags = 0; MyProc->statusFlags = 0; MyProc->lwWaiting = LW_WS_NOT_WAITING; @@ -1322,6 +1320,7 @@ ProcSleep(LOCALLOCK *locallock) bool allow_autovacuum_cancel = true; bool logged_recovery_conflict = false; ProcWaitStatus myWaitStatus; + DeadLockState deadlock_state; /* The caller must've armed the on-error cleanup mechanism */ Assert(GetAwaitedLock() == locallock); @@ -1447,7 +1446,7 @@ ProcSleep(LOCALLOCK *locallock) * because the startup process here has already waited * longer than deadlock_timeout. */ - LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_LOCK, + LogRecoveryConflict(RECOVERY_CONFLICT_LOCK, standbyWaitStart, now, cnt > 0 ? 
vxids : NULL, true); logged_recovery_conflict = true; @@ -1462,7 +1461,7 @@ ProcSleep(LOCALLOCK *locallock) /* check for deadlocks first, as that's probably log-worthy */ if (got_deadlock_timeout) { - CheckDeadLock(); + deadlock_state = CheckDeadLock(); got_deadlock_timeout = false; } CHECK_FOR_INTERRUPTS(); @@ -1688,7 +1687,7 @@ ProcSleep(LOCALLOCK *locallock) * startup process waited longer than deadlock_timeout for it. */ if (InHotStandby && logged_recovery_conflict) - LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_LOCK, + LogRecoveryConflict(RECOVERY_CONFLICT_LOCK, standbyWaitStart, GetCurrentTimestamp(), NULL, false); @@ -1785,14 +1784,14 @@ ProcLockWakeup(LockMethod lockMethodTable, LOCK *lock) * * We only get to this routine, if DEADLOCK_TIMEOUT fired while waiting for a * lock to be released by some other process. Check if there's a deadlock; if - * not, just return. (But signal ProcSleep to log a message, if - * log_lock_waits is true.) If we have a real deadlock, remove ourselves from - * the lock's wait queue and signal an error to ProcSleep. + * not, just return. If we have a real deadlock, remove ourselves from the + * lock's wait queue. */ -static void +static DeadLockState CheckDeadLock(void) { int i; + DeadLockState result; /* * Acquire exclusive lock on the entire shared lock data structures. Must @@ -1819,17 +1818,20 @@ CheckDeadLock(void) */ if (MyProc->links.prev == NULL || MyProc->links.next == NULL) + { + result = DS_NO_DEADLOCK; goto check_done; + } #ifdef LOCK_DEBUG if (Debug_deadlocks) DumpAllLocks(); #endif - /* Run the deadlock check, and set deadlock_state for use by ProcSleep */ - deadlock_state = DeadLockCheck(MyProc); + /* Run the deadlock check */ + result = DeadLockCheck(MyProc); - if (deadlock_state == DS_HARD_DEADLOCK) + if (result == DS_HARD_DEADLOCK) { /* * Oops. We have a deadlock. 
@@ -1841,7 +1843,7 @@ CheckDeadLock(void) * * RemoveFromWaitQueue sets MyProc->waitStatus to * PROC_WAIT_STATUS_ERROR, so ProcSleep will report an error after we - * return from the signal handler. + * return. */ Assert(MyProc->waitLock != NULL); RemoveFromWaitQueue(MyProc, LockTagHashCode(&(MyProc->waitLock->tag))); @@ -1868,6 +1870,8 @@ CheckDeadLock(void) check_done: for (i = NUM_LOCK_PARTITIONS; --i >= 0;) LWLockRelease(LockHashPartitionLockByIndex(i)); + + return result; } /* diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index a2625871185..443434e4ea8 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -602,13 +602,24 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum, * that decision should be made though? For now just use a cutoff of * 8, anything between 4 and 8 worked OK in some local testing. */ - if (numblocks > 8) + if (numblocks > 8 && + file_extend_method != FILE_EXTEND_METHOD_WRITE_ZEROS) { - int ret; + int ret = 0; - ret = FileFallocate(v->mdfd_vfd, - seekpos, (pgoff_t) BLCKSZ * numblocks, - WAIT_EVENT_DATA_FILE_EXTEND); +#ifdef HAVE_POSIX_FALLOCATE + if (file_extend_method == FILE_EXTEND_METHOD_POSIX_FALLOCATE) + { + ret = FileFallocate(v->mdfd_vfd, + seekpos, (pgoff_t) BLCKSZ * numblocks, + WAIT_EVENT_DATA_FILE_EXTEND); + } + else +#endif + { + elog(ERROR, "unsupported file_extend_method: %d", + file_extend_method); + } if (ret != 0) { ereport(ERROR, diff --git a/src/backend/tcop/backend_startup.c b/src/backend/tcop/backend_startup.c index 94a7b839563..c517115927c 100644 --- a/src/backend/tcop/backend_startup.c +++ b/src/backend/tcop/backend_startup.c @@ -846,10 +846,9 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) if (strlen(port->user_name) >= NAMEDATALEN) port->user_name[NAMEDATALEN - 1] = '\0'; + Assert(MyBackendType == B_BACKEND || MyBackendType == B_DEAD_END_BACKEND); if (am_walsender) MyBackendType = B_WAL_SENDER; - else - MyBackendType = B_BACKEND; /* * Normal 
walsender backends, e.g. for streaming replication, are not diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index e54bf1e760f..21de158adbb 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -67,6 +67,7 @@ #include "storage/proc.h" #include "storage/procsignal.h" #include "storage/sinval.h" +#include "storage/standby.h" #include "tcop/backend_startup.h" #include "tcop/fastpath.h" #include "tcop/pquery.h" @@ -155,10 +156,6 @@ static const char *userDoption = NULL; /* -D switch */ static bool EchoQuery = false; /* -E switch */ static bool UseSemiNewlineNewline = false; /* -j switch */ -/* whether or not, and why, we were canceled by conflict with recovery */ -static volatile sig_atomic_t RecoveryConflictPending = false; -static volatile sig_atomic_t RecoveryConflictPendingReasons[NUM_PROCSIGNALS]; - /* reused buffer to pass to SendRowDescriptionMessage() */ static MemoryContext row_description_context = NULL; static StringInfoData row_description_buf; @@ -175,7 +172,6 @@ static void forbidden_in_wal_sender(char firstchar); static bool check_log_statement(List *stmt_list); static int errdetail_execute(List *raw_parsetree_list); static int errdetail_params(ParamListInfo params); -static int errdetail_abort(void); static void bind_param_error_callback(void *arg); static void start_xact_command(void); static void finish_xact_command(void); @@ -183,6 +179,9 @@ static bool IsTransactionExitStmt(Node *parsetree); static bool IsTransactionExitStmtList(List *pstmts); static bool IsTransactionStmtList(List *pstmts); static void drop_unnamed_stmt(void); +static void ProcessRecoveryConflictInterrupts(void); +static void ProcessRecoveryConflictInterrupt(RecoveryConflictReason reason); +static void report_recovery_conflict(RecoveryConflictReason reason); static void log_disconnections(int code, Datum arg); static void enable_statement_timeout(void); static void disable_statement_timeout(void); @@ -1117,7 +1116,7 @@ 
exec_simple_query(const char *query_string) /* * Get the command name for use in status display (it also becomes the - * default completion tag, down inside PortalRun). Set ps_status and + * default completion tag, in PortalDefineQuery). Set ps_status and * do any special start-of-SQL-command processing needed by the * destination. */ @@ -1141,8 +1140,7 @@ exec_simple_query(const char *query_string) ereport(ERROR, (errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION), errmsg("current transaction is aborted, " - "commands ignored until end of transaction block"), - errdetail_abort())); + "commands ignored until end of transaction block"))); /* Make sure we are in a transaction command */ start_xact_command(); @@ -1498,8 +1496,7 @@ exec_parse_message(const char *query_string, /* string to execute */ ereport(ERROR, (errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION), errmsg("current transaction is aborted, " - "commands ignored until end of transaction block"), - errdetail_abort())); + "commands ignored until end of transaction block"))); /* * Create the CachedPlanSource before we do parse analysis, since it @@ -1750,8 +1747,7 @@ exec_bind_message(StringInfo input_message) ereport(ERROR, (errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION), errmsg("current transaction is aborted, " - "commands ignored until end of transaction block"), - errdetail_abort())); + "commands ignored until end of transaction block"))); /* * Create the portal. 
Allow silent replacement of an existing portal only @@ -2255,8 +2251,7 @@ exec_execute_message(const char *portal_name, long max_rows) ereport(ERROR, (errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION), errmsg("current transaction is aborted, " - "commands ignored until end of transaction block"), - errdetail_abort())); + "commands ignored until end of transaction block"))); /* Check for cancel signal before we start execution */ CHECK_FOR_INTERRUPTS(); @@ -2536,54 +2531,40 @@ errdetail_params(ParamListInfo params) return 0; } -/* - * errdetail_abort - * - * Add an errdetail() line showing abort reason, if any. - */ -static int -errdetail_abort(void) -{ - if (MyProc->recoveryConflictPending) - errdetail("Abort reason: recovery conflict"); - - return 0; -} - /* * errdetail_recovery_conflict * * Add an errdetail() line showing conflict source. */ static int -errdetail_recovery_conflict(ProcSignalReason reason) +errdetail_recovery_conflict(RecoveryConflictReason reason) { switch (reason) { - case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN: + case RECOVERY_CONFLICT_BUFFERPIN: errdetail("User was holding shared buffer pin for too long."); break; - case PROCSIG_RECOVERY_CONFLICT_LOCK: + case RECOVERY_CONFLICT_LOCK: errdetail("User was holding a relation lock for too long."); break; - case PROCSIG_RECOVERY_CONFLICT_TABLESPACE: + case RECOVERY_CONFLICT_TABLESPACE: errdetail("User was or might have been using tablespace that must be dropped."); break; - case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT: + case RECOVERY_CONFLICT_SNAPSHOT: errdetail("User query might have needed to see row versions that must be removed."); break; - case PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT: + case RECOVERY_CONFLICT_LOGICALSLOT: errdetail("User was using a logical replication slot that must be invalidated."); break; - case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK: + case RECOVERY_CONFLICT_STARTUP_DEADLOCK: + errdetail("User transaction caused deadlock with recovery."); + break; + case 
RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK: errdetail("User transaction caused buffer deadlock with recovery."); break; - case PROCSIG_RECOVERY_CONFLICT_DATABASE: + case RECOVERY_CONFLICT_DATABASE: errdetail("User was connected to a database that must be dropped."); break; - default: - break; - /* no errdetail */ } return 0; @@ -2692,8 +2673,7 @@ exec_describe_statement_message(const char *stmt_name) ereport(ERROR, (errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION), errmsg("current transaction is aborted, " - "commands ignored until end of transaction block"), - errdetail_abort())); + "commands ignored until end of transaction block"))); if (whereToSendOutput != DestRemote) return; /* can't actually do anything... */ @@ -2769,8 +2749,7 @@ exec_describe_portal_message(const char *portal_name) ereport(ERROR, (errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION), errmsg("current transaction is aborted, " - "commands ignored until end of transaction block"), - errdetail_abort())); + "commands ignored until end of transaction block"))); if (whereToSendOutput != DestRemote) return; /* can't actually do anything... */ @@ -3088,15 +3067,14 @@ FloatExceptionHandler(SIGNAL_ARGS) } /* - * Tell the next CHECK_FOR_INTERRUPTS() to check for a particular type of - * recovery conflict. Runs in a SIGUSR1 handler. + * Tell the next CHECK_FOR_INTERRUPTS() to process recovery conflicts. Runs + * in a SIGUSR1 handler. */ void -HandleRecoveryConflictInterrupt(ProcSignalReason reason) +HandleRecoveryConflictInterrupt(void) { - RecoveryConflictPendingReasons[reason] = true; - RecoveryConflictPending = true; - InterruptPending = true; + if (pg_atomic_read_u32(&MyProc->pendingRecoveryConflicts) != 0) + InterruptPending = true; /* latch will be set by procsignal_sigusr1_handler */ } @@ -3104,49 +3082,73 @@ HandleRecoveryConflictInterrupt(ProcSignalReason reason) * Check one individual conflict reason. 
*/ static void -ProcessRecoveryConflictInterrupt(ProcSignalReason reason) +ProcessRecoveryConflictInterrupt(RecoveryConflictReason reason) { switch (reason) { - case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK: + case RECOVERY_CONFLICT_STARTUP_DEADLOCK: /* + * The startup process is waiting on a lock held by us, and has + * requested us to check if it is a deadlock (i.e. the deadlock + * timeout expired). + * * If we aren't waiting for a lock we can never deadlock. */ if (GetAwaitedLock() == NULL) return; - /* Intentional fall through to check wait for pin */ - /* FALLTHROUGH */ + /* Set the flag so that ProcSleep() will check for deadlocks. */ + CheckDeadLockAlert(); + return; - case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN: + case RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK: /* - * If PROCSIG_RECOVERY_CONFLICT_BUFFERPIN is requested but we - * aren't blocking the Startup process there is nothing more to - * do. + * The startup process is waiting on a buffer pin, and has + * requested us to check if there is a deadlock involving the pin. * - * When PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK is requested, - * if we're waiting for locks and the startup process is not - * waiting for buffer pin (i.e., also waiting for locks), we set - * the flag so that ProcSleep() will check for deadlocks. + * If we're not waiting on a lock, there can be no deadlock. + */ + if (GetAwaitedLock() == NULL) + return; + + /* + * If we're not holding the buffer pin, also no deadlock. (The + * startup process doesn't know who's holding the pin, and sends + * this signal to *all* backends, so this is the common case.) */ if (!HoldingBufferPinThatDelaysRecovery()) - { - if (reason == PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK && - GetStartupBufferPinWaitBufId() < 0) - CheckDeadLockAlert(); return; - } - MyProc->recoveryConflictPending = true; + /* + * Otherwise, we probably have a deadlock. 
Unfortunately the + * normal deadlock detector doesn't know about buffer pins, so we + * cannot perform comprehensively deadlock check. Instead, we + * just assume that it is a deadlock if the above two conditions + * are met. In principle this can lead to false positives, but + * it's rare in practice because sessions in a hot standby server + * rarely hold locks that can block other backends. + */ + report_recovery_conflict(reason); + return; + + case RECOVERY_CONFLICT_BUFFERPIN: - /* Intentional fall through to error handling */ - /* FALLTHROUGH */ + /* + * Someone is holding a buffer pin that the startup process is + * waiting for, and it got tired of waiting. If that's us, error + * out to release the pin. + */ + if (!HoldingBufferPinThatDelaysRecovery()) + return; - case PROCSIG_RECOVERY_CONFLICT_LOCK: - case PROCSIG_RECOVERY_CONFLICT_TABLESPACE: - case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT: + report_recovery_conflict(reason); + return; + + case RECOVERY_CONFLICT_LOCK: + case RECOVERY_CONFLICT_TABLESPACE: + case RECOVERY_CONFLICT_SNAPSHOT: /* * If we aren't in a transaction any longer then ignore. @@ -3154,108 +3156,128 @@ ProcessRecoveryConflictInterrupt(ProcSignalReason reason) if (!IsTransactionOrTransactionBlock()) return; - /* FALLTHROUGH */ + report_recovery_conflict(reason); + return; - case PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT: + case RECOVERY_CONFLICT_LOGICALSLOT: + report_recovery_conflict(reason); + return; - /* - * If we're not in a subtransaction then we are OK to throw an - * ERROR to resolve the conflict. Otherwise drop through to the - * FATAL case. - * - * PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT is a special case that - * always throws an ERROR (ie never promotes to FATAL), though it - * still has to respect QueryCancelHoldoffCount, so it shares this - * code path. Logical decoding slots are only acquired while - * performing logical decoding. During logical decoding no user - * controlled code is run. 
During [sub]transaction abort, the - * slot is released. Therefore user controlled code cannot - * intercept an error before the replication slot is released. - * - * XXX other times that we can throw just an ERROR *may* be - * PROCSIG_RECOVERY_CONFLICT_LOCK if no locks are held in parent - * transactions - * - * PROCSIG_RECOVERY_CONFLICT_SNAPSHOT if no snapshots are held by - * parent transactions and the transaction is not - * transaction-snapshot mode - * - * PROCSIG_RECOVERY_CONFLICT_TABLESPACE if no temp files or - * cursors open in parent transactions - */ - if (reason == PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT || - !IsSubTransaction()) - { - /* - * If we already aborted then we no longer need to cancel. We - * do this here since we do not wish to ignore aborted - * subtransactions, which must cause FATAL, currently. - */ - if (IsAbortedTransactionBlockState()) - return; + case RECOVERY_CONFLICT_DATABASE: - /* - * If a recovery conflict happens while we are waiting for - * input from the client, the client is presumably just - * sitting idle in a transaction, preventing recovery from - * making progress. We'll drop through to the FATAL case - * below to dislodge it, in that case. - */ - if (!DoingCommandRead) - { - /* Avoid losing sync in the FE/BE protocol. */ - if (QueryCancelHoldoffCount != 0) - { - /* - * Re-arm and defer this interrupt until later. See - * similar code in ProcessInterrupts(). - */ - RecoveryConflictPendingReasons[reason] = true; - RecoveryConflictPending = true; - InterruptPending = true; - return; - } + /* The database is being dropped; terminate the session */ + report_recovery_conflict(reason); + return; + } + elog(FATAL, "unrecognized conflict mode: %d", (int) reason); +} - /* - * We are cleared to throw an ERROR. Either it's the - * logical slot case, or we have a top-level transaction - * that we can abort and a conflict that isn't inherently - * non-retryable. 
- */ - LockErrorCleanup(); - pgstat_report_recovery_conflict(reason); - ereport(ERROR, - (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), - errmsg("canceling statement due to conflict with recovery"), - errdetail_recovery_conflict(reason))); - break; - } - } +/* + * This transaction or session is conflicting with recovery and needs to be + * killed. Roll back the transaction, if that's sufficient, or terminate the + * connection, or do nothing if we're already in an aborted state. + */ +static void +report_recovery_conflict(RecoveryConflictReason reason) +{ + bool fatal; - /* Intentional fall through to session cancel */ - /* FALLTHROUGH */ + if (reason == RECOVERY_CONFLICT_DATABASE) + { + /* note: no hint about reconnecting, and different errcode */ + pgstat_report_recovery_conflict(reason); + ereport(FATAL, + (errcode(ERRCODE_DATABASE_DROPPED), + errmsg("terminating connection due to conflict with recovery"), + errdetail_recovery_conflict(reason))); + } + if (reason == RECOVERY_CONFLICT_LOGICALSLOT) + { + /* + * RECOVERY_CONFLICT_LOGICALSLOT is a special case that always throws + * an ERROR (ie never promotes to FATAL), though it still has to + * respect QueryCancelHoldoffCount, so it shares this code path. + * Logical decoding slots are only acquired while performing logical + * decoding. During logical decoding no user controlled code is run. + * During [sub]transaction abort, the slot is released. Therefore + * user controlled code cannot intercept an error before the + * replication slot is released. + */ + fatal = false; + } + else + { + fatal = IsSubTransaction(); + } - case PROCSIG_RECOVERY_CONFLICT_DATABASE: + /* + * If we're not in a subtransaction then we are OK to throw an ERROR to + * resolve the conflict. 
+ * + * XXX other times that we can throw just an ERROR *may* be + * RECOVERY_CONFLICT_LOCK if no locks are held in parent transactions + * + * RECOVERY_CONFLICT_SNAPSHOT if no snapshots are held by parent + * transactions and the transaction is not transaction-snapshot mode + * + * RECOVERY_CONFLICT_TABLESPACE if no temp files or cursors open in parent + * transactions + */ + if (!fatal) + { + /* + * If we already aborted then we no longer need to cancel. We do this + * here since we do not wish to ignore aborted subtransactions, which + * must cause FATAL, currently. + */ + if (IsAbortedTransactionBlockState()) + return; + + /* + * If a recovery conflict happens while we are waiting for input from + * the client, the client is presumably just sitting idle in a + * transaction, preventing recovery from making progress. We'll drop + * through to the FATAL case below to dislodge it, in that case. + */ + if (!DoingCommandRead) + { + /* Avoid losing sync in the FE/BE protocol. */ + if (QueryCancelHoldoffCount != 0) + { + /* + * Re-arm and defer this interrupt until later. See similar + * code in ProcessInterrupts(). + */ + (void) pg_atomic_fetch_or_u32(&MyProc->pendingRecoveryConflicts, (1 << reason)); + InterruptPending = true; + return; + } /* - * Retrying is not possible because the database is dropped, or we - * decided above that we couldn't resolve the conflict with an - * ERROR and fell through. Terminate the session. + * We are cleared to throw an ERROR. Either it's the logical slot + * case, or we have a top-level transaction that we can abort and + * a conflict that isn't inherently non-retryable. */ + LockErrorCleanup(); pgstat_report_recovery_conflict(reason); - ereport(FATAL, - (errcode(reason == PROCSIG_RECOVERY_CONFLICT_DATABASE ? 
- ERRCODE_DATABASE_DROPPED : - ERRCODE_T_R_SERIALIZATION_FAILURE), - errmsg("terminating connection due to conflict with recovery"), - errdetail_recovery_conflict(reason), - errhint("In a moment you should be able to reconnect to the" - " database and repeat your command."))); - break; - - default: - elog(FATAL, "unrecognized conflict mode: %d", (int) reason); + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("canceling statement due to conflict with recovery"), + errdetail_recovery_conflict(reason))); + } } + + /* + * We couldn't resolve the conflict with ERROR, so terminate the whole + * session. + */ + pgstat_report_recovery_conflict(reason); + ereport(FATAL, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("terminating connection due to conflict with recovery"), + errdetail_recovery_conflict(reason), + errhint("In a moment you should be able to reconnect to the" + " database and repeat your command."))); } /* @@ -3264,6 +3286,8 @@ ProcessRecoveryConflictInterrupt(ProcSignalReason reason) static void ProcessRecoveryConflictInterrupts(void) { + uint32 pending; + /* * We don't need to worry about joggling the elbow of proc_exit, because * proc_exit_prepare() holds interrupts, so ProcessInterrupts() won't call @@ -3271,17 +3295,27 @@ ProcessRecoveryConflictInterrupts(void) */ Assert(!proc_exit_inprogress); Assert(InterruptHoldoffCount == 0); - Assert(RecoveryConflictPending); - RecoveryConflictPending = false; + /* Are any recovery conflicts pending? */ + pending = pg_atomic_read_membarrier_u32(&MyProc->pendingRecoveryConflicts); + if (pending == 0) + return; - for (ProcSignalReason reason = PROCSIG_RECOVERY_CONFLICT_FIRST; - reason <= PROCSIG_RECOVERY_CONFLICT_LAST, + /* + * Check the conflicts one by one, clearing each flag only before + * processing the particular conflict. 
This ensures that if multiple + * conflicts are pending, we come back here to process the remaining + * conflicts, if an error is thrown during processing one of them. + */ + for (RecoveryConflictReason reason = 0; + reason < NUM_RECOVERY_CONFLICT_REASONS; reason++) { - if (RecoveryConflictPendingReasons[reason]) + if ((pending & (1 << reason)) != 0) { - RecoveryConflictPendingReasons[reason] = false; + /* clear the flag */ + (void) pg_atomic_fetch_and_u32(&MyProc->pendingRecoveryConflicts, ~(1 << reason)); + ProcessRecoveryConflictInterrupt(reason); } } @@ -3472,7 +3506,7 @@ ProcessInterrupts(void) } } - if (RecoveryConflictPending) + if (pg_atomic_read_u32(&MyProc->pendingRecoveryConflicts) != 0) ProcessRecoveryConflictInterrupts(); if (IdleInTransactionSessionTimeoutPending) diff --git a/src/backend/tsearch/dict_synonym.c b/src/backend/tsearch/dict_synonym.c index 6dee28ae525..3937f25bcc6 100644 --- a/src/backend/tsearch/dict_synonym.c +++ b/src/backend/tsearch/dict_synonym.c @@ -50,7 +50,7 @@ findwrd(char *in, char **end, uint16 *flags) /* Skip leading spaces */ while (*in && isspace((unsigned char) *in)) - in += pg_mblen(in); + in += pg_mblen_cstr(in); /* Return NULL on empty lines */ if (*in == '\0') @@ -65,7 +65,7 @@ findwrd(char *in, char **end, uint16 *flags) while (*in && !isspace((unsigned char) *in)) { lastchar = in; - in += pg_mblen(in); + in += pg_mblen_cstr(in); } if (in - lastchar == 1 && t_iseq(lastchar, '*') && flags) diff --git a/src/backend/tsearch/dict_thesaurus.c b/src/backend/tsearch/dict_thesaurus.c index 7253f64e5f7..0fd4cf3dfa8 100644 --- a/src/backend/tsearch/dict_thesaurus.c +++ b/src/backend/tsearch/dict_thesaurus.c @@ -191,7 +191,7 @@ thesaurusRead(const char *filename, DictThesaurus *d) /* is it a comment? 
*/ while (*ptr && isspace((unsigned char) *ptr)) - ptr += pg_mblen(ptr); + ptr += pg_mblen_cstr(ptr); if (t_iseq(ptr, '#') || *ptr == '\0' || t_iseq(ptr, '\n') || t_iseq(ptr, '\r')) @@ -237,13 +237,13 @@ thesaurusRead(const char *filename, DictThesaurus *d) { useasis = true; state = TR_INSUBS; - beginwrd = ptr + pg_mblen(ptr); + beginwrd = ptr + pg_mblen_cstr(ptr); } else if (t_iseq(ptr, '\\')) { useasis = false; state = TR_INSUBS; - beginwrd = ptr + pg_mblen(ptr); + beginwrd = ptr + pg_mblen_cstr(ptr); } else if (!isspace((unsigned char) *ptr)) { @@ -267,7 +267,7 @@ thesaurusRead(const char *filename, DictThesaurus *d) else elog(ERROR, "unrecognized thesaurus state: %d", state); - ptr += pg_mblen(ptr); + ptr += pg_mblen_cstr(ptr); } if (state == TR_INSUBS) diff --git a/src/backend/tsearch/regis.c b/src/backend/tsearch/regis.c index 1c7d5c361f1..51ba78fabbc 100644 --- a/src/backend/tsearch/regis.c +++ b/src/backend/tsearch/regis.c @@ -37,7 +37,7 @@ RS_isRegis(const char *str) { if (state == RS_IN_WAIT) { - if (t_isalpha(c)) + if (t_isalpha_cstr(c)) /* okay */ ; else if (t_iseq(c, '[')) state = RS_IN_ONEOF; @@ -48,14 +48,14 @@ RS_isRegis(const char *str) { if (t_iseq(c, '^')) state = RS_IN_NONEOF; - else if (t_isalpha(c)) + else if (t_isalpha_cstr(c)) state = RS_IN_ONEOF_IN; else return false; } else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF) { - if (t_isalpha(c)) + if (t_isalpha_cstr(c)) /* okay */ ; else if (t_iseq(c, ']')) state = RS_IN_WAIT; @@ -64,7 +64,7 @@ RS_isRegis(const char *str) } else elog(ERROR, "internal error in RS_isRegis: state %d", state); - c += pg_mblen(c); + c += pg_mblen_cstr(c); } return (state == RS_IN_WAIT); @@ -96,15 +96,14 @@ RS_compile(Regis *r, bool issuffix, const char *str) { if (state == RS_IN_WAIT) { - if (t_isalpha(c)) + if (t_isalpha_cstr(c)) { if (ptr) ptr = newRegisNode(ptr, len); else ptr = r->node = newRegisNode(NULL, len); - COPYCHAR(ptr->data, c); ptr->type = RSF_ONEOF; - ptr->len = pg_mblen(c); + ptr->len = 
ts_copychar_cstr(ptr->data, c); } else if (t_iseq(c, '[')) { @@ -125,10 +124,9 @@ RS_compile(Regis *r, bool issuffix, const char *str) ptr->type = RSF_NONEOF; state = RS_IN_NONEOF; } - else if (t_isalpha(c)) + else if (t_isalpha_cstr(c)) { - COPYCHAR(ptr->data, c); - ptr->len = pg_mblen(c); + ptr->len = ts_copychar_cstr(ptr->data, c); state = RS_IN_ONEOF_IN; } else /* shouldn't get here */ @@ -136,11 +134,8 @@ RS_compile(Regis *r, bool issuffix, const char *str) } else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF) { - if (t_isalpha(c)) - { - COPYCHAR(ptr->data + ptr->len, c); - ptr->len += pg_mblen(c); - } + if (t_isalpha_cstr(c)) + ptr->len += ts_copychar_cstr(ptr->data + ptr->len, c); else if (t_iseq(c, ']')) state = RS_IN_WAIT; else /* shouldn't get here */ @@ -148,7 +143,7 @@ RS_compile(Regis *r, bool issuffix, const char *str) } else elog(ERROR, "internal error in RS_compile: state %d", state); - c += pg_mblen(c); + c += pg_mblen_cstr(c); } if (state != RS_IN_WAIT) /* shouldn't get here */ @@ -187,10 +182,10 @@ mb_strchr(char *str, char *c) char *ptr = str; bool res = false; - clen = pg_mblen(c); + clen = pg_mblen_cstr(c); while (*ptr && !res) { - plen = pg_mblen(ptr); + plen = pg_mblen_cstr(ptr); if (plen == clen) { i = plen; @@ -219,7 +214,7 @@ RS_execute(Regis *r, char *str) while (*c) { len++; - c += pg_mblen(c); + c += pg_mblen_cstr(c); } if (len < r->nchar) @@ -230,7 +225,7 @@ RS_execute(Regis *r, char *str) { len -= r->nchar; while (len-- > 0) - c += pg_mblen(c); + c += pg_mblen_cstr(c); } @@ -250,7 +245,7 @@ RS_execute(Regis *r, char *str) elog(ERROR, "unrecognized regis node type: %d", ptr->type); } ptr = ptr->next; - c += pg_mblen(c); + c += pg_mblen_cstr(c); } return true; diff --git a/src/backend/tsearch/spell.c b/src/backend/tsearch/spell.c index ad0ceec37b0..a1bfd2a9f9b 100644 --- a/src/backend/tsearch/spell.c +++ b/src/backend/tsearch/spell.c @@ -233,7 +233,7 @@ findchar(char *str, int c) { if (t_iseq(str, c)) return str; - str += 
pg_mblen(str); + str += pg_mblen_cstr(str); } return NULL; @@ -246,7 +246,7 @@ findchar2(char *str, int c1, int c2) { if (t_iseq(str, c1) || t_iseq(str, c2)) return str; - str += pg_mblen(str); + str += pg_mblen_cstr(str); } return NULL; @@ -353,6 +353,7 @@ getNextFlagFromString(IspellDict *Conf, const char **sflagset, char *sflag) char *next; const char *sbuf = *sflagset; int maxstep; + int clen; bool stop = false; bool met_comma = false; @@ -364,11 +365,11 @@ getNextFlagFromString(IspellDict *Conf, const char **sflagset, char *sflag) { case FM_LONG: case FM_CHAR: - COPYCHAR(sflag, *sflagset); - sflag += pg_mblen(*sflagset); + clen = ts_copychar_cstr(sflag, *sflagset); + sflag += clen; /* Go to start of the next flag */ - *sflagset += pg_mblen(*sflagset); + *sflagset += clen; /* Check if we get all characters of flag */ maxstep--; @@ -418,7 +419,7 @@ getNextFlagFromString(IspellDict *Conf, const char **sflagset, char *sflag) *sflagset))); } - *sflagset += pg_mblen(*sflagset); + *sflagset += pg_mblen_cstr(*sflagset); } stop = true; break; @@ -544,7 +545,7 @@ NIImportDictionary(IspellDict *Conf, const char *filename) while (*s) { /* we allow only single encoded flags for faster works */ - if (pg_mblen(s) == 1 && isprint((unsigned char) *s) && !isspace((unsigned char) *s)) + if (pg_mblen_cstr(s) == 1 && isprint((unsigned char) *s) && !isspace((unsigned char) *s)) s++; else { @@ -565,7 +566,7 @@ NIImportDictionary(IspellDict *Conf, const char *filename) *s = '\0'; break; } - s += pg_mblen(s); + s += pg_mblen_cstr(s); } pstr = lowerstr_ctx(Conf, line); @@ -797,17 +798,17 @@ get_nextfield(char **str, char *next) while (**str) { + int clen = pg_mblen_cstr(*str); + if (state == PAE_WAIT_MASK) { if (t_iseq(*str, '#')) return false; else if (!isspace((unsigned char) **str)) { - int clen = pg_mblen(*str); - if (clen < avail) { - COPYCHAR(next, *str); + ts_copychar_with_len(next, *str, clen); next += clen; avail -= clen; } @@ -823,17 +824,15 @@ get_nextfield(char **str, char 
*next) } else { - int clen = pg_mblen(*str); - if (clen < avail) { - COPYCHAR(next, *str); + ts_copychar_with_len(next, *str, clen); next += clen; avail -= clen; } } } - *str += pg_mblen(*str); + *str += clen; } *next = '\0'; @@ -923,14 +922,15 @@ parse_affentry(char *str, char *mask, char *find, char *repl) while (*str) { + int clen = pg_mblen_cstr(str); + if (state == PAE_WAIT_MASK) { if (t_iseq(str, '#')) return false; else if (!isspace((unsigned char) *str)) { - COPYCHAR(pmask, str); - pmask += pg_mblen(str); + pmask += ts_copychar_with_len(pmask, str, clen); state = PAE_INMASK; } } @@ -943,8 +943,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl) } else if (!isspace((unsigned char) *str)) { - COPYCHAR(pmask, str); - pmask += pg_mblen(str); + pmask += ts_copychar_with_len(pmask, str, clen); } } else if (state == PAE_WAIT_FIND) @@ -953,10 +952,9 @@ parse_affentry(char *str, char *mask, char *find, char *repl) { state = PAE_INFIND; } - else if (t_isalpha(str) || t_iseq(str, '\'') /* english 's */ ) + else if (t_isalpha_cstr(str) || t_iseq(str, '\'') /* english 's */ ) { - COPYCHAR(prepl, str); - prepl += pg_mblen(str); + prepl += ts_copychar_with_len(prepl, str, clen); state = PAE_INREPL; } else if (!isspace((unsigned char) *str)) @@ -971,10 +969,9 @@ parse_affentry(char *str, char *mask, char *find, char *repl) *pfind = '\0'; state = PAE_WAIT_REPL; } - else if (t_isalpha(str)) + else if (t_isalpha_cstr(str)) { - COPYCHAR(pfind, str); - pfind += pg_mblen(str); + pfind += ts_copychar_with_len(pfind, str, clen); } else if (!isspace((unsigned char) *str)) ereport(ERROR, @@ -987,10 +984,9 @@ parse_affentry(char *str, char *mask, char *find, char *repl) { break; /* void repl */ } - else if (t_isalpha(str)) + else if (t_isalpha_cstr(str)) { - COPYCHAR(prepl, str); - prepl += pg_mblen(str); + prepl += ts_copychar_with_len(prepl, str, clen); state = PAE_INREPL; } else if (!isspace((unsigned char) *str)) @@ -1005,10 +1001,9 @@ parse_affentry(char *str, 
char *mask, char *find, char *repl) *prepl = '\0'; break; } - else if (t_isalpha(str)) + else if (t_isalpha_cstr(str)) { - COPYCHAR(prepl, str); - prepl += pg_mblen(str); + prepl += ts_copychar_with_len(prepl, str, clen); } else if (!isspace((unsigned char) *str)) ereport(ERROR, @@ -1018,7 +1013,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl) else elog(ERROR, "unrecognized state in parse_affentry: %d", state); - str += pg_mblen(str); + str += clen; } *pmask = *pfind = *prepl = '\0'; @@ -1071,10 +1066,9 @@ addCompoundAffixFlagValue(IspellDict *Conf, char *s, uint32 val) CompoundAffixFlag *newValue; char sbuf[BUFSIZ]; char *sflag; - int clen; while (*s && isspace((unsigned char) *s)) - s += pg_mblen(s); + s += pg_mblen_cstr(s); if (!*s) ereport(ERROR, @@ -1085,8 +1079,8 @@ addCompoundAffixFlagValue(IspellDict *Conf, char *s, uint32 val) sflag = sbuf; while (*s && !isspace((unsigned char) *s) && *s != '\n') { - clen = pg_mblen(s); - COPYCHAR(sflag, s); + int clen = ts_copychar_cstr(sflag, s); + sflag += clen; s += clen; } @@ -1267,7 +1261,7 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename) char *s = recoded + strlen("FLAG"); while (*s && isspace((unsigned char) *s)) - s += pg_mblen(s); + s += pg_mblen_cstr(s); if (*s) { @@ -1466,11 +1460,11 @@ NIImportAffixes(IspellDict *Conf, const char *filename) if (s) { while (*s && !isspace((unsigned char) *s)) - s += pg_mblen(s); + s += pg_mblen_cstr(s); while (*s && isspace((unsigned char) *s)) - s += pg_mblen(s); + s += pg_mblen_cstr(s); - if (*s && pg_mblen(s) == 1) + if (*s && pg_mblen_cstr(s) == 1) { addCompoundAffixFlagValue(Conf, s, FF_COMPOUNDFLAG); Conf->usecompound = true; @@ -1499,7 +1493,7 @@ NIImportAffixes(IspellDict *Conf, const char *filename) flagflags = 0; while (*s && isspace((unsigned char) *s)) - s += pg_mblen(s); + s += pg_mblen_cstr(s); if (*s == '*') { @@ -1520,12 +1514,11 @@ NIImportAffixes(IspellDict *Conf, const char *filename) * be followed by EOL, whitespace, or ':'. 
Otherwise this is a * new-format flag command. */ - if (*s && pg_mblen(s) == 1) + if (*s && pg_mblen_cstr(s) == 1) { - COPYCHAR(flag, s); + flag[0] = *s++; flag[1] = '\0'; - s++; if (*s == '\0' || *s == '#' || *s == '\n' || *s == ':' || isspace((unsigned char) *s)) { diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c index 1e98f321957..df02ffb12fd 100644 --- a/src/backend/tsearch/ts_locale.c +++ b/src/backend/tsearch/ts_locale.c @@ -23,32 +23,40 @@ static void tsearch_readline_callback(void *arg); /* space for a single character plus a trailing NUL */ #define WC_BUF_LEN 2 -int -t_isalpha(const char *ptr) -{ - pg_wchar wstr[WC_BUF_LEN]; - int wlen pg_attribute_unused(); - - wlen = pg_mb2wchar_with_len(ptr, wstr, pg_mblen(ptr)); - Assert(wlen <= 1); - - /* pass single character, or NUL if empty */ - return pg_iswalpha(wstr[0], pg_database_locale()); -} - -int -t_isalnum(const char *ptr) -{ - pg_wchar wstr[WC_BUF_LEN]; - int wlen pg_attribute_unused(); - - wlen = pg_mb2wchar_with_len(ptr, wstr, pg_mblen(ptr)); - Assert(wlen <= 1); - - /* pass single character, or NUL if empty */ - return pg_iswalnum(wstr[0], pg_database_locale()); +#define GENERATE_T_ISCLASS_DEF(character_class) \ +/* mblen shall be that of the first character */ \ +int \ +t_is##character_class##_with_len(const char *ptr, int mblen) \ +{ \ + pg_wchar wstr[WC_BUF_LEN]; \ + int wlen pg_attribute_unused(); \ + wlen = pg_mb2wchar_with_len(ptr, wstr, mblen); \ + Assert(wlen <= 1); \ + /* pass single character, or NUL if empty */ \ + return pg_isw##character_class(wstr[0], pg_database_locale()); \ +} \ +\ +/* ptr shall point to a NUL-terminated string */ \ +int \ +t_is##character_class##_cstr(const char *ptr) \ +{ \ + return t_is##character_class##_with_len(ptr, pg_mblen_cstr(ptr)); \ +} \ +/* ptr shall point to a string with pre-validated encoding */ \ +int \ +t_is##character_class##_unbounded(const char *ptr) \ +{ \ + return t_is##character_class##_with_len(ptr, 
pg_mblen_unbounded(ptr)); \ +} \ +/* historical name for _unbounded */ \ +int \ +t_is##character_class(const char *ptr) \ +{ \ + return t_is##character_class##_unbounded(ptr); \ } +GENERATE_T_ISCLASS_DEF(alnum) +GENERATE_T_ISCLASS_DEF(alpha) /* * Set up to read a file using tsearch_readline(). This facility is diff --git a/src/backend/tsearch/ts_selfuncs.c b/src/backend/tsearch/ts_selfuncs.c index 5afa6e4bad8..64b60bb9513 100644 --- a/src/backend/tsearch/ts_selfuncs.c +++ b/src/backend/tsearch/ts_selfuncs.c @@ -108,12 +108,14 @@ tsmatchsel(PG_FUNCTION_ARGS) * OK, there's a Var and a Const we're dealing with here. We need the * Const to be a TSQuery, else we can't do anything useful. We have to * check this because the Var might be the TSQuery not the TSVector. + * + * Also check that the Var really is a TSVector, in case this estimator is + * mistakenly attached to some other operator. */ - if (((Const *) other)->consttype == TSQUERYOID) + if (((Const *) other)->consttype == TSQUERYOID && + vardata.vartype == TSVECTOROID) { /* tsvector @@ tsquery or the other way around */ - Assert(vardata.vartype == TSVECTOROID); - selec = tsquerysel(&vardata, ((Const *) other)->constvalue); } else diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c index 0c513d694e7..48ee050e37f 100644 --- a/src/backend/tsearch/ts_typanalyze.c +++ b/src/backend/tsearch/ts_typanalyze.c @@ -444,7 +444,7 @@ compute_tsvector_stats(VacAttrStats *stats, stats->statypid[0] = TEXTOID; stats->statyplen[0] = -1; /* typlen, -1 for varlena */ stats->statypbyval[0] = false; - stats->statypalign[0] = 'i'; + stats->statypalign[0] = TYPALIGN_INT; } } else diff --git a/src/backend/tsearch/ts_utils.c b/src/backend/tsearch/ts_utils.c index 9072d22423f..52cf65533e4 100644 --- a/src/backend/tsearch/ts_utils.c +++ b/src/backend/tsearch/ts_utils.c @@ -90,7 +90,7 @@ readstoplist(const char *fname, StopList *s, char *(*wordop) (const char *, size /* Trim trailing space */ while (*pbuf 
&& !isspace((unsigned char) *pbuf)) - pbuf += pg_mblen(pbuf); + pbuf += pg_mblen_cstr(pbuf); *pbuf = '\0'; /* Skip empty lines */ diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c index bfe8aa7fbce..8b9b34e762a 100644 --- a/src/backend/tsearch/wparser_def.c +++ b/src/backend/tsearch/wparser_def.c @@ -1683,7 +1683,8 @@ TParserGet(TParser *prs) prs->state->charlen = 0; else prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen : - pg_mblen(prs->str + prs->state->posbyte); + pg_mblen_range(prs->str + prs->state->posbyte, + prs->str + prs->lenstr); Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr); Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null); diff --git a/src/backend/utils/.gitignore b/src/backend/utils/.gitignore index 303c01d0515..fa9cfb39693 100644 --- a/src/backend/utils/.gitignore +++ b/src/backend/utils/.gitignore @@ -5,3 +5,6 @@ /guc_tables.inc.c /probes.h /errcodes.h +/pgstat_wait_event.c +/wait_event_funcs_data.c +/wait_event_types.h diff --git a/src/backend/utils/Makefile b/src/backend/utils/Makefile index 6df31504f32..81b4a956bda 100644 --- a/src/backend/utils/Makefile +++ b/src/backend/utils/Makefile @@ -43,7 +43,7 @@ generated-header-symlinks: $(top_builddir)/src/include/utils/header-stamp submak submake-adt-headers: $(MAKE) -C adt jsonpath_gram.h -$(SUBDIRS:%=%-recursive): fmgr-stamp errcodes.h guc_tables.inc.c +$(SUBDIRS:%=%-recursive): fmgr-stamp errcodes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_types.h # fmgr-stamp records the last time we ran Gen_fmgrtab.pl. 
We don't rely on # the timestamps of the individual output files, because the Perl script @@ -58,6 +58,12 @@ errcodes.h: $(top_srcdir)/src/backend/utils/errcodes.txt generate-errcodes.pl guc_tables.inc.c: $(top_srcdir)/src/backend/utils/misc/guc_parameters.dat $(top_srcdir)/src/backend/utils/misc/gen_guc_tables.pl $(PERL) $(top_srcdir)/src/backend/utils/misc/gen_guc_tables.pl $< $@ +pgstat_wait_event.c: wait_event_types.h +wait_event_funcs_data.c: wait_event_types.h + +wait_event_types.h: $(top_srcdir)/src/backend/utils/activity/wait_event_names.txt $(top_srcdir)/src/backend/utils/activity/generate-wait_event_types.pl + $(PERL) $(top_srcdir)/src/backend/utils/activity/generate-wait_event_types.pl --code $< + ifeq ($(enable_dtrace), yes) probes.h: postprocess_dtrace.sed probes.h.tmp sed -f $^ >$@ @@ -73,8 +79,8 @@ endif # These generated headers must be symlinked into src/include/. # We use header-stamp to record that we've done this because the symlinks # themselves may appear older than fmgr-stamp. -$(top_builddir)/src/include/utils/header-stamp: fmgr-stamp errcodes.h probes.h guc_tables.inc.c - cd '$(dir $@)' && for file in fmgroids.h fmgrprotos.h errcodes.h probes.h guc_tables.inc.c; do \ +$(top_builddir)/src/include/utils/header-stamp: fmgr-stamp errcodes.h probes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_types.h + cd '$(dir $@)' && for file in fmgroids.h fmgrprotos.h errcodes.h probes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_types.h; do \ rm -f $$file && $(LN_S) "../../../$(subdir)/$$file" . 
; \ done touch $@ @@ -93,3 +99,4 @@ uninstall-data: clean: rm -f probes.h probes.h.tmp rm -f fmgroids.h fmgrprotos.h fmgrtab.c fmgr-stamp errcodes.h guc_tables.inc.c + rm -f wait_event_types.h pgstat_wait_event.c wait_event_funcs_data.c diff --git a/src/backend/utils/activity/.gitignore b/src/backend/utils/activity/.gitignore deleted file mode 100644 index bd0c0c77729..00000000000 --- a/src/backend/utils/activity/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -/pgstat_wait_event.c -/wait_event_types.h -/wait_event_funcs_data.c diff --git a/src/backend/utils/activity/Makefile b/src/backend/utils/activity/Makefile index 0eb29ee78aa..c37bfb350bb 100644 --- a/src/backend/utils/activity/Makefile +++ b/src/backend/utils/activity/Makefile @@ -36,17 +36,8 @@ OBJS = \ wait_event.o \ wait_event_funcs.o -include $(top_srcdir)/src/backend/common.mk - -wait_event_funcs.o: wait_event_funcs_data.c -wait_event_funcs_data.c: wait_event_types.h - -wait_event.o: pgstat_wait_event.c -pgstat_wait_event.c: wait_event_types.h - touch $@ +# Force these dependencies to be known even without dependency info built: +wait_event.o: wait_event.c $(top_builddir)/src/backend/utils/pgstat_wait_event.c +wait_event_funcs.o: wait_event_funcs.c $(top_builddir)/src/backend/utils/wait_event_funcs_data.c -wait_event_types.h: $(top_srcdir)/src/backend/utils/activity/wait_event_names.txt generate-wait_event_types.pl - $(PERL) $(srcdir)/generate-wait_event_types.pl --code $< - -clean: - rm -f wait_event_types.h pgstat_wait_event.c wait_event_funcs_data.c +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/utils/activity/backend_status.c b/src/backend/utils/activity/backend_status.c index c84e6536580..cd087129469 100644 --- a/src/backend/utils/activity/backend_status.c +++ b/src/backend/utils/activity/backend_status.c @@ -1164,31 +1164,6 @@ pgstat_get_my_plan_id(void) return MyBEEntry->st_plan_id; } -/* ---------- - * pgstat_get_backend_type_by_proc_number() - - * - * Return the type of the 
backend with the specified ProcNumber. This looks - * directly at the BackendStatusArray, so the return value may be out of date. - * The only current use of this function is in pg_signal_backend(), which is - * inherently racy, so we don't worry too much about this. - * - * It is the caller's responsibility to use this wisely; at minimum, callers - * should ensure that procNumber is valid and perform the required permissions - * checks. - * ---------- - */ -BackendType -pgstat_get_backend_type_by_proc_number(ProcNumber procNumber) -{ - volatile PgBackendStatus *status = &BackendStatusArray[procNumber]; - - /* - * We bypass the changecount mechanism since fetching and storing an int - * is almost certainly atomic. - */ - return status->st_backendType; -} - /* ---------- * cmp_lbestatus * diff --git a/src/backend/utils/activity/meson.build b/src/backend/utils/activity/meson.build index 9f48d5970e1..53bd5a246ca 100644 --- a/src/backend/utils/activity/meson.build +++ b/src/backend/utils/activity/meson.build @@ -30,7 +30,6 @@ waitevent_sources = files( wait_event = static_library('wait_event_names', waitevent_sources, dependencies: [backend_code], - include_directories: include_directories('../../../include/utils'), kwargs: internal_lib_args, ) diff --git a/src/backend/utils/activity/pgstat_backend.c b/src/backend/utils/activity/pgstat_backend.c index 1350f5f62f1..f2f8d3ff75f 100644 --- a/src/backend/utils/activity/pgstat_backend.c +++ b/src/backend/utils/activity/pgstat_backend.c @@ -326,7 +326,7 @@ pgstat_create_backend(ProcNumber procnum) PgStatShared_Backend *shstatent; entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_BACKEND, InvalidOid, - MyProcNumber, false); + procnum, false); shstatent = (PgStatShared_Backend *) entry_ref->shared_stats; /* diff --git a/src/backend/utils/activity/pgstat_database.c b/src/backend/utils/activity/pgstat_database.c index d7f6d4c5ee6..6309909bcd0 100644 --- a/src/backend/utils/activity/pgstat_database.c +++ 
b/src/backend/utils/activity/pgstat_database.c @@ -17,7 +17,7 @@ #include "postgres.h" -#include "storage/procsignal.h" +#include "storage/standby.h" #include "utils/pgstat_internal.h" #include "utils/timestamp.h" @@ -88,31 +88,41 @@ pgstat_report_recovery_conflict(int reason) dbentry = pgstat_prep_database_pending(MyDatabaseId); - switch (reason) + switch ((RecoveryConflictReason) reason) { - case PROCSIG_RECOVERY_CONFLICT_DATABASE: + case RECOVERY_CONFLICT_DATABASE: /* * Since we drop the information about the database as soon as it * replicates, there is no point in counting these conflicts. */ break; - case PROCSIG_RECOVERY_CONFLICT_TABLESPACE: + case RECOVERY_CONFLICT_TABLESPACE: dbentry->conflict_tablespace++; break; - case PROCSIG_RECOVERY_CONFLICT_LOCK: + case RECOVERY_CONFLICT_LOCK: dbentry->conflict_lock++; break; - case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT: + case RECOVERY_CONFLICT_SNAPSHOT: dbentry->conflict_snapshot++; break; - case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN: + case RECOVERY_CONFLICT_BUFFERPIN: dbentry->conflict_bufferpin++; break; - case PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT: + case RECOVERY_CONFLICT_LOGICALSLOT: dbentry->conflict_logicalslot++; break; - case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK: + case RECOVERY_CONFLICT_STARTUP_DEADLOCK: + dbentry->conflict_startup_deadlock++; + break; + case RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK: + + /* + * The difference between RECOVERY_CONFLICT_STARTUP_DEADLOCK and + * RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK is merely whether a buffer + * pin was part of the deadlock. We use the same counter for both + * reasons. 
+ */ dbentry->conflict_startup_deadlock++; break; } diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c index e4f2c440257..aca2c8fc742 100644 --- a/src/backend/utils/activity/wait_event.c +++ b/src/backend/utils/activity/wait_event.c @@ -503,4 +503,4 @@ pgstat_get_wait_event(uint32 wait_event_info) return event_name; } -#include "pgstat_wait_event.c" +#include "utils/pgstat_wait_event.c" diff --git a/src/backend/utils/activity/wait_event_funcs.c b/src/backend/utils/activity/wait_event_funcs.c index b62ee83ef73..fa10a80b088 100644 --- a/src/backend/utils/activity/wait_event_funcs.c +++ b/src/backend/utils/activity/wait_event_funcs.c @@ -31,7 +31,7 @@ static const struct waitEventData[] = { -#include "wait_event_funcs_data.c" +#include "utils/wait_event_funcs_data.c" /* end of list */ {NULL, NULL, NULL} }; diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 5537a2d2530..4aa864fe3c3 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -14,13 +14,13 @@ # # The files generated from this one are: # -# src/backend/utils/activity/wait_event_types.h +# wait_event_types.h # typedef enum definitions for wait events. # -# src/backend/utils/activity/pgstat_wait_event.c +# pgstat_wait_event.c # C functions to get the wait event name based on the enum. # -# src/backend/utils/activity/wait_event_types.sgml +# wait_event_types.sgml # SGML tables of wait events for inclusion in the documentation. # # When adding a new wait event, make sure it is placed in the appropriate @@ -213,6 +213,8 @@ CONTROL_FILE_WRITE_UPDATE "Waiting for a write to update the pg_contro COPY_FILE_COPY "Waiting for a file copy operation." COPY_FILE_READ "Waiting for a read during a file copy operation." COPY_FILE_WRITE "Waiting for a write during a file copy operation." 
+COPY_FROM_READ "Waiting to read data from a pipe, a file or a program during COPY FROM." +COPY_TO_WRITE "Waiting to write data to a pipe, a file or a program during COPY TO." DATA_FILE_EXTEND "Waiting for a relation data file to be extended." DATA_FILE_FLUSH "Waiting for a relation data file to reach durable storage." DATA_FILE_IMMEDIATE_SYNC "Waiting for an immediate synchronization of a relation data file to durable storage." diff --git a/src/backend/utils/adt/array_expanded.c b/src/backend/utils/adt/array_expanded.c index 01e3dddcbbb..7e8352af52b 100644 --- a/src/backend/utils/adt/array_expanded.c +++ b/src/backend/utils/adt/array_expanded.c @@ -238,6 +238,7 @@ EA_get_flat_size(ExpandedObjectHeader *eohptr) Datum *dvalues; bool *dnulls; Size nbytes; + uint8 typalignby; int i; Assert(eah->ea_magic == EA_MAGIC); @@ -261,12 +262,13 @@ EA_get_flat_size(ExpandedObjectHeader *eohptr) dvalues = eah->dvalues; dnulls = eah->dnulls; nbytes = 0; + typalignby = typalign_to_alignby(eah->typalign); for (i = 0; i < nelems; i++) { if (dnulls && dnulls[i]) continue; nbytes = att_addlength_datum(nbytes, eah->typlen, dvalues[i]); - nbytes = att_align_nominal(nbytes, eah->typalign); + nbytes = att_nominal_alignby(nbytes, typalignby); /* check for overflow of total request */ if (!AllocSizeIsValid(nbytes)) ereport(ERROR, diff --git a/src/backend/utils/adt/arrayfuncs.c b/src/backend/utils/adt/arrayfuncs.c index e71d32773b5..734e5fea45e 100644 --- a/src/backend/utils/adt/arrayfuncs.c +++ b/src/backend/utils/adt/arrayfuncs.c @@ -75,6 +75,7 @@ typedef struct ArrayIteratorData int16 typlen; /* element type's length */ bool typbyval; /* element type's byval property */ char typalign; /* element type's align property */ + uint8 typalignby; /* typalign mapped to numeric alignment */ /* information about the requested slice size */ int slice_ndim; /* slice dimension, or 0 if not slicing */ @@ -123,7 +124,7 @@ static bool array_get_isnull(const bits8 *nullbitmap, int offset); static void 
array_set_isnull(bits8 *nullbitmap, int offset, bool isNull); static Datum ArrayCast(char *value, bool byval, int len); static int ArrayCastAndSet(Datum src, - int typlen, bool typbyval, char typalign, + int typlen, bool typbyval, uint8 typalignby, char *dest); static char *array_seek(char *ptr, int offset, bits8 *nullbitmap, int nitems, int typlen, bool typbyval, char typalign); @@ -187,6 +188,7 @@ array_in(PG_FUNCTION_ARGS) int typlen; bool typbyval; char typalign; + uint8 typalignby; char typdelim; Oid typioparam; char *p; @@ -232,6 +234,7 @@ array_in(PG_FUNCTION_ARGS) typlen = my_extra->typlen; typbyval = my_extra->typbyval; typalign = my_extra->typalign; + typalignby = typalign_to_alignby(typalign); typdelim = my_extra->typdelim; typioparam = my_extra->typioparam; @@ -328,7 +331,7 @@ array_in(PG_FUNCTION_ARGS) if (typlen == -1) values[i] = PointerGetDatum(PG_DETOAST_DATUM(values[i])); nbytes = att_addlength_datum(nbytes, typlen, values[i]); - nbytes = att_align_nominal(nbytes, typalign); + nbytes = att_nominal_alignby(nbytes, typalignby); /* check for overflow of total request */ if (!AllocSizeIsValid(nbytes)) ereturn(escontext, (Datum) 0, @@ -972,6 +975,7 @@ CopyArrayEls(ArrayType *array, bits8 *bitmap = ARR_NULLBITMAP(array); int bitval = 0; int bitmask = 1; + uint8 typalignby = typalign_to_alignby(typalign); int i; if (typbyval) @@ -988,7 +992,7 @@ CopyArrayEls(ArrayType *array, else { bitval |= bitmask; - p += ArrayCastAndSet(values[i], typlen, typbyval, typalign, p); + p += ArrayCastAndSet(values[i], typlen, typbyval, typalignby, p); if (freedata) pfree(DatumGetPointer(values[i])); } @@ -1112,7 +1116,7 @@ array_out(PG_FUNCTION_ARGS) needquotes = (bool *) palloc(nitems * sizeof(bool)); overall_length = 0; - array_iter_setup(&iter, v); + array_iter_setup(&iter, v, typlen, typbyval, typalign); for (i = 0; i < nitems; i++) { @@ -1121,8 +1125,7 @@ array_out(PG_FUNCTION_ARGS) bool needquote; /* Get source element, checking for NULL */ - itemvalue = 
array_iter_next(&iter, &isnull, i, - typlen, typbyval, typalign); + itemvalue = array_iter_next(&iter, &isnull, i); if (isnull) { @@ -1468,6 +1471,7 @@ ReadArrayBinary(StringInfo buf, int i; bool hasnull; int32 totbytes; + uint8 typalignby = typalign_to_alignby(typalign); for (i = 0; i < nitems; i++) { @@ -1526,7 +1530,7 @@ ReadArrayBinary(StringInfo buf, if (typlen == -1) values[i] = PointerGetDatum(PG_DETOAST_DATUM(values[i])); totbytes = att_addlength_datum(totbytes, typlen, values[i]); - totbytes = att_align_nominal(totbytes, typalign); + totbytes = att_nominal_alignby(totbytes, typalignby); /* check for overflow of total request */ if (!AllocSizeIsValid(totbytes)) ereport(ERROR, @@ -1614,7 +1618,7 @@ array_send(PG_FUNCTION_ARGS) } /* Send the array elements using the element's own sendproc */ - array_iter_setup(&iter, v); + array_iter_setup(&iter, v, typlen, typbyval, typalign); for (i = 0; i < nitems; i++) { @@ -1622,8 +1626,7 @@ array_send(PG_FUNCTION_ARGS) bool isnull; /* Get source element, checking for NULL */ - itemvalue = array_iter_next(&iter, &isnull, i, - typlen, typbyval, typalign); + itemvalue = array_iter_next(&iter, &isnull, i); if (isnull) { @@ -2231,6 +2234,7 @@ array_set_element(Datum arraydatum, addedafter, lenbefore, lenafter; + uint8 elmalignby = typalign_to_alignby(elmalign); if (arraytyplen > 0) { @@ -2258,7 +2262,7 @@ array_set_element(Datum arraydatum, resultarray = (char *) palloc(arraytyplen); memcpy(resultarray, DatumGetPointer(arraydatum), arraytyplen); elt_ptr = resultarray + indx[0] * elmlen; - ArrayCastAndSet(dataValue, elmlen, elmbyval, elmalign, elt_ptr); + ArrayCastAndSet(dataValue, elmlen, elmbyval, elmalignby, elt_ptr); return PointerGetDatum(resultarray); } @@ -2416,7 +2420,7 @@ array_set_element(Datum arraydatum, else { olditemlen = att_addlength_pointer(0, elmlen, elt_ptr); - olditemlen = att_align_nominal(olditemlen, elmalign); + olditemlen = att_nominal_alignby(olditemlen, elmalignby); } lenafter = olddatasize - 
lenbefore - olditemlen; } @@ -2426,7 +2430,7 @@ array_set_element(Datum arraydatum, else { newitemlen = att_addlength_datum(0, elmlen, dataValue); - newitemlen = att_align_nominal(newitemlen, elmalign); + newitemlen = att_nominal_alignby(newitemlen, elmalignby); } newsize = overheadlen + lenbefore + newitemlen + lenafter; @@ -2449,7 +2453,7 @@ array_set_element(Datum arraydatum, (char *) array + oldoverheadlen, lenbefore); if (!isNull) - ArrayCastAndSet(dataValue, elmlen, elmbyval, elmalign, + ArrayCastAndSet(dataValue, elmlen, elmbyval, elmalignby, (char *) newarray + overheadlen + lenbefore); memcpy((char *) newarray + overheadlen + lenbefore + newitemlen, (char *) array + oldoverheadlen + lenbefore + olditemlen, @@ -3221,6 +3225,7 @@ array_map(Datum arrayd, int typlen; bool typbyval; char typalign; + uint8 typalignby; array_iter iter; ArrayMetaState *inp_extra; ArrayMetaState *ret_extra; @@ -3270,21 +3275,21 @@ array_map(Datum arrayd, typlen = ret_extra->typlen; typbyval = ret_extra->typbyval; typalign = ret_extra->typalign; + typalignby = typalign_to_alignby(typalign); /* Allocate temporary arrays for new values */ values = (Datum *) palloc(nitems * sizeof(Datum)); nulls = (bool *) palloc(nitems * sizeof(bool)); /* Loop over source data */ - array_iter_setup(&iter, v); + array_iter_setup(&iter, v, inp_typlen, inp_typbyval, inp_typalign); hasnulls = false; for (i = 0; i < nitems; i++) { /* Get source element, checking for NULL */ *transform_source = - array_iter_next(&iter, transform_source_isnull, i, - inp_typlen, inp_typbyval, inp_typalign); + array_iter_next(&iter, transform_source_isnull, i); /* Apply the given expression to source element */ values[i] = ExecEvalExpr(exprstate, econtext, &nulls[i]); @@ -3298,7 +3303,7 @@ array_map(Datum arrayd, values[i] = PointerGetDatum(PG_DETOAST_DATUM(values[i])); /* Update total result size */ nbytes = att_addlength_datum(nbytes, typlen, values[i]); - nbytes = att_align_nominal(nbytes, typalign); + nbytes = 
att_nominal_alignby(nbytes, typalignby); /* check for overflow of total request */ if (!AllocSizeIsValid(nbytes)) ereport(ERROR, @@ -3505,6 +3510,7 @@ construct_md_array(Datum *elems, int32 dataoffset; int i; int nelems; + uint8 elmalignby = typalign_to_alignby(elmalign); if (ndims < 0) /* we do allow zero-dimension arrays */ ereport(ERROR, @@ -3538,7 +3544,7 @@ construct_md_array(Datum *elems, if (elmlen == -1) elems[i] = PointerGetDatum(PG_DETOAST_DATUM(elems[i])); nbytes = att_addlength_datum(nbytes, elmlen, elems[i]); - nbytes = att_align_nominal(nbytes, elmalign); + nbytes = att_nominal_alignby(nbytes, elmalignby); /* check for overflow of total request */ if (!AllocSizeIsValid(nbytes)) ereport(ERROR, @@ -3641,6 +3647,7 @@ deconstruct_array(const ArrayType *array, bits8 *bitmap; int bitmask; int i; + uint8 elmalignby = typalign_to_alignby(elmalign); Assert(ARR_ELEMTYPE(array) == elmtype); @@ -3673,7 +3680,7 @@ deconstruct_array(const ArrayType *array, { elems[i] = fetch_att(p, elmbyval, elmlen); p = att_addlength_pointer(p, elmlen, p); - p = (char *) att_align_nominal(p, elmalign); + p = (char *) att_nominal_alignby(p, elmalignby); } /* advance bitmap pointer if any */ @@ -3729,6 +3736,12 @@ deconstruct_array_builtin(const ArrayType *array, elmalign = TYPALIGN_SHORT; break; + case INT4OID: + elmlen = sizeof(int32); + elmbyval = true; + elmalign = TYPALIGN_INT; + break; + case OIDOID: elmlen = sizeof(Oid); elmbyval = true; @@ -3878,8 +3891,8 @@ array_eq(PG_FUNCTION_ARGS) /* Loop over source data */ nitems = ArrayGetNItems(ndims1, dims1); - array_iter_setup(&it1, array1); - array_iter_setup(&it2, array2); + array_iter_setup(&it1, array1, typlen, typbyval, typalign); + array_iter_setup(&it2, array2, typlen, typbyval, typalign); for (i = 0; i < nitems; i++) { @@ -3890,10 +3903,8 @@ array_eq(PG_FUNCTION_ARGS) bool oprresult; /* Get elements, checking for NULL */ - elt1 = array_iter_next(&it1, &isnull1, i, - typlen, typbyval, typalign); - elt2 = 
array_iter_next(&it2, &isnull2, i, - typlen, typbyval, typalign); + elt1 = array_iter_next(&it1, &isnull1, i); + elt2 = array_iter_next(&it2, &isnull2, i); /* * We consider two NULLs equal; NULL and not-NULL are unequal. @@ -4042,8 +4053,8 @@ array_cmp(FunctionCallInfo fcinfo) /* Loop over source data */ min_nitems = Min(nitems1, nitems2); - array_iter_setup(&it1, array1); - array_iter_setup(&it2, array2); + array_iter_setup(&it1, array1, typlen, typbyval, typalign); + array_iter_setup(&it2, array2, typlen, typbyval, typalign); for (i = 0; i < min_nitems; i++) { @@ -4054,8 +4065,8 @@ array_cmp(FunctionCallInfo fcinfo) int32 cmpresult; /* Get elements, checking for NULL */ - elt1 = array_iter_next(&it1, &isnull1, i, typlen, typbyval, typalign); - elt2 = array_iter_next(&it2, &isnull2, i, typlen, typbyval, typalign); + elt1 = array_iter_next(&it1, &isnull1, i); + elt2 = array_iter_next(&it2, &isnull2, i); /* * We consider two NULLs equal; NULL > not-NULL. @@ -4238,7 +4249,7 @@ hash_array(PG_FUNCTION_ARGS) /* Loop over source data */ nitems = ArrayGetNItems(ndims, dims); - array_iter_setup(&iter, array); + array_iter_setup(&iter, array, typlen, typbyval, typalign); for (i = 0; i < nitems; i++) { @@ -4247,7 +4258,7 @@ hash_array(PG_FUNCTION_ARGS) uint32 elthash; /* Get element, checking for NULL */ - elt = array_iter_next(&iter, &isnull, i, typlen, typbyval, typalign); + elt = array_iter_next(&iter, &isnull, i); if (isnull) { @@ -4328,7 +4339,7 @@ hash_array_extended(PG_FUNCTION_ARGS) /* Loop over source data */ nitems = ArrayGetNItems(ndims, dims); - array_iter_setup(&iter, array); + array_iter_setup(&iter, array, typlen, typbyval, typalign); for (i = 0; i < nitems; i++) { @@ -4337,7 +4348,7 @@ hash_array_extended(PG_FUNCTION_ARGS) uint64 elthash; /* Get element, checking for NULL */ - elt = array_iter_next(&iter, &isnull, i, typlen, typbyval, typalign); + elt = array_iter_next(&iter, &isnull, i); if (isnull) { @@ -4451,7 +4462,7 @@ array_contain_compare(AnyArrayType 
*array1, AnyArrayType *array2, Oid collation, /* Loop over source data */ nelems1 = ArrayGetNItems(AARR_NDIM(array1), AARR_DIMS(array1)); - array_iter_setup(&it1, array1); + array_iter_setup(&it1, array1, typlen, typbyval, typalign); for (i = 0; i < nelems1; i++) { @@ -4459,7 +4470,7 @@ array_contain_compare(AnyArrayType *array1, AnyArrayType *array2, Oid collation, bool isnull1; /* Get element, checking for NULL */ - elt1 = array_iter_next(&it1, &isnull1, i, typlen, typbyval, typalign); + elt1 = array_iter_next(&it1, &isnull1, i); /* * We assume that the comparison operator is strict, so a NULL can't @@ -4626,6 +4637,7 @@ array_create_iterator(ArrayType *arr, int slice_ndim, ArrayMetaState *mstate) &iterator->typlen, &iterator->typbyval, &iterator->typalign); + iterator->typalignby = typalign_to_alignby(iterator->typalign); /* * Remember the slicing parameters. @@ -4700,7 +4712,7 @@ array_iterate(ArrayIterator iterator, Datum *value, bool *isnull) /* Move our data pointer forward to the next element */ p = att_addlength_pointer(p, iterator->typlen, p); - p = (char *) att_align_nominal(p, iterator->typalign); + p = (char *) att_nominal_alignby(p, iterator->typalignby); iterator->data_ptr = p; } } @@ -4730,7 +4742,7 @@ array_iterate(ArrayIterator iterator, Datum *value, bool *isnull) /* Move our data pointer forward to the next element */ p = att_addlength_pointer(p, iterator->typlen, p); - p = (char *) att_align_nominal(p, iterator->typalign); + p = (char *) att_nominal_alignby(p, iterator->typalignby); } } @@ -4828,7 +4840,7 @@ static int ArrayCastAndSet(Datum src, int typlen, bool typbyval, - char typalign, + uint8 typalignby, char *dest) { int inc; @@ -4839,14 +4851,14 @@ ArrayCastAndSet(Datum src, store_att_byval(dest, src, typlen); else memmove(dest, DatumGetPointer(src), typlen); - inc = att_align_nominal(typlen, typalign); + inc = att_nominal_alignby(typlen, typalignby); } else { Assert(!typbyval); inc = att_addlength_datum(0, typlen, src); memmove(dest, 
DatumGetPointer(src), inc); - inc = att_align_nominal(inc, typalign); + inc = att_nominal_alignby(inc, typalignby); } return inc; @@ -4867,12 +4879,13 @@ static char * array_seek(char *ptr, int offset, bits8 *nullbitmap, int nitems, int typlen, bool typbyval, char typalign) { + uint8 typalignby = typalign_to_alignby(typalign); int bitmask; int i; /* easy if fixed-size elements and no NULLs */ if (typlen > 0 && !nullbitmap) - return ptr + nitems * ((Size) att_align_nominal(typlen, typalign)); + return ptr + nitems * ((Size) att_nominal_alignby(typlen, typalignby)); /* seems worth having separate loops for NULL and no-NULLs cases */ if (nullbitmap) @@ -4885,7 +4898,7 @@ array_seek(char *ptr, int offset, bits8 *nullbitmap, int nitems, if (*nullbitmap & bitmask) { ptr = att_addlength_pointer(ptr, typlen, ptr); - ptr = (char *) att_align_nominal(ptr, typalign); + ptr = (char *) att_nominal_alignby(ptr, typalignby); } bitmask <<= 1; if (bitmask == 0x100) @@ -4900,7 +4913,7 @@ array_seek(char *ptr, int offset, bits8 *nullbitmap, int nitems, for (i = 0; i < nitems; i++) { ptr = att_addlength_pointer(ptr, typlen, ptr); - ptr = (char *) att_align_nominal(ptr, typalign); + ptr = (char *) att_nominal_alignby(ptr, typalignby); } } return ptr; @@ -5050,12 +5063,13 @@ array_slice_size(char *arraydataptr, bits8 *arraynullsptr, j, inc; int count = 0; + uint8 typalignby = typalign_to_alignby(typalign); mda_get_range(ndim, span, st, endp); /* Pretty easy for fixed element length without nulls ... 
*/ if (typlen > 0 && !arraynullsptr) - return ArrayGetNItems(ndim, span) * att_align_nominal(typlen, typalign); + return ArrayGetNItems(ndim, span) * att_nominal_alignby(typlen, typalignby); /* Else gotta do it the hard way */ src_offset = ArrayGetOffset(ndim, dim, lb, st); @@ -5077,7 +5091,7 @@ array_slice_size(char *arraydataptr, bits8 *arraynullsptr, if (!array_get_isnull(arraynullsptr, src_offset)) { inc = att_addlength_pointer(0, typlen, ptr); - inc = att_align_nominal(inc, typalign); + inc = att_nominal_alignby(inc, typalignby); ptr += inc; count += inc; } @@ -6096,6 +6110,7 @@ array_fill_internal(ArrayType *dims, ArrayType *lbs, int16 elmlen; bool elmbyval; char elmalign; + uint8 elmalignby; ArrayMetaState *my_extra; /* @@ -6190,6 +6205,7 @@ array_fill_internal(ArrayType *dims, ArrayType *lbs, elmlen = my_extra->typlen; elmbyval = my_extra->typbyval; elmalign = my_extra->typalign; + elmalignby = typalign_to_alignby(elmalign); /* compute required space */ if (!isnull) @@ -6204,7 +6220,7 @@ array_fill_internal(ArrayType *dims, ArrayType *lbs, value = PointerGetDatum(PG_DETOAST_DATUM(value)); nbytes = att_addlength_datum(0, elmlen, value); - nbytes = att_align_nominal(nbytes, elmalign); + nbytes = att_nominal_alignby(nbytes, elmalignby); Assert(nbytes > 0); totbytes = nbytes * nitems; @@ -6228,7 +6244,7 @@ array_fill_internal(ArrayType *dims, ArrayType *lbs, p = ARR_DATA_PTR(result); for (i = 0; i < nitems; i++) - p += ArrayCastAndSet(value, elmlen, elmbyval, elmalign, p); + p += ArrayCastAndSet(value, elmlen, elmbyval, elmalignby, p); } else { @@ -6259,9 +6275,6 @@ array_unnest(PG_FUNCTION_ARGS) array_iter iter; int nextelem; int numelems; - int16 elmlen; - bool elmbyval; - char elmalign; } array_unnest_fctx; FuncCallContext *funcctx; @@ -6272,6 +6285,9 @@ array_unnest(PG_FUNCTION_ARGS) if (SRF_IS_FIRSTCALL()) { AnyArrayType *arr; + int16 elmlen; + bool elmbyval; + char elmalign; /* create a function context for cross-call persistence */ funcctx = 
SRF_FIRSTCALL_INIT(); @@ -6293,23 +6309,24 @@ array_unnest(PG_FUNCTION_ARGS) /* allocate memory for user context */ fctx = palloc_object(array_unnest_fctx); - /* initialize state */ - array_iter_setup(&fctx->iter, arr); - fctx->nextelem = 0; - fctx->numelems = ArrayGetNItems(AARR_NDIM(arr), AARR_DIMS(arr)); - + /* get element-type data */ if (VARATT_IS_EXPANDED_HEADER(arr)) { /* we can just grab the type data from expanded array */ - fctx->elmlen = arr->xpn.typlen; - fctx->elmbyval = arr->xpn.typbyval; - fctx->elmalign = arr->xpn.typalign; + elmlen = arr->xpn.typlen; + elmbyval = arr->xpn.typbyval; + elmalign = arr->xpn.typalign; } else get_typlenbyvalalign(AARR_ELEMTYPE(arr), - &fctx->elmlen, - &fctx->elmbyval, - &fctx->elmalign); + &elmlen, + &elmbyval, + &elmalign); + + /* initialize state */ + array_iter_setup(&fctx->iter, arr, elmlen, elmbyval, elmalign); + fctx->nextelem = 0; + fctx->numelems = ArrayGetNItems(AARR_NDIM(arr), AARR_DIMS(arr)); funcctx->user_fctx = fctx; MemoryContextSwitchTo(oldcontext); @@ -6324,8 +6341,7 @@ array_unnest(PG_FUNCTION_ARGS) int offset = fctx->nextelem++; Datum elem; - elem = array_iter_next(&fctx->iter, &fcinfo->isnull, offset, - fctx->elmlen, fctx->elmbyval, fctx->elmalign); + elem = array_iter_next(&fctx->iter, &fcinfo->isnull, offset); SRF_RETURN_NEXT(funcctx, elem); } @@ -6401,6 +6417,7 @@ array_replace_internal(ArrayType *array, int typlen; bool typbyval; char typalign; + uint8 typalignby; char *arraydataptr; bits8 *bitmap; int bitmask; @@ -6445,6 +6462,7 @@ array_replace_internal(ArrayType *array, typlen = typentry->typlen; typbyval = typentry->typbyval; typalign = typentry->typalign; + typalignby = typalign_to_alignby(typalign); /* * Detoast values if they are toasted. 
The replacement value must be @@ -6506,7 +6524,7 @@ array_replace_internal(ArrayType *array, isNull = false; elt = fetch_att(arraydataptr, typbyval, typlen); arraydataptr = att_addlength_datum(arraydataptr, typlen, elt); - arraydataptr = (char *) att_align_nominal(arraydataptr, typalign); + arraydataptr = (char *) att_nominal_alignby(arraydataptr, typalignby); if (search_isnull) { @@ -6553,7 +6571,7 @@ array_replace_internal(ArrayType *array, { /* Update total result size */ nbytes = att_addlength_datum(nbytes, typlen, values[nresult]); - nbytes = att_align_nominal(nbytes, typalign); + nbytes = att_nominal_alignby(nbytes, typalignby); /* check for overflow of total request */ if (!AllocSizeIsValid(nbytes)) ereport(ERROR, @@ -6860,6 +6878,7 @@ width_bucket_array_variable(Datum operand, int typlen = typentry->typlen; bool typbyval = typentry->typbyval; char typalign = typentry->typalign; + uint8 typalignby = typalign_to_alignby(typalign); int left; int right; @@ -6883,7 +6902,7 @@ width_bucket_array_variable(Datum operand, for (i = left; i < mid; i++) { ptr = att_addlength_pointer(ptr, typlen, ptr); - ptr = (char *) att_align_nominal(ptr, typalign); + ptr = (char *) att_nominal_alignby(ptr, typalignby); } locfcinfo->args[0].value = operand; @@ -6908,7 +6927,7 @@ width_bucket_array_variable(Datum operand, * ensures we do only O(N) array indexing work, not O(N^2). 
*/ ptr = att_addlength_pointer(ptr, typlen, ptr); - thresholds_data = (char *) att_align_nominal(ptr, typalign); + thresholds_data = (char *) att_nominal_alignby(ptr, typalignby); } } diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c index 3c7f54f2638..f5f835e944a 100644 --- a/src/backend/utils/adt/encode.c +++ b/src/backend/utils/adt/encode.c @@ -290,7 +290,7 @@ hex_decode_safe_scalar(const char *src, size_t len, char *dst, Node *escontext) ereturn(escontext, 0, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid hexadecimal digit: \"%.*s\"", - pg_mblen(s), s))); + pg_mblen_range(s, srcend), s))); s++; if (s >= srcend) ereturn(escontext, 0, @@ -300,7 +300,7 @@ hex_decode_safe_scalar(const char *src, size_t len, char *dst, Node *escontext) ereturn(escontext, 0, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid hexadecimal digit: \"%.*s\"", - pg_mblen(s), s))); + pg_mblen_range(s, srcend), s))); s++; *p++ = (v1 << 4) | v2; } @@ -564,7 +564,7 @@ pg_base64_decode_internal(const char *src, size_t len, char *dst, bool url) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid symbol \"%.*s\" found while decoding %s sequence", - pg_mblen(s - 1), s - 1, + pg_mblen_range(s - 1, srcend), s - 1, url ? 
"base64url" : "base64"))); } } diff --git a/src/backend/utils/adt/format_type.c b/src/backend/utils/adt/format_type.c index 544205ca067..3cd5053d118 100644 --- a/src/backend/utils/adt/format_type.c +++ b/src/backend/utils/adt/format_type.c @@ -448,11 +448,15 @@ oidvectortypes(PG_FUNCTION_ARGS) { oidvector *oidArray = (oidvector *) PG_GETARG_POINTER(0); char *result; - int numargs = oidArray->dim1; + int numargs; int num; size_t total; size_t left; + /* validate input before fetching dim1 */ + check_valid_oidvector(oidArray); + numargs = oidArray->dim1; + total = 20 * numargs + 1; result = palloc(total); result[0] = '\0'; diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index cf580c63c78..7720911a6a9 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -1438,7 +1438,7 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw, ereport(ERROR, (errcode(ERRCODE_INVALID_DATETIME_FORMAT), errmsg("invalid datetime format separator: \"%s\"", - pnstrdup(str, pg_mblen(str))))); + pnstrdup(str, pg_mblen_cstr(str))))); if (*str == ' ') n->type = NODE_TYPE_SPACE; @@ -1468,7 +1468,7 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw, /* backslash quotes the next character, if any */ if (*str == '\\' && *(str + 1)) str++; - chlen = pg_mblen(str); + chlen = pg_mblen_cstr(str); n->type = NODE_TYPE_CHAR; memcpy(n->character, str, chlen); n->character[chlen] = '\0'; @@ -1486,7 +1486,7 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw, */ if (*str == '\\' && *(str + 1) == '"') str++; - chlen = pg_mblen(str); + chlen = pg_mblen_cstr(str); if ((flags & DCH_FLAG) && is_separator_char(str)) n->type = NODE_TYPE_SEPARATOR; @@ -1992,8 +1992,8 @@ asc_toupper_z(const char *buff) do { \ if (IS_SUFFIX_THth(_suf)) \ { \ - if (*(ptr)) (ptr) += pg_mblen(ptr); \ - if (*(ptr)) (ptr) += pg_mblen(ptr); \ + if (*(ptr)) (ptr) += pg_mblen_cstr(ptr); \ + if (*(ptr)) (ptr) += 
pg_mblen_cstr(ptr); \ } \ } while (0) @@ -3183,7 +3183,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, * insist that the consumed character match the format's * character. */ - s += pg_mblen(s); + s += pg_mblen_cstr(s); } continue; } @@ -3205,11 +3205,11 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, if (extra_skip > 0) extra_skip--; else - s += pg_mblen(s); + s += pg_mblen_cstr(s); } else { - int chlen = pg_mblen(s); + int chlen = pg_mblen_cstr(s); /* * Standard mode requires strict match of format characters. @@ -5724,13 +5724,15 @@ NUM_numpart_to_char(NUMProc *Np, int id) static void NUM_eat_non_data_chars(NUMProc *Np, int n, size_t input_len) { + const char *end = Np->inout + input_len; + while (n-- > 0) { if (OVERLOAD_TEST) break; /* end of input */ if (strchr("0123456789.,+-", *Np->inout_p) != NULL) break; /* it's a data character */ - Np->inout_p += pg_mblen(Np->inout_p); + Np->inout_p += pg_mblen_range(Np->inout_p, end); } } @@ -6167,7 +6169,7 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout, } else { - Np->inout_p += pg_mblen(Np->inout_p); + Np->inout_p += pg_mblen_range(Np->inout_p, Np->inout + input_len); } continue; } diff --git a/src/backend/utils/adt/int.c b/src/backend/utils/adt/int.c index d2302626585..ff54d50ea9d 100644 --- a/src/backend/utils/adt/int.c +++ b/src/backend/utils/adt/int.c @@ -134,6 +134,30 @@ buildint2vector(const int16 *int2s, int n) return result; } +/* + * validate that an array object meets the restrictions of int2vector + * + * We need this because there are pathways by which a general int2[] array can + * be cast to int2vector, allowing the type's restrictions to be violated. + * All code that receives an int2vector as a SQL parameter should check this. 
+ */ +static void +check_valid_int2vector(const int2vector *int2Array) +{ + /* + * We insist on ndim == 1 and dataoffset == 0 (that is, no nulls) because + * otherwise the array's layout will not be what calling code expects. We + * needn't be picky about the index lower bound though. Checking elemtype + * is just paranoia. + */ + if (int2Array->ndim != 1 || + int2Array->dataoffset != 0 || + int2Array->elemtype != INT2OID) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("array is not a valid int2vector"))); +} + /* * int2vectorin - converts "num num ..." to internal form */ @@ -208,10 +232,14 @@ int2vectorout(PG_FUNCTION_ARGS) { int2vector *int2Array = (int2vector *) PG_GETARG_POINTER(0); int num, - nnums = int2Array->dim1; + nnums; char *rp; char *result; + /* validate input before fetching dim1 */ + check_valid_int2vector(int2Array); + nnums = int2Array->dim1; + /* assumes sign, 5 digits, ' ' */ rp = result = (char *) palloc(nnums * 7 + 1); for (num = 0; num < nnums; num++) @@ -272,6 +300,7 @@ int2vectorrecv(PG_FUNCTION_ARGS) Datum int2vectorsend(PG_FUNCTION_ARGS) { + /* We don't do check_valid_int2vector, since array_send won't care */ return array_send(fcinfo); } diff --git a/src/backend/utils/adt/jsonfuncs.c b/src/backend/utils/adt/jsonfuncs.c index 1e5b60801e4..d5b64d7fca5 100644 --- a/src/backend/utils/adt/jsonfuncs.c +++ b/src/backend/utils/adt/jsonfuncs.c @@ -695,7 +695,7 @@ report_json_context(JsonLexContext *lex) { /* Advance to next multibyte character */ if (IS_HIGHBIT_SET(*context_start)) - context_start += pg_mblen(context_start); + context_start += pg_mblen_range(context_start, context_end); else context_start++; } diff --git a/src/backend/utils/adt/jsonpath_gram.y b/src/backend/utils/adt/jsonpath_gram.y index 4543626ffc8..87070235d11 100644 --- a/src/backend/utils/adt/jsonpath_gram.y +++ b/src/backend/utils/adt/jsonpath_gram.y @@ -599,7 +599,8 @@ makeItemLikeRegex(JsonPathParseItem *expr, JsonPathString *pattern, 
(errcode(ERRCODE_SYNTAX_ERROR), errmsg("invalid input syntax for type %s", "jsonpath"), errdetail("Unrecognized flag character \"%.*s\" in LIKE_REGEX predicate.", - pg_mblen(flags->val + i), flags->val + i))); + pg_mblen_range(flags->val + i, flags->val + flags->len), + flags->val + i))); break; } } diff --git a/src/backend/utils/adt/levenshtein.c b/src/backend/utils/adt/levenshtein.c index fb2ba591acd..5b3d84029f6 100644 --- a/src/backend/utils/adt/levenshtein.c +++ b/src/backend/utils/adt/levenshtein.c @@ -83,6 +83,8 @@ varstr_levenshtein(const char *source, int slen, int *s_char_len = NULL; int j; const char *y; + const char *send = source + slen; + const char *tend = target + tlen; /* * For varstr_levenshtein_less_equal, we have real variables called @@ -183,10 +185,10 @@ varstr_levenshtein(const char *source, int slen, #endif /* - * In order to avoid calling pg_mblen() repeatedly on each character in s, - * we cache all the lengths before starting the main loop -- but if all - * the characters in both strings are single byte, then we skip this and - * use a fast-path in the main loop. If only one string contains + * In order to avoid calling pg_mblen_range() repeatedly on each character + * in s, we cache all the lengths before starting the main loop -- but if + * all the characters in both strings are single byte, then we skip this + * and use a fast-path in the main loop. If only one string contains * multi-byte characters, we still build the array, so that the fast-path * needn't deal with the case where the array hasn't been initialized. */ @@ -198,7 +200,7 @@ varstr_levenshtein(const char *source, int slen, s_char_len = (int *) palloc((m + 1) * sizeof(int)); for (i = 0; i < m; ++i) { - s_char_len[i] = pg_mblen(cp); + s_char_len[i] = pg_mblen_range(cp, send); cp += s_char_len[i]; } s_char_len[i] = 0; @@ -224,7 +226,7 @@ varstr_levenshtein(const char *source, int slen, { int *temp; const char *x = source; - int y_char_len = n != tlen + 1 ? 
pg_mblen(y) : 1; + int y_char_len = n != tlen + 1 ? pg_mblen_range(y, tend) : 1; int i; #ifdef LEVENSHTEIN_LESS_EQUAL diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c index 2143d8658e8..350bc07f210 100644 --- a/src/backend/utils/adt/like.c +++ b/src/backend/utils/adt/like.c @@ -55,20 +55,20 @@ static int Generic_Text_IC_like(text *str, text *pat, Oid collation); *-------------------- */ static inline int -wchareq(const char *p1, const char *p2) +wchareq(const char *p1, int p1len, const char *p2, int p2len) { - int p1_len; + int p1clen; /* Optimization: quickly compare the first byte. */ if (*p1 != *p2) return 0; - p1_len = pg_mblen(p1); - if (pg_mblen(p2) != p1_len) + p1clen = pg_mblen_with_len(p1, p1len); + if (pg_mblen_with_len(p2, p2len) != p1clen) return 0; /* They are the same length */ - while (p1_len--) + while (p1clen--) { if (*p1++ != *p2++) return 0; @@ -93,11 +93,11 @@ wchareq(const char *p1, const char *p2) #define NextByte(p, plen) ((p)++, (plen)--) /* Set up to compile like_match.c for multibyte characters */ -#define CHAREQ(p1, p2) wchareq((p1), (p2)) +#define CHAREQ(p1, p1len, p2, p2len) wchareq((p1), (p1len), (p2), (p2len)) #define NextChar(p, plen) \ - do { int __l = pg_mblen(p); (p) +=__l; (plen) -=__l; } while (0) + do { int __l = pg_mblen_with_len((p), (plen)); (p) +=__l; (plen) -=__l; } while (0) #define CopyAdvChar(dst, src, srclen) \ - do { int __l = pg_mblen(src); \ + do { int __l = pg_mblen_with_len((src), (srclen)); \ (srclen) -= __l; \ while (__l-- > 0) \ *(dst)++ = *(src)++; \ @@ -109,7 +109,7 @@ wchareq(const char *p1, const char *p2) #include "like_match.c" /* Set up to compile like_match.c for single-byte characters */ -#define CHAREQ(p1, p2) (*(p1) == *(p2)) +#define CHAREQ(p1, p1len, p2, p2len) (*(p1) == *(p2)) #define NextChar(p, plen) NextByte((p), (plen)) #define CopyAdvChar(dst, src, srclen) (*(dst)++ = *(src)++, (srclen)--) diff --git a/src/backend/utils/adt/like_match.c 
b/src/backend/utils/adt/like_match.c index 02990ca9a1b..f5f72b82e21 100644 --- a/src/backend/utils/adt/like_match.c +++ b/src/backend/utils/adt/like_match.c @@ -442,6 +442,7 @@ do_like_escape(text *pat, text *esc) errhint("Escape string must be empty or one character."))); e = VARDATA_ANY(esc); + elen = VARSIZE_ANY_EXHDR(esc); /* * If specified escape is '\', just copy the pattern as-is. @@ -460,7 +461,7 @@ do_like_escape(text *pat, text *esc) afterescape = false; while (plen > 0) { - if (CHAREQ(p, e) && !afterescape) + if (CHAREQ(p, plen, e, elen) && !afterescape) { *r++ = '\\'; NextChar(p, plen); diff --git a/src/backend/utils/adt/mcxtfuncs.c b/src/backend/utils/adt/mcxtfuncs.c index 12b8d4cefaf..c7f7b8bc2dd 100644 --- a/src/backend/utils/adt/mcxtfuncs.c +++ b/src/backend/utils/adt/mcxtfuncs.c @@ -19,6 +19,7 @@ #include "mb/pg_wchar.h" #include "storage/proc.h" #include "storage/procarray.h" +#include "storage/procsignal.h" #include "utils/array.h" #include "utils/builtins.h" #include "utils/hsearch.h" diff --git a/src/backend/utils/adt/multirangetypes.c b/src/backend/utils/adt/multirangetypes.c index 07e2a81d46a..9548989d782 100644 --- a/src/backend/utils/adt/multirangetypes.c +++ b/src/backend/utils/adt/multirangetypes.c @@ -485,8 +485,9 @@ multirange_canonicalize(TypeCacheEntry *rangetyp, int32 input_range_count, int32 output_range_count = 0; /* Sort the ranges so we can find the ones that overlap/meet. 
*/ - qsort_arg(ranges, input_range_count, sizeof(RangeType *), range_compare, - rangetyp); + if (ranges != NULL) + qsort_arg(ranges, input_range_count, sizeof(RangeType *), + range_compare, rangetyp); /* Now merge where possible: */ for (i = 0; i < input_range_count; i++) @@ -572,21 +573,22 @@ multirange_size_estimate(TypeCacheEntry *rangetyp, int32 range_count, RangeType **ranges) { char elemalign = rangetyp->rngelemtype->typalign; + uint8 elemalignby = typalign_to_alignby(elemalign); Size size; int32 i; /* * Count space for MultirangeType struct, items and flags. */ - size = att_align_nominal(sizeof(MultirangeType) + - Max(range_count - 1, 0) * sizeof(uint32) + - range_count * sizeof(uint8), elemalign); + size = att_nominal_alignby(sizeof(MultirangeType) + + Max(range_count - 1, 0) * sizeof(uint32) + + range_count * sizeof(uint8), elemalignby); /* Count space for range bounds */ for (i = 0; i < range_count; i++) - size += att_align_nominal(VARSIZE(ranges[i]) - - sizeof(RangeType) - - sizeof(char), elemalign); + size += att_nominal_alignby(VARSIZE(ranges[i]) - + sizeof(RangeType) - + sizeof(char), elemalignby); return size; } @@ -605,6 +607,7 @@ write_multirange_data(MultirangeType *multirange, TypeCacheEntry *rangetyp, const char *begin; char *ptr; char elemalign = rangetyp->rngelemtype->typalign; + uint8 elemalignby = typalign_to_alignby(elemalign); items = MultirangeGetItemsPtr(multirange); flags = MultirangeGetFlagsPtr(multirange); @@ -630,7 +633,7 @@ write_multirange_data(MultirangeType *multirange, TypeCacheEntry *rangetyp, flags[i] = *((char *) ranges[i] + VARSIZE(ranges[i]) - sizeof(char)); len = VARSIZE(ranges[i]) - sizeof(RangeType) - sizeof(char); memcpy(ptr, ranges[i] + 1, len); - ptr += att_align_nominal(len, elemalign); + ptr += att_nominal_alignby(len, elemalignby); } } diff --git a/src/backend/utils/adt/network_selfuncs.c b/src/backend/utils/adt/network_selfuncs.c index 902f9c25db0..2a8d2ded907 100644 --- a/src/backend/utils/adt/network_selfuncs.c 
+++ b/src/backend/utils/adt/network_selfuncs.c @@ -43,9 +43,9 @@ /* Maximum number of items to consider in join selectivity calculations */ #define MAX_CONSIDERED_ELEMS 1024 -static Selectivity networkjoinsel_inner(Oid operator, +static Selectivity networkjoinsel_inner(Oid operator, int opr_codenum, VariableStatData *vardata1, VariableStatData *vardata2); -static Selectivity networkjoinsel_semi(Oid operator, +static Selectivity networkjoinsel_semi(Oid operator, int opr_codenum, VariableStatData *vardata1, VariableStatData *vardata2); static Selectivity mcv_population(float4 *mcv_numbers, int mcv_nvalues); static Selectivity inet_hist_value_sel(const Datum *values, int nvalues, @@ -82,6 +82,7 @@ networksel(PG_FUNCTION_ARGS) Oid operator = PG_GETARG_OID(1); List *args = (List *) PG_GETARG_POINTER(2); int varRelid = PG_GETARG_INT32(3); + int opr_codenum; VariableStatData vardata; Node *other; bool varonleft; @@ -95,6 +96,14 @@ networksel(PG_FUNCTION_ARGS) nullfrac; FmgrInfo proc; + /* + * Before all else, verify that the operator is one of the ones supported + * by this function, which in turn proves that the input datatypes are + * what we expect. Otherwise, attaching this selectivity function to some + * unexpected operator could cause trouble. + */ + opr_codenum = inet_opr_codenum(operator); + /* * If expression is not (variable op something) or (something op * variable), then punt and return a default estimate. @@ -150,13 +159,12 @@ networksel(PG_FUNCTION_ARGS) STATISTIC_KIND_HISTOGRAM, InvalidOid, ATTSTATSSLOT_VALUES)) { - int opr_codenum = inet_opr_codenum(operator); + int h_codenum; /* Commute if needed, so we can consider histogram to be on the left */ - if (!varonleft) - opr_codenum = -opr_codenum; + h_codenum = varonleft ? 
opr_codenum : -opr_codenum; non_mcv_selec = inet_hist_value_sel(hslot.values, hslot.nvalues, - constvalue, opr_codenum); + constvalue, h_codenum); free_attstatsslot(&hslot); } @@ -203,10 +211,19 @@ networkjoinsel(PG_FUNCTION_ARGS) #endif SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4); double selec; + int opr_codenum; VariableStatData vardata1; VariableStatData vardata2; bool join_is_reversed; + /* + * Before all else, verify that the operator is one of the ones supported + * by this function, which in turn proves that the input datatypes are + * what we expect. Otherwise, attaching this selectivity function to some + * unexpected operator could cause trouble. + */ + opr_codenum = inet_opr_codenum(operator); + get_join_variables(root, args, sjinfo, &vardata1, &vardata2, &join_is_reversed); @@ -220,15 +237,18 @@ networkjoinsel(PG_FUNCTION_ARGS) * Selectivity for left/full join is not exactly the same as inner * join, but we neglect the difference, as eqjoinsel does. */ - selec = networkjoinsel_inner(operator, &vardata1, &vardata2); + selec = networkjoinsel_inner(operator, opr_codenum, + &vardata1, &vardata2); break; case JOIN_SEMI: case JOIN_ANTI: /* Here, it's important that we pass the outer var on the left. */ if (!join_is_reversed) - selec = networkjoinsel_semi(operator, &vardata1, &vardata2); + selec = networkjoinsel_semi(operator, opr_codenum, + &vardata1, &vardata2); else selec = networkjoinsel_semi(get_commutator(operator), + -opr_codenum, &vardata2, &vardata1); break; default: @@ -260,7 +280,7 @@ networkjoinsel(PG_FUNCTION_ARGS) * Also, MCV vs histogram selectivity is not neglected as in eqjoinsel_inner(). 
*/ static Selectivity -networkjoinsel_inner(Oid operator, +networkjoinsel_inner(Oid operator, int opr_codenum, VariableStatData *vardata1, VariableStatData *vardata2) { Form_pg_statistic stats; @@ -273,7 +293,6 @@ networkjoinsel_inner(Oid operator, mcv2_exists = false, hist1_exists = false, hist2_exists = false; - int opr_codenum; int mcv1_length = 0, mcv2_length = 0; AttStatsSlot mcv1_slot; @@ -325,8 +344,6 @@ networkjoinsel_inner(Oid operator, memset(&hist2_slot, 0, sizeof(hist2_slot)); } - opr_codenum = inet_opr_codenum(operator); - /* * Calculate selectivity for MCV vs MCV matches. */ @@ -387,7 +404,7 @@ networkjoinsel_inner(Oid operator, * histogram selectivity for semi/anti join cases. */ static Selectivity -networkjoinsel_semi(Oid operator, +networkjoinsel_semi(Oid operator, int opr_codenum, VariableStatData *vardata1, VariableStatData *vardata2) { Form_pg_statistic stats; @@ -401,7 +418,6 @@ networkjoinsel_semi(Oid operator, mcv2_exists = false, hist1_exists = false, hist2_exists = false; - int opr_codenum; FmgrInfo proc; int i, mcv1_length = 0, @@ -455,7 +471,6 @@ networkjoinsel_semi(Oid operator, memset(&hist2_slot, 0, sizeof(hist2_slot)); } - opr_codenum = inet_opr_codenum(operator); fmgr_info(get_opcode(operator), &proc); /* Estimate number of input rows represented by RHS histogram. */ @@ -827,6 +842,9 @@ inet_semi_join_sel(Datum lhs_value, /* * Assign useful code numbers for the subnet inclusion/overlap operators * + * This will throw an error if the operator is not one of the ones we + * support in networksel() and networkjoinsel(). 
+ * * Only inet_masklen_inclusion_cmp() and inet_hist_match_divider() depend * on the exact codes assigned here; but many other places in this file * know that they can negate a code to obtain the code for the commutator diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c index 891ae6ba7fe..3bd3635d98a 100644 --- a/src/backend/utils/adt/numeric.c +++ b/src/backend/utils/adt/numeric.c @@ -48,8 +48,8 @@ * Uncomment the following to enable compilation of dump_numeric() * and dump_var() and to get a dump of any result produced by make_result(). * ---------- -#define NUMERIC_DEBUG */ +/* #define NUMERIC_DEBUG */ /* ---------- diff --git a/src/backend/utils/adt/oid.c b/src/backend/utils/adt/oid.c index 6f4c299dee9..a3419728971 100644 --- a/src/backend/utils/adt/oid.c +++ b/src/backend/utils/adt/oid.c @@ -107,6 +107,30 @@ buildoidvector(const Oid *oids, int n) return result; } +/* + * validate that an array object meets the restrictions of oidvector + * + * We need this because there are pathways by which a general oid[] array can + * be cast to oidvector, allowing the type's restrictions to be violated. + * All code that receives an oidvector as a SQL parameter should check this. + */ +void +check_valid_oidvector(const oidvector *oidArray) +{ + /* + * We insist on ndim == 1 and dataoffset == 0 (that is, no nulls) because + * otherwise the array's layout will not be what calling code expects. We + * needn't be picky about the index lower bound though. Checking elemtype + * is just paranoia. + */ + if (oidArray->ndim != 1 || + oidArray->dataoffset != 0 || + oidArray->elemtype != OIDOID) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("array is not a valid oidvector"))); +} + /* * oidvectorin - converts "num num ..." 
to internal form */ @@ -159,10 +183,14 @@ oidvectorout(PG_FUNCTION_ARGS) { oidvector *oidArray = (oidvector *) PG_GETARG_POINTER(0); int num, - nnums = oidArray->dim1; + nnums; char *rp; char *result; + /* validate input before fetching dim1 */ + check_valid_oidvector(oidArray); + nnums = oidArray->dim1; + /* assumes sign, 10 digits, ' ' */ rp = result = (char *) palloc(nnums * 12 + 1); for (num = 0; num < nnums; num++) @@ -225,6 +253,7 @@ oidvectorrecv(PG_FUNCTION_ARGS) Datum oidvectorsend(PG_FUNCTION_ARGS) { + /* We don't do check_valid_oidvector, since array_send won't care */ return array_send(fcinfo); } diff --git a/src/backend/utils/adt/oracle_compat.c b/src/backend/utils/adt/oracle_compat.c index a003f90066c..5b0d098bd07 100644 --- a/src/backend/utils/adt/oracle_compat.c +++ b/src/backend/utils/adt/oracle_compat.c @@ -169,8 +169,8 @@ lpad(PG_FUNCTION_ARGS) char *ptr1, *ptr2, *ptr2start, - *ptr2end, *ptr_ret; + const char *ptr2end; int m, s1len, s2len; @@ -215,7 +215,7 @@ lpad(PG_FUNCTION_ARGS) while (m--) { - int mlen = pg_mblen(ptr2); + int mlen = pg_mblen_range(ptr2, ptr2end); memcpy(ptr_ret, ptr2, mlen); ptr_ret += mlen; @@ -228,7 +228,7 @@ lpad(PG_FUNCTION_ARGS) while (s1len--) { - int mlen = pg_mblen(ptr1); + int mlen = pg_mblen_unbounded(ptr1); memcpy(ptr_ret, ptr1, mlen); ptr_ret += mlen; @@ -267,8 +267,8 @@ rpad(PG_FUNCTION_ARGS) char *ptr1, *ptr2, *ptr2start, - *ptr2end, *ptr_ret; + const char *ptr2end; int m, s1len, s2len; @@ -308,11 +308,12 @@ rpad(PG_FUNCTION_ARGS) m = len - s1len; ptr1 = VARDATA_ANY(string1); + ptr_ret = VARDATA(ret); while (s1len--) { - int mlen = pg_mblen(ptr1); + int mlen = pg_mblen_unbounded(ptr1); memcpy(ptr_ret, ptr1, mlen); ptr_ret += mlen; @@ -324,7 +325,7 @@ rpad(PG_FUNCTION_ARGS) while (m--) { - int mlen = pg_mblen(ptr2); + int mlen = pg_mblen_range(ptr2, ptr2end); memcpy(ptr_ret, ptr2, mlen); ptr_ret += mlen; @@ -409,6 +410,7 @@ dotrim(const char *string, int stringlen, */ const char **stringchars; const char 
**setchars; + const char *setend; int *stringmblen; int *setmblen; int stringnchars; @@ -416,6 +418,7 @@ dotrim(const char *string, int stringlen, int resultndx; int resultnchars; const char *p; + const char *pend; int len; int mblen; const char *str_pos; @@ -426,10 +429,11 @@ dotrim(const char *string, int stringlen, stringnchars = 0; p = string; len = stringlen; + pend = p + len; while (len > 0) { stringchars[stringnchars] = p; - stringmblen[stringnchars] = mblen = pg_mblen(p); + stringmblen[stringnchars] = mblen = pg_mblen_range(p, pend); stringnchars++; p += mblen; len -= mblen; @@ -440,10 +444,11 @@ dotrim(const char *string, int stringlen, setnchars = 0; p = set; len = setlen; + setend = set + setlen; while (len > 0) { setchars[setnchars] = p; - setmblen[setnchars] = mblen = pg_mblen(p); + setmblen[setnchars] = mblen = pg_mblen_range(p, setend); setnchars++; p += mblen; len -= mblen; @@ -821,6 +826,8 @@ translate(PG_FUNCTION_ARGS) *to_end; char *source, *target; + const char *source_end; + const char *from_end; int m, fromlen, tolen, @@ -835,9 +842,11 @@ translate(PG_FUNCTION_ARGS) if (m <= 0) PG_RETURN_TEXT_P(string); source = VARDATA_ANY(string); + source_end = source + m; fromlen = VARSIZE_ANY_EXHDR(from); from_ptr = VARDATA_ANY(from); + from_end = from_ptr + fromlen; tolen = VARSIZE_ANY_EXHDR(to); to_ptr = VARDATA_ANY(to); to_end = to_ptr + tolen; @@ -861,12 +870,12 @@ translate(PG_FUNCTION_ARGS) while (m > 0) { - source_len = pg_mblen(source); + source_len = pg_mblen_range(source, source_end); from_index = 0; for (i = 0; i < fromlen; i += len) { - len = pg_mblen(&from_ptr[i]); + len = pg_mblen_range(&from_ptr[i], from_end); if (len == source_len && memcmp(source, &from_ptr[i], len) == 0) break; @@ -882,11 +891,11 @@ translate(PG_FUNCTION_ARGS) { if (p >= to_end) break; - p += pg_mblen(p); + p += pg_mblen_range(p, to_end); } if (p < to_end) { - len = pg_mblen(p); + len = pg_mblen_range(p, to_end); memcpy(target, p, len); target += len; retlen += len; diff 
--git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c index 2f96e889595..78f6ea161a0 100644 --- a/src/backend/utils/adt/pg_locale_libc.c +++ b/src/backend/utils/adt/pg_locale_libc.c @@ -527,11 +527,11 @@ strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, result_size = wchar2char(result, workspace, max_size + 1, loc); - if (result_size + 1 > destsize) - return result_size; - - memcpy(dest, result, result_size); - dest[result_size] = '\0'; + if (destsize >= result_size + 1) + { + memcpy(dest, result, result_size); + dest[result_size] = '\0'; + } pfree(workspace); pfree(result); @@ -638,11 +638,11 @@ strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, result_size = wchar2char(result, workspace, max_size + 1, loc); - if (result_size + 1 > destsize) - return result_size; - - memcpy(dest, result, result_size); - dest[result_size] = '\0'; + if (destsize >= result_size + 1) + { + memcpy(dest, result, result_size); + dest[result_size] = '\0'; + } pfree(workspace); pfree(result); @@ -725,11 +725,11 @@ strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, result_size = wchar2char(result, workspace, max_size + 1, loc); - if (result_size + 1 > destsize) - return result_size; - - memcpy(dest, result, result_size); - dest[result_size] = '\0'; + if (destsize >= result_size + 1) + { + memcpy(dest, result, result_size); + dest[result_size] = '\0'; + } pfree(workspace); pfree(result); diff --git a/src/backend/utils/adt/pg_upgrade_support.c b/src/backend/utils/adt/pg_upgrade_support.c index 697143aec44..b505a6b4fee 100644 --- a/src/backend/utils/adt/pg_upgrade_support.c +++ b/src/backend/utils/adt/pg_upgrade_support.c @@ -282,11 +282,12 @@ binary_upgrade_set_missing_value(PG_FUNCTION_ARGS) * upgraded without data loss. 
*/ Datum -binary_upgrade_logical_slot_has_caught_up(PG_FUNCTION_ARGS) +binary_upgrade_check_logical_slot_pending_wal(PG_FUNCTION_ARGS) { Name slot_name; XLogRecPtr end_of_wal; - bool found_pending_wal; + XLogRecPtr scan_cutoff_lsn; + XLogRecPtr last_pending_wal; CHECK_IS_BINARY_UPGRADE; @@ -297,6 +298,7 @@ binary_upgrade_logical_slot_has_caught_up(PG_FUNCTION_ARGS) Assert(has_rolreplication(GetUserId())); slot_name = PG_GETARG_NAME(0); + scan_cutoff_lsn = PG_GETARG_LSN(1); /* Acquire the given slot */ ReplicationSlotAcquire(NameStr(*slot_name), true, true); @@ -307,12 +309,16 @@ binary_upgrade_logical_slot_has_caught_up(PG_FUNCTION_ARGS) Assert(MyReplicationSlot->data.invalidated == RS_INVAL_NONE); end_of_wal = GetFlushRecPtr(NULL); - found_pending_wal = LogicalReplicationSlotHasPendingWal(end_of_wal); + last_pending_wal = LogicalReplicationSlotCheckPendingWal(end_of_wal, + scan_cutoff_lsn); /* Clean up */ ReplicationSlotRelease(); - PG_RETURN_BOOL(!found_pending_wal); + if (XLogRecPtrIsValid(last_pending_wal)) + PG_RETURN_LSN(last_pending_wal); + else + PG_RETURN_NULL(); } /* diff --git a/src/backend/utils/adt/rangetypes_typanalyze.c b/src/backend/utils/adt/rangetypes_typanalyze.c index 38d12dedbc5..278d4e6941a 100644 --- a/src/backend/utils/adt/rangetypes_typanalyze.c +++ b/src/backend/utils/adt/rangetypes_typanalyze.c @@ -398,7 +398,7 @@ compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, stats->statypid[slot_idx] = FLOAT8OID; stats->statyplen[slot_idx] = sizeof(float8); stats->statypbyval[slot_idx] = true; - stats->statypalign[slot_idx] = 'd'; + stats->statypalign[slot_idx] = TYPALIGN_DOUBLE; /* Store the fraction of empty ranges */ emptyfrac = palloc_object(float4); diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index 94cd15bbab1..311b9877bbb 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -443,7 +443,7 @@ parse_re_flags(pg_re_flags *flags, text *opts) ereport(ERROR, 
(errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid regular expression option: \"%.*s\"", - pg_mblen(opt_p + i), opt_p + i))); + pg_mblen_range(opt_p + i, opt_p + opt_len), opt_p + i))); break; } } @@ -673,12 +673,13 @@ textregexreplace(PG_FUNCTION_ARGS) if (VARSIZE_ANY_EXHDR(opt) > 0) { char *opt_p = VARDATA_ANY(opt); + const char *end_p = opt_p + VARSIZE_ANY_EXHDR(opt); if (*opt_p >= '0' && *opt_p <= '9') ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid regular expression option: \"%.*s\"", - pg_mblen(opt_p), opt_p), + pg_mblen_range(opt_p, end_p), opt_p), errhint("If you meant to use regexp_replace() with a start parameter, cast the fourth argument to integer explicitly."))); } @@ -772,6 +773,7 @@ similar_escape_internal(text *pat_text, text *esc_text) *r; int plen, elen; + const char *pend; bool afterescape = false; int nquotes = 0; int bracket_depth = 0; /* square bracket nesting level */ @@ -779,6 +781,7 @@ similar_escape_internal(text *pat_text, text *esc_text) p = VARDATA_ANY(pat_text); plen = VARSIZE_ANY_EXHDR(pat_text); + pend = p + plen; if (esc_text == NULL) { /* No ESCAPE clause provided; default to backslash as escape */ @@ -878,7 +881,7 @@ similar_escape_internal(text *pat_text, text *esc_text) if (elen > 1) { - int mblen = pg_mblen(p); + int mblen = pg_mblen_range(p, pend); if (mblen > 1) { diff --git a/src/backend/utils/adt/tsquery.c b/src/backend/utils/adt/tsquery.c index e3bf1fbbfd7..7e54f36c2a7 100644 --- a/src/backend/utils/adt/tsquery.c +++ b/src/backend/utils/adt/tsquery.c @@ -120,7 +120,7 @@ get_modifiers(char *buf, int16 *weight, bool *prefix) return buf; buf++; - while (*buf && pg_mblen(buf) == 1) + while (*buf && pg_mblen_cstr(buf) == 1) { switch (*buf) { @@ -259,12 +259,12 @@ parse_or_operator(TSQueryParserState pstate) return false; /* it shouldn't be a part of any word */ - if (t_iseq(ptr, '-') || t_iseq(ptr, '_') || t_isalnum(ptr)) + if (t_iseq(ptr, '-') || t_iseq(ptr, '_') || t_isalnum_cstr(ptr)) 
return false; for (;;) { - ptr += pg_mblen(ptr); + ptr += pg_mblen_cstr(ptr); if (*ptr == '\0') /* got end of string without operand */ return false; @@ -390,7 +390,7 @@ gettoken_query_standard(TSQueryParserState state, int8 *operator, break; } - state->buf += pg_mblen(state->buf); + state->buf += pg_mblen_cstr(state->buf); } } @@ -502,7 +502,7 @@ gettoken_query_websearch(TSQueryParserState state, int8 *operator, break; } - state->buf += pg_mblen(state->buf); + state->buf += pg_mblen_cstr(state->buf); } } @@ -1014,9 +1014,8 @@ infix(INFIX *in, int parentPriority, bool rightPhraseOp) *(in->cur) = '\\'; in->cur++; } - COPYCHAR(in->cur, op); - clen = pg_mblen(op); + clen = ts_copychar_cstr(in->cur, op); op += clen; in->cur += clen; } diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c index 38342298a5d..024f5160cd4 100644 --- a/src/backend/utils/adt/tsvector.c +++ b/src/backend/utils/adt/tsvector.c @@ -319,9 +319,9 @@ tsvectorout(PG_FUNCTION_ARGS) lenbuf = 0, pp; WordEntry *ptr = ARRPTR(out); - char *curbegin, - *curin, + char *curin, *curout; + const char *curend; lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ; for (i = 0; i < out->size; i++) @@ -334,13 +334,14 @@ tsvectorout(PG_FUNCTION_ARGS) curout = outbuf = (char *) palloc(lenbuf); for (i = 0; i < out->size; i++) { - curbegin = curin = STRPTR(out) + ptr->pos; + curin = STRPTR(out) + ptr->pos; + curend = curin + ptr->len; if (i != 0) *curout++ = ' '; *curout++ = '\''; - while (curin - curbegin < ptr->len) + while (curin < curend) { - int len = pg_mblen(curin); + int len = pg_mblen_range(curin, curend); if (t_iseq(curin, '\'')) *curout++ = '\''; diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c index 94e0fed8309..71c7c7d3b3c 100644 --- a/src/backend/utils/adt/tsvector_op.c +++ b/src/backend/utils/adt/tsvector_op.c @@ -2604,11 +2604,15 @@ ts_stat_sql(MemoryContext persistentContext, text *txt, text *ws) if (ws) { char *buf; 
+ const char *end; buf = VARDATA_ANY(ws); - while (buf - VARDATA_ANY(ws) < VARSIZE_ANY_EXHDR(ws)) + end = buf + VARSIZE_ANY_EXHDR(ws); + while (buf < end) { - if (pg_mblen(buf) == 1) + int len = pg_mblen_range(buf, end); + + if (len == 1) { switch (*buf) { @@ -2632,7 +2636,7 @@ ts_stat_sql(MemoryContext persistentContext, text *txt, text *ws) stat->weight |= 0; } } - buf += pg_mblen(buf); + buf += len; } } diff --git a/src/backend/utils/adt/tsvector_parser.c b/src/backend/utils/adt/tsvector_parser.c index b3c04f6344f..efeaeb55334 100644 --- a/src/backend/utils/adt/tsvector_parser.c +++ b/src/backend/utils/adt/tsvector_parser.c @@ -208,8 +208,7 @@ gettoken_tsvector(TSVectorParseState state, PRSSYNTAXERROR; else if (!isspace((unsigned char) *state->prsbuf)) { - COPYCHAR(curpos, state->prsbuf); - curpos += pg_mblen(state->prsbuf); + curpos += ts_copychar_cstr(curpos, state->prsbuf); statecode = WAITENDWORD; } } @@ -223,8 +222,7 @@ gettoken_tsvector(TSVectorParseState state, else { RESIZEPRSBUF; - COPYCHAR(curpos, state->prsbuf); - curpos += pg_mblen(state->prsbuf); + curpos += ts_copychar_cstr(curpos, state->prsbuf); Assert(oldstate != 0); statecode = oldstate; } @@ -259,8 +257,7 @@ gettoken_tsvector(TSVectorParseState state, else { RESIZEPRSBUF; - COPYCHAR(curpos, state->prsbuf); - curpos += pg_mblen(state->prsbuf); + curpos += ts_copychar_cstr(curpos, state->prsbuf); } } else if (statecode == WAITENDCMPLX) @@ -279,8 +276,7 @@ gettoken_tsvector(TSVectorParseState state, else { RESIZEPRSBUF; - COPYCHAR(curpos, state->prsbuf); - curpos += pg_mblen(state->prsbuf); + curpos += ts_copychar_cstr(curpos, state->prsbuf); } } else if (statecode == WAITCHARCMPLX) @@ -288,8 +284,7 @@ gettoken_tsvector(TSVectorParseState state, if (!state->is_web && t_iseq(state->prsbuf, '\'')) { RESIZEPRSBUF; - COPYCHAR(curpos, state->prsbuf); - curpos += pg_mblen(state->prsbuf); + curpos += ts_copychar_cstr(curpos, state->prsbuf); statecode = WAITENDCMPLX; } else @@ -300,7 +295,7 @@ 
gettoken_tsvector(TSVectorParseState state, PRSSYNTAXERROR; if (state->oprisdelim) { - /* state->prsbuf+=pg_mblen(state->prsbuf); */ + /* state->prsbuf+=pg_mblen_cstr(state->prsbuf); */ RETURN_TOKEN; } else @@ -383,6 +378,6 @@ gettoken_tsvector(TSVectorParseState state, statecode); /* get next char */ - state->prsbuf += pg_mblen(state->prsbuf); + state->prsbuf += pg_mblen_cstr(state->prsbuf); } } diff --git a/src/backend/utils/adt/varbit.c b/src/backend/utils/adt/varbit.c index 50ffee679b9..65ad1bfe18f 100644 --- a/src/backend/utils/adt/varbit.c +++ b/src/backend/utils/adt/varbit.c @@ -232,7 +232,7 @@ bit_in(PG_FUNCTION_ARGS) ereturn(escontext, (Datum) 0, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("\"%.*s\" is not a valid binary digit", - pg_mblen(sp), sp))); + pg_mblen_cstr(sp), sp))); x >>= 1; if (x == 0) @@ -257,7 +257,7 @@ bit_in(PG_FUNCTION_ARGS) ereturn(escontext, (Datum) 0, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("\"%.*s\" is not a valid hexadecimal digit", - pg_mblen(sp), sp))); + pg_mblen_cstr(sp), sp))); if (bc) { @@ -533,7 +533,7 @@ varbit_in(PG_FUNCTION_ARGS) ereturn(escontext, (Datum) 0, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("\"%.*s\" is not a valid binary digit", - pg_mblen(sp), sp))); + pg_mblen_cstr(sp), sp))); x >>= 1; if (x == 0) @@ -558,7 +558,7 @@ varbit_in(PG_FUNCTION_ARGS) ereturn(escontext, (Datum) 0, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("\"%.*s\" is not a valid hexadecimal digit", - pg_mblen(sp), sp))); + pg_mblen_cstr(sp), sp))); if (bc) { diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 6c1ebb0866d..6bb14620a63 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -494,8 +494,11 @@ text_catenate(text *t1, text *t2) * charlen_to_bytelen() * Compute the number of bytes occupied by n characters starting at *p * - * It is caller's responsibility that there actually are n characters; - * the string need not be 
null-terminated. + * The caller shall ensure there are n complete characters. Callers achieve + * this by deriving "n" from regmatch_t findings from searching a wchar array. + * pg_mb2wchar_with_len() skips any trailing incomplete character, so regex + * matches will end no later than the last complete character. (The string + * need not be null-terminated.) */ static int charlen_to_bytelen(const char *p, int n) @@ -510,7 +513,7 @@ charlen_to_bytelen(const char *p, int n) const char *s; for (s = p; n > 0; n--) - s += pg_mblen(s); + s += pg_mblen_unbounded(s); /* caller verified encoding */ return s - p; } @@ -644,6 +647,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) int32 slice_start; int32 slice_size; int32 slice_strlen; + int32 slice_len; text *slice; int32 E1; int32 i; @@ -713,7 +717,8 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) slice = (text *) DatumGetPointer(str); /* see if we got back an empty string */ - if (VARSIZE_ANY_EXHDR(slice) == 0) + slice_len = VARSIZE_ANY_EXHDR(slice); + if (slice_len == 0) { if (slice != (text *) DatumGetPointer(str)) pfree(slice); @@ -722,7 +727,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) /* Now we can get the actual length of the slice in MB characters */ slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice), - VARSIZE_ANY_EXHDR(slice)); + slice_len); /* * Check that the start position wasn't > slice_strlen. If so, SQL99 @@ -749,7 +754,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) */ p = VARDATA_ANY(slice); for (i = 0; i < S1 - 1; i++) - p += pg_mblen(p); + p += pg_mblen_unbounded(p); /* hang onto a pointer to our start position */ s = p; @@ -759,7 +764,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) * length. 
*/ for (i = S1; i < E1; i++) - p += pg_mblen(p); + p += pg_mblen_unbounded(p); ret = (text *) palloc(VARHDRSZ + (p - s)); SET_VARSIZE(ret, VARHDRSZ + (p - s)); @@ -1064,6 +1069,8 @@ text_position_next(TextPositionState *state) */ if (state->is_multibyte_char_in_char && state->locale->deterministic) { + const char *haystack_end = state->str1 + state->len1; + /* Walk one character at a time, until we reach the match. */ /* the search should never move backwards. */ @@ -1072,7 +1079,7 @@ text_position_next(TextPositionState *state) while (state->refpoint < matchptr) { /* step to next character. */ - state->refpoint += pg_mblen(state->refpoint); + state->refpoint += pg_mblen_range(state->refpoint, haystack_end); state->refpos++; /* @@ -1160,7 +1167,7 @@ text_position_next_internal(char *start_ptr, TextPositionState *state) test_end = hptr; do { - test_end += pg_mblen(test_end); + test_end += pg_mblen_range(test_end, haystack_end); if (pg_strncoll(hptr, (test_end - hptr), needle, needle_len, state->locale) == 0) { state->last_match_len_tmp = (test_end - hptr); @@ -1173,7 +1180,7 @@ text_position_next_internal(char *start_ptr, TextPositionState *state) if (result_hptr) break; - hptr += pg_mblen(hptr); + hptr += pg_mblen_range(hptr, haystack_end); } return (char *) result_hptr; @@ -3767,6 +3774,8 @@ split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate) } else { + const char *end_ptr; + /* * When fldsep is NULL, each character in the input string becomes a * separate element in the result set. 
The separator is effectively @@ -3775,10 +3784,11 @@ split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate) inputstring_len = VARSIZE_ANY_EXHDR(inputstring); start_ptr = VARDATA_ANY(inputstring); + end_ptr = start_ptr + inputstring_len; while (inputstring_len > 0) { - int chunk_len = pg_mblen(start_ptr); + int chunk_len = pg_mblen_range(start_ptr, end_ptr); CHECK_FOR_INTERRUPTS(); @@ -3898,6 +3908,7 @@ array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v, int typlen; bool typbyval; char typalign; + uint8 typalignby; StringInfoData buf; bool printed = false; char *p; @@ -3947,6 +3958,7 @@ array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v, typlen = my_extra->typlen; typbyval = my_extra->typbyval; typalign = my_extra->typalign; + typalignby = typalign_to_alignby(typalign); p = ARR_DATA_PTR(v); bitmap = ARR_NULLBITMAP(v); @@ -3983,7 +3995,7 @@ array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v, printed = true; p = att_addlength_pointer(p, typlen, p); - p = (char *) att_align_nominal(p, typalign); + p = (char *) att_nominal_alignby(p, typalignby); } /* advance bitmap pointer if any */ @@ -4682,7 +4694,7 @@ text_reverse(PG_FUNCTION_ARGS) { int sz; - sz = pg_mblen(p); + sz = pg_mblen_range(p, endp); dst -= sz; memcpy(dst, p, sz); p += sz; @@ -4843,7 +4855,7 @@ text_format(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("unrecognized format() type specifier \"%.*s\"", - pg_mblen(cp), cp), + pg_mblen_range(cp, end_ptr), cp), errhint("For a single \"%%\" use \"%%%%\"."))); /* If indirect width was specified, get its value */ @@ -4964,7 +4976,7 @@ text_format(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("unrecognized format() type specifier \"%.*s\"", - pg_mblen(cp), cp), + pg_mblen_range(cp, end_ptr), cp), errhint("For a single \"%%\" use \"%%%%\"."))); break; } diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c index f69dc68286c..fcb13e7c0a1 
100644 --- a/src/backend/utils/adt/xml.c +++ b/src/backend/utils/adt/xml.c @@ -2376,8 +2376,7 @@ sqlchar_to_unicode(const char *s) char *utf8string; pg_wchar ret[2]; /* need space for trailing zero */ - /* note we're not assuming s is null-terminated */ - utf8string = pg_server_to_any(s, pg_mblen(s), PG_UTF8); + utf8string = pg_server_to_any(s, pg_mblen_cstr(s), PG_UTF8); pg_encoding_mb2wchar_with_len(PG_UTF8, utf8string, ret, pg_encoding_mblen(PG_UTF8, utf8string)); @@ -2430,7 +2429,7 @@ map_sql_identifier_to_xml_name(const char *ident, bool fully_escaped, initStringInfo(&buf); - for (p = ident; *p; p += pg_mblen(p)) + for (p = ident; *p; p += pg_mblen_cstr(p)) { if (*p == ':' && (p == ident || fully_escaped)) appendStringInfoString(&buf, "_x003A_"); @@ -2455,7 +2454,7 @@ map_sql_identifier_to_xml_name(const char *ident, bool fully_escaped, : !is_valid_xml_namechar(u)) appendStringInfo(&buf, "_x%04X_", (unsigned int) u); else - appendBinaryStringInfo(&buf, p, pg_mblen(p)); + appendBinaryStringInfo(&buf, p, pg_mblen_cstr(p)); } } @@ -2478,7 +2477,7 @@ map_xml_name_to_sql_identifier(const char *name) initStringInfo(&buf); - for (p = name; *p; p += pg_mblen(p)) + for (p = name; *p; p += pg_mblen_cstr(p)) { if (*p == '_' && *(p + 1) == 'x' && isxdigit((unsigned char) *(p + 2)) @@ -2496,7 +2495,7 @@ map_xml_name_to_sql_identifier(const char *name) p += 6; } else - appendBinaryStringInfo(&buf, p, pg_mblen(p)); + appendBinaryStringInfo(&buf, p, pg_mblen_cstr(p)); } return buf.data; diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c index aa530d3685e..129906e2daa 100644 --- a/src/backend/utils/error/elog.c +++ b/src/backend/utils/error/elog.c @@ -182,6 +182,7 @@ static bool matches_backtrace_functions(const char *funcname); static pg_noinline void set_backtrace(ErrorData *edata, int num_skip); static void set_errdata_field(MemoryContextData *cxt, char **ptr, const char *str); static void FreeErrorDataContents(ErrorData *edata); +static int 
log_min_messages_cmp(const ListCell *a, const ListCell *b); static void write_console(const char *line, int len); static const char *process_log_prefix_padding(const char *p, int *ppadding); static void log_line_prefix(StringInfo buf, ErrorData *edata); @@ -235,7 +236,7 @@ is_log_level_output(int elevel, int log_min_level) static inline bool should_output_to_server(int elevel) { - return is_log_level_output(elevel, log_min_messages); + return is_log_level_output(elevel, log_min_messages[MyBackendType]); } /* @@ -2170,6 +2171,250 @@ DebugFileOpen(void) } +/* + * GUC check_hook for log_min_messages + * + * This value is parsed as a comma-separated list of zero or more TYPE:LEVEL + * elements. For each element, TYPE corresponds to a bk_category value (see + * postmaster/proctypelist.h); LEVEL is one of server_message_level_options. + * + * In addition, there must be a single LEVEL element (with no TYPE part) + * which sets the default level for process types that aren't specified. + */ +bool +check_log_min_messages(char **newval, void **extra, GucSource source) +{ + char *rawstring; + List *elemlist; + StringInfoData buf; + char *result; + int newlevel[BACKEND_NUM_TYPES]; + bool assigned[BACKEND_NUM_TYPES] = {0}; + int genericlevel = -1; /* -1 means not assigned */ + + const char *const process_types[] = { +#define PG_PROCTYPE(bktype, bkcategory, description, main_func, shmem_attach) \ + [bktype] = bkcategory, +#include "postmaster/proctypelist.h" +#undef PG_PROCTYPE + }; + + /* Need a modifiable copy of string. */ + rawstring = guc_strdup(LOG, *newval); + if (rawstring == NULL) + return false; + + /* Parse the string into a list. */ + if (!SplitGUCList(rawstring, ',', &elemlist)) + { + /* syntax error in list */ + GUC_check_errdetail("List syntax is invalid."); + list_free(elemlist); + guc_free(rawstring); + return false; + } + + /* Validate and assign log level and process type. 
*/ + foreach_ptr(char, elem, elemlist) + { + char *sep = strchr(elem, ':'); + + /* + * If there's no ':' separator in the entry, this is the default log + * level. Otherwise it's a process type-specific entry. + */ + if (sep == NULL) + { + const struct config_enum_entry *entry; + bool found; + + /* Reject duplicates for generic log level. */ + if (genericlevel != -1) + { + GUC_check_errdetail("Redundant specification of default log level."); + goto lmm_fail; + } + + /* Validate the log level */ + found = false; + for (entry = server_message_level_options; entry && entry->name; entry++) + { + if (pg_strcasecmp(entry->name, elem) == 0) + { + genericlevel = entry->val; + found = true; + break; + } + } + + if (!found) + { + GUC_check_errdetail("Unrecognized log level: \"%s\".", elem); + goto lmm_fail; + } + } + else + { + char *loglevel = sep + 1; + char *ptype = elem; + bool found; + int level; + const struct config_enum_entry *entry; + + /* + * Temporarily clobber the ':' with a string terminator, so that + * we can validate it. We restore this at the bottom. + */ + *sep = '\0'; + + /* Validate the log level */ + found = false; + for (entry = server_message_level_options; entry && entry->name; entry++) + { + if (pg_strcasecmp(entry->name, loglevel) == 0) + { + level = entry->val; + found = true; + break; + } + } + + if (!found) + { + GUC_check_errdetail("Unrecognized log level for process type \"%s\": \"%s\".", + ptype, loglevel); + goto lmm_fail; + } + + /* Is the process type name valid and unique? */ + found = false; + for (int i = 0; i < BACKEND_NUM_TYPES; i++) + { + if (pg_strcasecmp(process_types[i], ptype) == 0) + { + /* Reject duplicates for a process type. */ + if (assigned[i]) + { + GUC_check_errdetail("Redundant log level specification for process type \"%s\".", + ptype); + goto lmm_fail; + } + + newlevel[i] = level; + assigned[i] = true; + found = true; + + /* + * note: we must keep looking! some process types appear + * multiple times in proctypelist.h. 
+ */ + } + } + + if (!found) + { + GUC_check_errdetail("Unrecognized process type \"%s\".", ptype); + goto lmm_fail; + } + + /* Put the separator back in place */ + *sep = ':'; + } + + /* all good */ + continue; + +lmm_fail: + guc_free(rawstring); + list_free(elemlist); + return false; + } + + /* + * The generic log level must be specified. It is the fallback value. + */ + if (genericlevel == -1) + { + GUC_check_errdetail("Default log level was not defined."); + guc_free(rawstring); + list_free(elemlist); + return false; + } + + /* Apply the default log level to all processes not listed. */ + for (int i = 0; i < BACKEND_NUM_TYPES; i++) + { + if (!assigned[i]) + newlevel[i] = genericlevel; + } + + /* + * Save an ordered representation of the user-specified string, for the + * show_hook. + */ + list_sort(elemlist, log_min_messages_cmp); + + initStringInfoExt(&buf, strlen(rawstring) + 1); + foreach_ptr(char, elem, elemlist) + { + if (foreach_current_index(elem) == 0) + appendStringInfoString(&buf, elem); + else + appendStringInfo(&buf, ", %s", elem); + } + + result = (char *) guc_malloc(LOG, buf.len + 1); + if (!result) + return false; + memcpy(result, buf.data, buf.len); + result[buf.len] = '\0'; + + guc_free(*newval); + *newval = result; + + guc_free(rawstring); + list_free(elemlist); + pfree(buf.data); + + /* + * Pass back data for assign_log_min_messages to use. + */ + *extra = guc_malloc(LOG, BACKEND_NUM_TYPES * sizeof(int)); + if (!*extra) + return false; + memcpy(*extra, newlevel, BACKEND_NUM_TYPES * sizeof(int)); + + return true; +} + +/* + * list_sort() callback for check_log_min_messages. The default element + * goes first; the rest are ordered by strcmp() of the process type. 
+ */ +static int +log_min_messages_cmp(const ListCell *a, const ListCell *b) +{ + const char *s = lfirst(a); + const char *t = lfirst(b); + + if (strchr(s, ':') == NULL) + return -1; + else if (strchr(t, ':') == NULL) + return 1; + else + return strcmp(s, t); +} + +/* + * GUC assign_hook for log_min_messages + */ +void +assign_log_min_messages(const char *newval, void *extra) +{ + for (int i = 0; i < BACKEND_NUM_TYPES; i++) + log_min_messages[i] = ((int *) extra)[i]; +} + /* * GUC check_hook for backtrace_functions * @@ -2779,7 +3024,12 @@ get_backend_type_for_log(void) if (MyProcPid == PostmasterPid) backend_type_str = "postmaster"; else if (MyBackendType == B_BG_WORKER) - backend_type_str = MyBgworkerEntry->bgw_type; + { + if (MyBgworkerEntry) + backend_type_str = MyBgworkerEntry->bgw_type; + else + backend_type_str = "early bgworker"; + } else backend_type_str = GetBackendTypeDesc(MyBackendType); diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index 563f20374ff..03f6c8479f2 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -266,7 +266,7 @@ GetBackendTypeDesc(BackendType backendType) switch (backendType) { -#define PG_PROCTYPE(bktype, description, main_func, shmem_attach) \ +#define PG_PROCTYPE(bktype, bkcategory, description, main_func, shmem_attach) \ case bktype: backendDesc = description; break; #include "postmaster/proctypelist.h" #undef PG_PROCTYPE diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c index 6950e743d03..a5a734839af 100644 --- a/src/backend/utils/mb/mbutils.c +++ b/src/backend/utils/mb/mbutils.c @@ -38,6 +38,7 @@ #include "catalog/namespace.h" #include "mb/pg_wchar.h" #include "utils/fmgrprotos.h" +#include "utils/memdebug.h" #include "utils/memutils.h" #include "utils/relcache.h" #include "varatt.h" @@ -97,6 +98,13 @@ static char *perform_default_encoding_conversion(const char *src, int len, bool is_client_to_server); static int cliplen(const 
char *str, int len, int limit); +pg_noreturn +static void report_invalid_encoding_int(int encoding, const char *mbstr, + int mblen, int len); + +pg_noreturn +static void report_invalid_encoding_db(const char *mbstr, int mblen, int len); + /* * Prepare for a future call to SetClientEncoding. Success should mean @@ -1021,11 +1029,126 @@ pg_encoding_wchar2mb_with_len(int encoding, return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len); } -/* returns the byte length of a multibyte character */ +/* + * Returns the byte length of a multibyte character sequence in a + * null-terminated string. Raises an illegal byte sequence error if the + * sequence would hit a null terminator. + * + * The caller is expected to have checked for a terminator at *mbstr == 0 + * before calling, but some callers want 1 in that case, so this function + * continues that tradition. + * + * This must only be used for strings that have a null-terminator to enable + * bounds detection. + */ +int +pg_mblen_cstr(const char *mbstr) +{ + int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr); + + /* + * The .mblen functions return 1 when given a pointer to a terminator. + * Some callers depend on that, so we tolerate it for now. Well-behaved + * callers check the leading byte for a terminator *before* calling. + */ + for (int i = 1; i < length; ++i) + if (unlikely(mbstr[i] == 0)) + report_invalid_encoding_db(mbstr, length, i); + + /* + * String should be NUL-terminated, but checking that would make typical + * callers O(N^2), tripling Valgrind check-world time. Unless + * VALGRIND_EXPENSIVE, check 1 byte after each actual character. (If we + * found a character, not a terminator, the next byte must be a terminator + * or the start of the next character.) If the caller iterates the whole + * string, the last call will diagnose a missing terminator. 
+ */ + if (mbstr[0] != '\0') + { +#ifdef VALGRIND_EXPENSIVE + VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, strlen(mbstr)); +#else + VALGRIND_CHECK_MEM_IS_DEFINED(mbstr + length, 1); +#endif + } + + return length; +} + +/* + * Returns the byte length of a multibyte character sequence bounded by a range + * [mbstr, end) of at least one byte in size. Raises an illegal byte sequence + * error if the sequence would exceed the range. + */ +int +pg_mblen_range(const char *mbstr, const char *end) +{ + int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr); + + Assert(end > mbstr); +#ifdef VALGRIND_EXPENSIVE + VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, end - mbstr); +#else + VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length); +#endif + + if (unlikely(mbstr + length > end)) + report_invalid_encoding_db(mbstr, length, end - mbstr); + + return length; +} + +/* + * Returns the byte length of a multibyte character sequence bounded by a range + * extending for 'limit' bytes, which must be at least one. Raises an illegal + * byte sequence error if the sequence would exceed the range. + */ +int +pg_mblen_with_len(const char *mbstr, int limit) +{ + int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr); + + Assert(limit >= 1); +#ifdef VALGRIND_EXPENSIVE + VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, limit); +#else + VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length); +#endif + + if (unlikely(length > limit)) + report_invalid_encoding_db(mbstr, length, limit); + + return length; +} + + +/* + * Returns the length of a multibyte character sequence, without any + * validation of bounds. + * + * PLEASE NOTE: This function can only be used safely if the caller has + * already verified the input string, since otherwise there is a risk of + * overrunning the buffer if the string is invalid. A prior call to a + * pg_mbstrlen* function suffices. 
+ */ +int +pg_mblen_unbounded(const char *mbstr) +{ + int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr); + + VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length); + + return length; +} + +/* + * Historical name for pg_mblen_unbounded(). Should not be used and will be + * removed in a later version. + */ int pg_mblen(const char *mbstr) { - return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr); + return pg_mblen_unbounded(mbstr); } /* returns the display length of a multibyte character */ @@ -1047,14 +1170,14 @@ pg_mbstrlen(const char *mbstr) while (*mbstr) { - mbstr += pg_mblen(mbstr); + mbstr += pg_mblen_cstr(mbstr); len++; } return len; } /* returns the length (counted in wchars) of a multibyte string - * (not necessarily NULL terminated) + * (stops at the first of "limit" or a NUL) */ int pg_mbstrlen_with_len(const char *mbstr, int limit) @@ -1067,7 +1190,7 @@ pg_mbstrlen_with_len(const char *mbstr, int limit) while (limit > 0 && *mbstr) { - int l = pg_mblen(mbstr); + int l = pg_mblen_with_len(mbstr, limit); limit -= l; mbstr += l; @@ -1137,7 +1260,7 @@ pg_mbcharcliplen(const char *mbstr, int len, int limit) while (len > 0 && *mbstr) { - l = pg_mblen(mbstr); + l = pg_mblen_with_len(mbstr, len); nch++; if (nch > limit) break; @@ -1701,12 +1824,19 @@ void report_invalid_encoding(int encoding, const char *mbstr, int len) { int l = pg_encoding_mblen_or_incomplete(encoding, mbstr, len); + + report_invalid_encoding_int(encoding, mbstr, l, len); +} + +static void +report_invalid_encoding_int(int encoding, const char *mbstr, int mblen, int len) +{ char buf[8 * 5 + 1]; char *p = buf; int j, jlimit; - jlimit = Min(l, len); + jlimit = Min(mblen, len); jlimit = Min(jlimit, 8); /* prevent buffer overrun */ for (j = 0; j < jlimit; j++) @@ -1723,6 +1853,12 @@ report_invalid_encoding(int encoding, const char *mbstr, int len) buf))); } +static void +report_invalid_encoding_db(const char *mbstr, int mblen, int 
len) +{ + report_invalid_encoding_int(GetDatabaseEncoding(), mbstr, mblen, len); +} + /* * report_untranslatable_char: complain about untranslatable character * diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index f0260e6e412..762b8efe6b0 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -1042,6 +1042,13 @@ options => 'file_copy_method_options', }, +{ name => 'file_extend_method', type => 'enum', context => 'PGC_SIGHUP', group => 'RESOURCES_DISK', + short_desc => 'Selects the method used for extending data files.', + variable => 'file_extend_method', + boot_val => 'DEFAULT_FILE_EXTEND_METHOD', + options => 'file_extend_method_options', +}, + { name => 'from_collapse_limit', type => 'int', context => 'PGC_USERSET', group => 'QUERY_TUNING_OTHER', short_desc => 'Sets the FROM-list size beyond which subqueries are not collapsed.', long_desc => 'The planner will merge subqueries into upper queries if the resulting FROM list would have no more than this many items.', @@ -1686,12 +1693,14 @@ options => 'server_message_level_options', }, -{ name => 'log_min_messages', type => 'enum', context => 'PGC_SUSET', group => 'LOGGING_WHEN', +{ name => 'log_min_messages', type => 'string', context => 'PGC_SUSET', group => 'LOGGING_WHEN', short_desc => 'Sets the message levels that are logged.', long_desc => 'Each level includes all the levels that follow it. 
The later the level, the fewer messages are sent.', - variable => 'log_min_messages', - boot_val => 'WARNING', - options => 'server_message_level_options', + flags => 'GUC_LIST_INPUT', + variable => 'log_min_messages_string', + boot_val => '"WARNING"', + check_hook => 'check_log_min_messages', + assign_hook => 'assign_log_min_messages', }, { name => 'log_parameter_max_length', type => 'int', context => 'PGC_SUSET', group => 'LOGGING_WHAT', diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 13c569d8790..741fce8dede 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -80,6 +80,7 @@ #include "storage/bufmgr.h" #include "storage/bufpage.h" #include "storage/copydir.h" +#include "storage/fd.h" #include "storage/io_worker.h" #include "storage/large_object.h" #include "storage/pg_shmem.h" @@ -146,7 +147,7 @@ static const struct config_enum_entry client_message_level_options[] = { {NULL, 0, false} }; -static const struct config_enum_entry server_message_level_options[] = { +const struct config_enum_entry server_message_level_options[] = { {"debug5", DEBUG5, false}, {"debug4", DEBUG4, false}, {"debug3", DEBUG3, false}, @@ -491,6 +492,14 @@ static const struct config_enum_entry file_copy_method_options[] = { {NULL, 0, false} }; +static const struct config_enum_entry file_extend_method_options[] = { +#ifdef HAVE_POSIX_FALLOCATE + {"posix_fallocate", FILE_EXTEND_METHOD_POSIX_FALLOCATE, false}, +#endif + {"write_zeros", FILE_EXTEND_METHOD_WRITE_ZEROS, false}, + {NULL, 0, false} +}; + /* * Options for enum values stored in other modules */ @@ -537,7 +546,6 @@ static bool standard_conforming_strings = true; bool current_role_is_superuser; int log_min_error_statement = ERROR; -int log_min_messages = WARNING; int client_min_messages = NOTICE; int log_min_duration_sample = -1; int log_min_duration_statement = -1; @@ -595,6 +603,7 @@ static char *server_version_string; static int server_version_num; 
static char *debug_io_direct_string; static char *restrict_nonsystem_relation_kind_string; +static char *log_min_messages_string; #ifdef HAVE_SYSLOG #define DEFAULT_SYSLOG_FACILITY LOG_LOCAL0 @@ -647,6 +656,15 @@ char *role_string; /* should be static, but guc.c needs to get at this */ bool in_hot_standby_guc; +/* + * set default log_min_messages to WARNING for all process types + */ +int log_min_messages[] = { +#define PG_PROCTYPE(bktype, bkcategory, description, main_func, shmem_attach) \ + [bktype] = WARNING, +#include "postmaster/proctypelist.h" +#undef PG_PROCTYPE +}; /* * Displayable names for context types (enum GucContext) diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index c4f92fcdac8..6e82c8e055d 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -179,6 +179,10 @@ # in kilobytes, or -1 for no limit #file_copy_method = copy # copy, clone (if supported by OS) +#file_extend_method = posix_fallocate # the default is the first option supported + # by the operating system: + # posix_fallocate (most Unix-like systems) + # write_zeros #max_notify_queue_pages = 1048576 # limits the number of SLRU pages allocated # for NOTIFY / LISTEN queue @@ -528,7 +532,21 @@ # - When to Log - -#log_min_messages = warning # values in order of decreasing detail: +#log_min_messages = warning # comma-separated list of + # process_type:level entries, plus + # one freestanding level as default. 
+ # Valid process types are: + # archiver autovacuum + # backend bgworker + # bgwriter checkpointer + # ioworker postmaster + # slotsyncworker startup + # syslogger walreceiver + # walsummarizer walwriter + # walsender + # + # Level values in order of decreasing + # detail: # debug5 # debug4 # debug3 diff --git a/src/backend/utils/mmgr/portalmem.c b/src/backend/utils/mmgr/portalmem.c index 4fa4d432021..c1a53e658cb 100644 --- a/src/backend/utils/mmgr/portalmem.c +++ b/src/backend/utils/mmgr/portalmem.c @@ -294,9 +294,8 @@ PortalDefineQuery(Portal portal, portal->prepStmtName = prepStmtName; portal->sourceText = sourceText; - portal->qc.commandTag = commandTag; - portal->qc.nprocessed = 0; portal->commandTag = commandTag; + SetQueryCompletion(&portal->qc, commandTag, 0); portal->stmts = stmts; portal->cplan = cplan; portal->status = PORTAL_DEFINED; diff --git a/src/backend/utils/sort/sharedtuplestore.c b/src/backend/utils/sort/sharedtuplestore.c index 8f35a255263..04189f708fa 100644 --- a/src/backend/utils/sort/sharedtuplestore.c +++ b/src/backend/utils/sort/sharedtuplestore.c @@ -323,7 +323,8 @@ sts_puttuple(SharedTuplestoreAccessor *accessor, void *meta_data, /* Do we have space? 
*/ size = accessor->sts->meta_data_size + tuple->t_len; - if (accessor->write_pointer + size > accessor->write_end) + if (accessor->write_pointer == NULL || + accessor->write_pointer + size > accessor->write_end) { if (accessor->write_chunk == NULL) { diff --git a/src/bin/pg_combinebackup/copy_file.c b/src/bin/pg_combinebackup/copy_file.c index dd3c0dc1c89..0287d6e87df 100644 --- a/src/bin/pg_combinebackup/copy_file.c +++ b/src/bin/pg_combinebackup/copy_file.c @@ -210,7 +210,7 @@ copy_file_blocks(const char *src, const char *dst, } if (rb < 0) - pg_fatal("could not read from file \"%s\": %m", dst); + pg_fatal("could not read from file \"%s\": %m", src); pg_free(buffer); close(src_fd); diff --git a/src/bin/pg_combinebackup/pg_combinebackup.c b/src/bin/pg_combinebackup/pg_combinebackup.c index 918b8b35646..b9f26ce782e 100644 --- a/src/bin/pg_combinebackup/pg_combinebackup.c +++ b/src/bin/pg_combinebackup/pg_combinebackup.c @@ -501,7 +501,7 @@ add_tablespace_mapping(cb_options *opt, char *arg) tsmap->old_dir); if (!is_absolute_path(tsmap->new_dir)) - pg_fatal("old directory is not an absolute path in tablespace mapping: %s", + pg_fatal("new directory is not an absolute path in tablespace mapping: %s", tsmap->new_dir); /* Canonicalize paths to avoid spurious failures when comparing. */ diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 2bebefd0ba2..2c3754d020f 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -214,12 +214,6 @@ static int nbinaryUpgradeClassOids = 0; static SequenceItem *sequences = NULL; static int nsequences = 0; -/* - * For binary upgrade, the dump ID of pg_largeobject_metadata is saved for use - * as a dependency for pg_shdepend and any large object comments/seclabels. - */ -static DumpId lo_metadata_dumpId; - /* Maximum number of relations to fetch in a fetchAttributeStats() call. 
*/ #define MAX_ATTR_STATS_RELS 64 @@ -1121,27 +1115,20 @@ main(int argc, char **argv) getTableData(&dopt, tblinfo, numTables, RELKIND_SEQUENCE); /* - * For binary upgrade mode, dump pg_largeobject_metadata and the - * associated pg_shdepend rows. This is faster to restore than the - * equivalent set of large object commands. We can only do this for - * upgrades from v12 and newer; in older versions, pg_largeobject_metadata - * was created WITH OIDS, so the OID column is hidden and won't be dumped. + * For binary upgrade mode, dump the pg_shdepend rows for large objects + * and maybe even pg_largeobject_metadata (see comment below for details). + * This is faster to restore than the equivalent set of large object + * commands. We can only do this for upgrades from v12 and newer; in + * older versions, pg_largeobject_metadata was created WITH OIDS, so the + * OID column is hidden and won't be dumped. */ if (dopt.binary_upgrade && fout->remoteVersion >= 120000) { - TableInfo *lo_metadata = findTableByOid(LargeObjectMetadataRelationId); - TableInfo *shdepend = findTableByOid(SharedDependRelationId); + TableInfo *shdepend; - makeTableDataInfo(&dopt, lo_metadata); + shdepend = findTableByOid(SharedDependRelationId); makeTableDataInfo(&dopt, shdepend); - /* - * Save pg_largeobject_metadata's dump ID for use as a dependency for - * pg_shdepend and any large object comments/seclabels. - */ - lo_metadata_dumpId = lo_metadata->dataObj->dobj.dumpId; - addObjectDependency(&shdepend->dataObj->dobj, lo_metadata_dumpId); - /* * Only dump large object shdepend rows for this database. */ @@ -1150,21 +1137,19 @@ main(int argc, char **argv) " WHERE datname = current_database())"; /* - * If upgrading from v16 or newer, only dump large objects with - * comments/seclabels. 
For these upgrades, pg_upgrade can copy/link - * pg_largeobject_metadata's files (which is usually faster) but we - * still need to dump LOs with comments/seclabels here so that the - * subsequent COMMENT and SECURITY LABEL commands work. pg_upgrade - * can't copy/link the files from older versions because aclitem - * (needed by pg_largeobject_metadata.lomacl) changed its storage - * format in v16. + * For binary upgrades from v16 and newer versions, we can copy + * pg_largeobject_metadata's files from the old cluster, so we don't + * need to dump its contents. pg_upgrade can't copy/link the files + * from older versions because aclitem (needed by + * pg_largeobject_metadata.lomacl) changed its storage format in v16. */ - if (fout->remoteVersion >= 160000) - lo_metadata->dataObj->filtercond = "WHERE oid IN " - "(SELECT objoid FROM pg_description " - "WHERE classoid = " CppAsString2(LargeObjectRelationId) " " - "UNION SELECT objoid FROM pg_seclabel " - "WHERE classoid = " CppAsString2(LargeObjectRelationId) ")"; + if (fout->remoteVersion < 160000) + { + TableInfo *lo_metadata; + + lo_metadata = findTableByOid(LargeObjectMetadataRelationId); + makeTableDataInfo(&dopt, lo_metadata); + } } /* @@ -3979,7 +3964,25 @@ getLOs(Archive *fout) appendPQExpBufferStr(loQry, "SELECT oid, lomowner, lomacl, " "acldefault('L', lomowner) AS acldefault " - "FROM pg_largeobject_metadata " + "FROM pg_largeobject_metadata "); + + /* + * For binary upgrades from v12 or newer, we transfer + * pg_largeobject_metadata via COPY or by copying/linking its files from + * the old cluster. On such upgrades, we only need to consider large + * objects that have comments or security labels, since we still restore + * those objects via COMMENT/SECURITY LABEL commands. 
+ */ + if (dopt->binary_upgrade && + fout->remoteVersion >= 120000) + appendPQExpBufferStr(loQry, + "WHERE oid IN " + "(SELECT objoid FROM pg_description " + "WHERE classoid = " CppAsString2(LargeObjectRelationId) " " + "UNION SELECT objoid FROM pg_seclabel " + "WHERE classoid = " CppAsString2(LargeObjectRelationId) ") "); + + appendPQExpBufferStr(loQry, "ORDER BY lomowner, lomacl::pg_catalog.text, oid"); res = ExecuteSqlQuery(fout, loQry->data, PGRES_TUPLES_OK); @@ -4062,36 +4065,20 @@ getLOs(Archive *fout) /* * In binary-upgrade mode for LOs, we do *not* dump out the LO data, * as it will be copied by pg_upgrade, which simply copies the - * pg_largeobject table. We *do* however dump out anything but the - * data, as pg_upgrade copies just pg_largeobject, but not - * pg_largeobject_metadata, after the dump is restored. In versions - * before v12, this is done via proper large object commands. In - * newer versions, we dump the content of pg_largeobject_metadata and - * any associated pg_shdepend rows, which is faster to restore. (On - *