diff --git a/contrib/Makefile b/contrib/Makefile
index 2f0a88d3f77..dd04c20acd2 100644
--- a/contrib/Makefile
+++ b/contrib/Makefile
@@ -34,6 +34,7 @@ SUBDIRS = \
pg_freespacemap \
pg_logicalinspect \
pg_overexplain \
+ pg_plan_advice \
pg_prewarm \
pg_stat_statements \
pg_surgery \
diff --git a/contrib/btree_gist/btree_utils_var.c b/contrib/btree_gist/btree_utils_var.c
index 6847e4e54d5..f6ba1c0c825 100644
--- a/contrib/btree_gist/btree_utils_var.c
+++ b/contrib/btree_gist/btree_utils_var.c
@@ -115,36 +115,47 @@ gbt_var_leaf2node(GBT_VARKEY *leaf, const gbtree_vinfo *tinfo, FmgrInfo *flinfo)
/*
* returns the common prefix length of a node key
+ *
+ * If the underlying type is character data, the returned prefix length may
+ * point into the middle of a multibyte character.
*/
static int32
gbt_var_node_cp_len(const GBT_VARKEY *node, const gbtree_vinfo *tinfo)
{
GBT_VARKEY_R r = gbt_var_key_readable(node);
int32 i = 0;
- int32 l = 0;
+ int32 l_left_to_match = 0;
+ int32 l_total = 0;
int32 t1len = VARSIZE(r.lower) - VARHDRSZ;
int32 t2len = VARSIZE(r.upper) - VARHDRSZ;
int32 ml = Min(t1len, t2len);
char *p1 = VARDATA(r.lower);
char *p2 = VARDATA(r.upper);
+ const char *end1 = p1 + t1len;
+ const char *end2 = p2 + t2len;
if (ml == 0)
return 0;
while (i < ml)
{
- if (tinfo->eml > 1 && l == 0)
+ if (tinfo->eml > 1 && l_left_to_match == 0)
{
- if ((l = pg_mblen(p1)) != pg_mblen(p2))
+ l_total = pg_mblen_range(p1, end1);
+ if (l_total != pg_mblen_range(p2, end2))
{
return i;
}
+ l_left_to_match = l_total;
}
if (*p1 != *p2)
{
if (tinfo->eml > 1)
{
- return (i - l + 1);
+ int32 l_matched_subset = l_total - l_left_to_match;
+
+ /* end common prefix at final byte of last matching char */
+ return i - l_matched_subset;
}
else
{
@@ -154,7 +165,7 @@ gbt_var_node_cp_len(const GBT_VARKEY *node, const gbtree_vinfo *tinfo)
p1++;
p2++;
- l--;
+ l_left_to_match--;
i++;
}
return ml; /* lower == upper */
diff --git a/contrib/dblink/dblink.c b/contrib/dblink/dblink.c
index 8cb3166495c..2498d80c8e7 100644
--- a/contrib/dblink/dblink.c
+++ b/contrib/dblink/dblink.c
@@ -2069,6 +2069,7 @@ get_text_array_contents(ArrayType *array, int *numitems)
int16 typlen;
bool typbyval;
char typalign;
+ uint8 typalignby;
char **values;
char *ptr;
bits8 *bitmap;
@@ -2081,6 +2082,7 @@ get_text_array_contents(ArrayType *array, int *numitems)
get_typlenbyvalalign(ARR_ELEMTYPE(array),
&typlen, &typbyval, &typalign);
+ typalignby = typalign_to_alignby(typalign);
values = palloc_array(char *, nitems);
@@ -2098,7 +2100,7 @@ get_text_array_contents(ArrayType *array, int *numitems)
{
values[i] = TextDatumGetCString(PointerGetDatum(ptr));
ptr = att_addlength_pointer(ptr, typlen, ptr);
- ptr = (char *) att_align_nominal(ptr, typalign);
+ ptr = (char *) att_nominal_alignby(ptr, typalignby);
}
/* advance bitmap pointer if any */
diff --git a/contrib/dict_xsyn/dict_xsyn.c b/contrib/dict_xsyn/dict_xsyn.c
index 5c4917ce1fc..9e3784e0f47 100644
--- a/contrib/dict_xsyn/dict_xsyn.c
+++ b/contrib/dict_xsyn/dict_xsyn.c
@@ -54,14 +54,14 @@ find_word(char *in, char **end)
*end = NULL;
while (*in && isspace((unsigned char) *in))
- in += pg_mblen(in);
+ in += pg_mblen_cstr(in);
if (!*in || *in == '#')
return NULL;
start = in;
while (*in && !isspace((unsigned char) *in))
- in += pg_mblen(in);
+ in += pg_mblen_cstr(in);
*end = in;
diff --git a/contrib/hstore/hstore_io.c b/contrib/hstore/hstore_io.c
index 34e3918811c..9cdfcb5daa0 100644
--- a/contrib/hstore/hstore_io.c
+++ b/contrib/hstore/hstore_io.c
@@ -67,7 +67,7 @@ prssyntaxerror(HSParser *state)
errsave(state->escontext,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("syntax error in hstore, near \"%.*s\" at position %d",
- pg_mblen(state->ptr), state->ptr,
+ pg_mblen_cstr(state->ptr), state->ptr,
(int) (state->ptr - state->begin))));
/* In soft error situation, return false as convenience for caller */
return false;
diff --git a/contrib/hstore_plperl/hstore_plperl.c b/contrib/hstore_plperl/hstore_plperl.c
index 31393b4fa50..69001191cc0 100644
--- a/contrib/hstore_plperl/hstore_plperl.c
+++ b/contrib/hstore_plperl/hstore_plperl.c
@@ -21,6 +21,13 @@ static hstoreCheckKeyLen_t hstoreCheckKeyLen_p;
typedef size_t (*hstoreCheckValLen_t) (size_t len);
static hstoreCheckValLen_t hstoreCheckValLen_p;
+/* Static asserts verify that typedefs above match original declarations */
+StaticAssertVariableIsOfType(&hstoreUpgrade, hstoreUpgrade_t);
+StaticAssertVariableIsOfType(&hstoreUniquePairs, hstoreUniquePairs_t);
+StaticAssertVariableIsOfType(&hstorePairs, hstorePairs_t);
+StaticAssertVariableIsOfType(&hstoreCheckKeyLen, hstoreCheckKeyLen_t);
+StaticAssertVariableIsOfType(&hstoreCheckValLen, hstoreCheckValLen_t);
+
/*
* Module initialize function: fetch function pointers for cross-module calls.
@@ -28,24 +35,18 @@ static hstoreCheckValLen_t hstoreCheckValLen_p;
void
_PG_init(void)
{
- /* Asserts verify that typedefs above match original declarations */
- AssertVariableIsOfType(&hstoreUpgrade, hstoreUpgrade_t);
hstoreUpgrade_p = (hstoreUpgrade_t)
load_external_function("$libdir/hstore", "hstoreUpgrade",
true, NULL);
- AssertVariableIsOfType(&hstoreUniquePairs, hstoreUniquePairs_t);
hstoreUniquePairs_p = (hstoreUniquePairs_t)
load_external_function("$libdir/hstore", "hstoreUniquePairs",
true, NULL);
- AssertVariableIsOfType(&hstorePairs, hstorePairs_t);
hstorePairs_p = (hstorePairs_t)
load_external_function("$libdir/hstore", "hstorePairs",
true, NULL);
- AssertVariableIsOfType(&hstoreCheckKeyLen, hstoreCheckKeyLen_t);
hstoreCheckKeyLen_p = (hstoreCheckKeyLen_t)
load_external_function("$libdir/hstore", "hstoreCheckKeyLen",
true, NULL);
- AssertVariableIsOfType(&hstoreCheckValLen, hstoreCheckValLen_t);
hstoreCheckValLen_p = (hstoreCheckValLen_t)
load_external_function("$libdir/hstore", "hstoreCheckValLen",
true, NULL);
diff --git a/contrib/hstore_plpython/hstore_plpython.c b/contrib/hstore_plpython/hstore_plpython.c
index e2bfc6da38e..d2be030e07c 100644
--- a/contrib/hstore_plpython/hstore_plpython.c
+++ b/contrib/hstore_plpython/hstore_plpython.c
@@ -28,6 +28,15 @@ static hstoreCheckKeyLen_t hstoreCheckKeyLen_p;
typedef size_t (*hstoreCheckValLen_t) (size_t len);
static hstoreCheckValLen_t hstoreCheckValLen_p;
+/* Static asserts verify that typedefs above match original declarations */
+StaticAssertVariableIsOfType(&PLyObject_AsString, PLyObject_AsString_t);
+StaticAssertVariableIsOfType(&PLyUnicode_FromStringAndSize, PLyUnicode_FromStringAndSize_t);
+StaticAssertVariableIsOfType(&hstoreUpgrade, hstoreUpgrade_t);
+StaticAssertVariableIsOfType(&hstoreUniquePairs, hstoreUniquePairs_t);
+StaticAssertVariableIsOfType(&hstorePairs, hstorePairs_t);
+StaticAssertVariableIsOfType(&hstoreCheckKeyLen, hstoreCheckKeyLen_t);
+StaticAssertVariableIsOfType(&hstoreCheckValLen, hstoreCheckValLen_t);
+
/*
* Module initialize function: fetch function pointers for cross-module calls.
@@ -35,32 +44,24 @@ static hstoreCheckValLen_t hstoreCheckValLen_p;
void
_PG_init(void)
{
- /* Asserts verify that typedefs above match original declarations */
- AssertVariableIsOfType(&PLyObject_AsString, PLyObject_AsString_t);
PLyObject_AsString_p = (PLyObject_AsString_t)
load_external_function("$libdir/" PLPYTHON_LIBNAME, "PLyObject_AsString",
true, NULL);
- AssertVariableIsOfType(&PLyUnicode_FromStringAndSize, PLyUnicode_FromStringAndSize_t);
PLyUnicode_FromStringAndSize_p = (PLyUnicode_FromStringAndSize_t)
load_external_function("$libdir/" PLPYTHON_LIBNAME, "PLyUnicode_FromStringAndSize",
true, NULL);
- AssertVariableIsOfType(&hstoreUpgrade, hstoreUpgrade_t);
hstoreUpgrade_p = (hstoreUpgrade_t)
load_external_function("$libdir/hstore", "hstoreUpgrade",
true, NULL);
- AssertVariableIsOfType(&hstoreUniquePairs, hstoreUniquePairs_t);
hstoreUniquePairs_p = (hstoreUniquePairs_t)
load_external_function("$libdir/hstore", "hstoreUniquePairs",
true, NULL);
- AssertVariableIsOfType(&hstorePairs, hstorePairs_t);
hstorePairs_p = (hstorePairs_t)
load_external_function("$libdir/hstore", "hstorePairs",
true, NULL);
- AssertVariableIsOfType(&hstoreCheckKeyLen, hstoreCheckKeyLen_t);
hstoreCheckKeyLen_p = (hstoreCheckKeyLen_t)
load_external_function("$libdir/hstore", "hstoreCheckKeyLen",
true, NULL);
- AssertVariableIsOfType(&hstoreCheckValLen, hstoreCheckValLen_t);
hstoreCheckValLen_p = (hstoreCheckValLen_t)
load_external_function("$libdir/hstore", "hstoreCheckValLen",
true, NULL);
diff --git a/contrib/intarray/_int_selfuncs.c b/contrib/intarray/_int_selfuncs.c
index 4a7053028c6..7fce743632f 100644
--- a/contrib/intarray/_int_selfuncs.c
+++ b/contrib/intarray/_int_selfuncs.c
@@ -19,6 +19,7 @@
#include "catalog/pg_operator.h"
#include "catalog/pg_statistic.h"
#include "catalog/pg_type.h"
+#include "commands/extension.h"
#include "miscadmin.h"
#include "utils/fmgrprotos.h"
#include "utils/lsyscache.h"
@@ -170,7 +171,18 @@ _int_matchsel(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT8(0.0);
}
- /* The caller made sure the const is a query, so get it now */
+ /*
+ * Verify that the Const is a query_int, else return a default estimate.
+ * (This could only fail if someone attached this estimator to the wrong
+ * operator.)
+ */
+ if (((Const *) other)->consttype !=
+ get_function_sibling_type(fcinfo->flinfo->fn_oid, "query_int"))
+ {
+ ReleaseVariableStats(vardata);
+ PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
+ }
+
query = DatumGetQueryTypeP(((Const *) other)->constvalue);
/* Empty query matches nothing */
diff --git a/contrib/jsonb_plpython/jsonb_plpython.c b/contrib/jsonb_plpython/jsonb_plpython.c
index 7e8e1d6674f..c2c4ce37c08 100644
--- a/contrib/jsonb_plpython/jsonb_plpython.c
+++ b/contrib/jsonb_plpython/jsonb_plpython.c
@@ -33,22 +33,24 @@ typedef PyObject *(*PLyUnicode_FromStringAndSize_t)
(const char *s, Py_ssize_t size);
static PLyUnicode_FromStringAndSize_t PLyUnicode_FromStringAndSize_p;
+/* Static asserts verify that typedefs above match original declarations */
+StaticAssertVariableIsOfType(&PLyObject_AsString, PLyObject_AsString_t);
+StaticAssertVariableIsOfType(&PLyUnicode_FromStringAndSize, PLyUnicode_FromStringAndSize_t);
+StaticAssertVariableIsOfType(&PLy_elog_impl, PLy_elog_impl_t);
+
+
/*
* Module initialize function: fetch function pointers for cross-module calls.
*/
void
_PG_init(void)
{
- /* Asserts verify that typedefs above match original declarations */
- AssertVariableIsOfType(&PLyObject_AsString, PLyObject_AsString_t);
PLyObject_AsString_p = (PLyObject_AsString_t)
load_external_function("$libdir/" PLPYTHON_LIBNAME, "PLyObject_AsString",
true, NULL);
- AssertVariableIsOfType(&PLyUnicode_FromStringAndSize, PLyUnicode_FromStringAndSize_t);
PLyUnicode_FromStringAndSize_p = (PLyUnicode_FromStringAndSize_t)
load_external_function("$libdir/" PLPYTHON_LIBNAME, "PLyUnicode_FromStringAndSize",
true, NULL);
- AssertVariableIsOfType(&PLy_elog_impl, PLy_elog_impl_t);
PLy_elog_impl_p = (PLy_elog_impl_t)
load_external_function("$libdir/" PLPYTHON_LIBNAME, "PLy_elog_impl",
true, NULL);
diff --git a/contrib/ltree/crc32.c b/contrib/ltree/crc32.c
index 3918d4a0ec2..d21bed31fdd 100644
--- a/contrib/ltree/crc32.c
+++ b/contrib/ltree/crc32.c
@@ -23,6 +23,7 @@ ltree_crc32_sz(const char *buf, int size)
{
pg_crc32 crc;
const char *p = buf;
+ const char *end = buf + size;
static pg_locale_t locale = NULL;
if (!locale)
@@ -32,7 +33,7 @@ ltree_crc32_sz(const char *buf, int size)
while (size > 0)
{
char foldstr[UNICODE_CASEMAP_BUFSZ];
- int srclen = pg_mblen(p);
+ int srclen = pg_mblen_range(p, end);
size_t foldlen;
/* fold one codepoint at a time */
diff --git a/contrib/ltree/lquery_op.c b/contrib/ltree/lquery_op.c
index a28ddbf40de..0adcdd8ff2a 100644
--- a/contrib/ltree/lquery_op.c
+++ b/contrib/ltree/lquery_op.c
@@ -27,14 +27,14 @@ getlexeme(char *start, char *end, int *len)
char *ptr;
while (start < end && t_iseq(start, '_'))
- start += pg_mblen(start);
+ start += pg_mblen_range(start, end);
ptr = start;
if (ptr >= end)
return NULL;
while (ptr < end && !t_iseq(ptr, '_'))
- ptr += pg_mblen(ptr);
+ ptr += pg_mblen_range(ptr, end);
*len = ptr - start;
return start;
diff --git a/contrib/ltree/ltree.h b/contrib/ltree/ltree.h
index 78478dec173..b0ded40eba9 100644
--- a/contrib/ltree/ltree.h
+++ b/contrib/ltree/ltree.h
@@ -127,7 +127,7 @@ typedef struct
#define LQUERY_HASNOT 0x01
/* valid label chars are alphanumerics, underscores and hyphens */
-#define ISLABEL(x) ( t_isalnum(x) || t_iseq(x, '_') || t_iseq(x, '-') )
+#define ISLABEL(x) ( t_isalnum_cstr(x) || t_iseq(x, '_') || t_iseq(x, '-') )
/* full text query */
diff --git a/contrib/ltree/ltree_io.c b/contrib/ltree/ltree_io.c
index 59c4462df80..54c4ca3c5c3 100644
--- a/contrib/ltree/ltree_io.c
+++ b/contrib/ltree/ltree_io.c
@@ -54,7 +54,7 @@ parse_ltree(const char *buf, struct Node *escontext)
ptr = buf;
while (*ptr)
{
- charlen = pg_mblen(ptr);
+ charlen = pg_mblen_cstr(ptr);
if (t_iseq(ptr, '.'))
num++;
ptr += charlen;
@@ -69,7 +69,7 @@ parse_ltree(const char *buf, struct Node *escontext)
ptr = buf;
while (*ptr)
{
- charlen = pg_mblen(ptr);
+ charlen = pg_mblen_cstr(ptr);
switch (state)
{
@@ -291,7 +291,7 @@ parse_lquery(const char *buf, struct Node *escontext)
ptr = buf;
while (*ptr)
{
- charlen = pg_mblen(ptr);
+ charlen = pg_mblen_cstr(ptr);
if (t_iseq(ptr, '.'))
num++;
@@ -311,7 +311,7 @@ parse_lquery(const char *buf, struct Node *escontext)
ptr = buf;
while (*ptr)
{
- charlen = pg_mblen(ptr);
+ charlen = pg_mblen_cstr(ptr);
switch (state)
{
diff --git a/contrib/ltree/ltxtquery_io.c b/contrib/ltree/ltxtquery_io.c
index 91a2222eaa9..d15f3235393 100644
--- a/contrib/ltree/ltxtquery_io.c
+++ b/contrib/ltree/ltxtquery_io.c
@@ -64,7 +64,7 @@ gettoken_query(QPRS_STATE *state, int32 *val, int32 *lenval, char **strval, uint
for (;;)
{
- charlen = pg_mblen(state->buf);
+ charlen = pg_mblen_cstr(state->buf);
switch (state->state)
{
diff --git a/contrib/ltree_plpython/ltree_plpython.c b/contrib/ltree_plpython/ltree_plpython.c
index 0493aeb2423..d4e7b613fa1 100644
--- a/contrib/ltree_plpython/ltree_plpython.c
+++ b/contrib/ltree_plpython/ltree_plpython.c
@@ -13,6 +13,9 @@ PG_MODULE_MAGIC_EXT(
typedef PyObject *(*PLyUnicode_FromStringAndSize_t) (const char *s, Py_ssize_t size);
static PLyUnicode_FromStringAndSize_t PLyUnicode_FromStringAndSize_p;
+/* Static asserts verify that typedefs above match original declarations */
+StaticAssertVariableIsOfType(&PLyUnicode_FromStringAndSize, PLyUnicode_FromStringAndSize_t);
+
/*
* Module initialize function: fetch function pointers for cross-module calls.
@@ -20,8 +23,6 @@ static PLyUnicode_FromStringAndSize_t PLyUnicode_FromStringAndSize_p;
void
_PG_init(void)
{
- /* Asserts verify that typedefs above match original declarations */
- AssertVariableIsOfType(&PLyUnicode_FromStringAndSize, PLyUnicode_FromStringAndSize_t);
PLyUnicode_FromStringAndSize_p = (PLyUnicode_FromStringAndSize_t)
load_external_function("$libdir/" PLPYTHON_LIBNAME, "PLyUnicode_FromStringAndSize",
true, NULL);
diff --git a/contrib/meson.build b/contrib/meson.build
index def13257cbe..5a752eac347 100644
--- a/contrib/meson.build
+++ b/contrib/meson.build
@@ -48,6 +48,7 @@ subdir('pgcrypto')
subdir('pg_freespacemap')
subdir('pg_logicalinspect')
subdir('pg_overexplain')
+subdir('pg_plan_advice')
subdir('pg_prewarm')
subdir('pgrowlocks')
subdir('pg_stat_statements')
diff --git a/contrib/oid2name/oid2name.c b/contrib/oid2name/oid2name.c
index 51802907138..63e6ce2dae8 100644
--- a/contrib/oid2name/oid2name.c
+++ b/contrib/oid2name/oid2name.c
@@ -469,7 +469,7 @@ void
sql_exec_dumpalltables(PGconn *conn, struct options *opts)
{
char todo[1024];
- char *addfields = ",c.oid AS \"Oid\", nspname AS \"Schema\", spcname as \"Tablespace\" ";
+ char *addfields = ",c.oid AS \"Oid\", nspname AS \"Schema\", spcname as \"Tablespace\", pg_relation_filepath(c.oid) as \"Path\" ";
snprintf(todo, sizeof(todo),
"SELECT pg_catalog.pg_relation_filenode(c.oid) as \"Filenode\", relname as \"Table Name\" %s "
@@ -507,7 +507,7 @@ sql_exec_searchtables(PGconn *conn, struct options *opts)
*comma_filenumbers,
*comma_tables;
bool written = false;
- char *addfields = ",c.oid AS \"Oid\", nspname AS \"Schema\", spcname as \"Tablespace\" ";
+ char *addfields = ",c.oid AS \"Oid\", nspname AS \"Schema\", spcname as \"Tablespace\", pg_relation_filepath(c.oid) as \"Path\" ";
/* get tables qualifiers, whether names, filenumbers, or OIDs */
comma_oids = get_comma_elts(opts->oids);
diff --git a/contrib/pageinspect/heapfuncs.c b/contrib/pageinspect/heapfuncs.c
index 8277fa256c3..2f0dfff175a 100644
--- a/contrib/pageinspect/heapfuncs.c
+++ b/contrib/pageinspect/heapfuncs.c
@@ -101,7 +101,7 @@ text_to_bits(char *str, int len)
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg("invalid character \"%.*s\" in t_bits string",
- pg_mblen(str + off), str + off)));
+ pg_mblen_cstr(str + off), str + off)));
if (off % 8 == 7)
bits[off / 8] = byte;
diff --git a/contrib/pg_overexplain/expected/pg_overexplain.out b/contrib/pg_overexplain/expected/pg_overexplain.out
index 55d34666d87..f376d2e7996 100644
--- a/contrib/pg_overexplain/expected/pg_overexplain.out
+++ b/contrib/pg_overexplain/expected/pg_overexplain.out
@@ -104,6 +104,7 @@ $$);
Parallel Safe: true
Plan Node ID: 2
Append RTIs: 1
+ Child Append RTIs: none
-> Seq Scan on brassica vegetables_1
Disabled Nodes: 0
Parallel Safe: true
@@ -142,7 +143,7 @@ $$);
Relation Kind: relation
Relation Lock Mode: AccessShareLock
Unprunable RTIs: 1 3 4
-(53 rows)
+(54 rows)
-- Test a different output format.
SELECT explain_filter($$
@@ -197,6 +198,7 @@ $$);
none +
none +
1 +
+ none +
0 +
+
+
@@ -452,6 +454,8 @@ SELECT * FROM vegetables WHERE genus = 'daucus';
Seq Scan on daucus vegetables
Filter: (genus = 'daucus'::text)
Scan RTI: 2
+ Elided Node Type: Append
+ Elided Node RTIs: 1
RTI 1 (relation, inherited, in-from-clause):
Eref: vegetables (id, name, genus)
Relation: vegetables
@@ -465,7 +469,7 @@ SELECT * FROM vegetables WHERE genus = 'daucus';
Relation Kind: relation
Relation Lock Mode: AccessShareLock
Unprunable RTIs: 1 2
-(16 rows)
+(18 rows)
-- Also test a case that involves a write.
EXPLAIN (RANGE_TABLE, COSTS OFF)
@@ -489,3 +493,122 @@ INSERT INTO vegetables (name, genus) VALUES ('broccoflower', 'brassica');
Result RTIs: 1
(15 rows)
+-- should show "Subplan: sub"
+EXPLAIN (RANGE_TABLE, COSTS OFF)
+SELECT * FROM vegetables v,
+ (SELECT * FROM vegetables WHERE genus = 'daucus' OFFSET 0) sub;
+ QUERY PLAN
+----------------------------------------------
+ Nested Loop
+ -> Seq Scan on daucus vegetables
+ Filter: (genus = 'daucus'::text)
+ Scan RTI: 6
+ Elided Node Type: Append
+ Elided Node RTIs: 5
+ Elided Node Type: SubqueryScan
+ Elided Node RTIs: 2
+ -> Append
+ Append RTIs: 1
+ Child Append RTIs: none
+ -> Seq Scan on brassica v_1
+ Scan RTI: 3
+ -> Seq Scan on daucus v_2
+ Scan RTI: 4
+ RTI 1 (relation, inherited, in-from-clause):
+ Alias: v ()
+ Eref: v (id, name, genus)
+ Relation: vegetables
+ Relation Kind: partitioned_table
+ Relation Lock Mode: AccessShareLock
+ Permission Info Index: 1
+ RTI 2 (subquery, in-from-clause):
+ Alias: sub ()
+ Eref: sub (id, name, genus)
+ RTI 3 (relation, in-from-clause):
+ Alias: v (id, name, genus)
+ Eref: v (id, name, genus)
+ Relation: brassica
+ Relation Kind: relation
+ Relation Lock Mode: AccessShareLock
+ RTI 4 (relation, in-from-clause):
+ Alias: v (id, name, genus)
+ Eref: v (id, name, genus)
+ Relation: daucus
+ Relation Kind: relation
+ Relation Lock Mode: AccessShareLock
+ RTI 5 (relation, inherited, in-from-clause):
+ Subplan: sub
+ Eref: vegetables (id, name, genus)
+ Relation: vegetables
+ Relation Kind: partitioned_table
+ Relation Lock Mode: AccessShareLock
+ Permission Info Index: 2
+ RTI 6 (relation, in-from-clause):
+ Subplan: sub
+ Alias: vegetables (id, name, genus)
+ Eref: vegetables (id, name, genus)
+ Relation: daucus
+ Relation Kind: relation
+ Relation Lock Mode: AccessShareLock
+ Unprunable RTIs: 1 3 4 5 6
+(52 rows)
+
+-- should show "Subplan: unnamed_subquery"
+EXPLAIN (RANGE_TABLE, COSTS OFF)
+SELECT * FROM vegetables v,
+ (SELECT * FROM vegetables WHERE genus = 'daucus' OFFSET 0);
+ QUERY PLAN
+----------------------------------------------
+ Nested Loop
+ -> Seq Scan on daucus vegetables
+ Filter: (genus = 'daucus'::text)
+ Scan RTI: 6
+ Elided Node Type: Append
+ Elided Node RTIs: 5
+ Elided Node Type: SubqueryScan
+ Elided Node RTIs: 2
+ -> Append
+ Append RTIs: 1
+ Child Append RTIs: none
+ -> Seq Scan on brassica v_1
+ Scan RTI: 3
+ -> Seq Scan on daucus v_2
+ Scan RTI: 4
+ RTI 1 (relation, inherited, in-from-clause):
+ Alias: v ()
+ Eref: v (id, name, genus)
+ Relation: vegetables
+ Relation Kind: partitioned_table
+ Relation Lock Mode: AccessShareLock
+ Permission Info Index: 1
+ RTI 2 (subquery, in-from-clause):
+ Eref: unnamed_subquery (id, name, genus)
+ RTI 3 (relation, in-from-clause):
+ Alias: v (id, name, genus)
+ Eref: v (id, name, genus)
+ Relation: brassica
+ Relation Kind: relation
+ Relation Lock Mode: AccessShareLock
+ RTI 4 (relation, in-from-clause):
+ Alias: v (id, name, genus)
+ Eref: v (id, name, genus)
+ Relation: daucus
+ Relation Kind: relation
+ Relation Lock Mode: AccessShareLock
+ RTI 5 (relation, inherited, in-from-clause):
+ Subplan: unnamed_subquery
+ Eref: vegetables (id, name, genus)
+ Relation: vegetables
+ Relation Kind: partitioned_table
+ Relation Lock Mode: AccessShareLock
+ Permission Info Index: 2
+ RTI 6 (relation, in-from-clause):
+ Subplan: unnamed_subquery
+ Alias: vegetables (id, name, genus)
+ Eref: vegetables (id, name, genus)
+ Relation: daucus
+ Relation Kind: relation
+ Relation Lock Mode: AccessShareLock
+ Unprunable RTIs: 1 3 4 5 6
+(51 rows)
+
diff --git a/contrib/pg_overexplain/pg_overexplain.c b/contrib/pg_overexplain/pg_overexplain.c
index 316ffd1c87f..36e6aac0e2c 100644
--- a/contrib/pg_overexplain/pg_overexplain.c
+++ b/contrib/pg_overexplain/pg_overexplain.c
@@ -54,6 +54,8 @@ static void overexplain_alias(const char *qlabel, Alias *alias,
ExplainState *es);
static void overexplain_bitmapset(const char *qlabel, Bitmapset *bms,
ExplainState *es);
+static void overexplain_bitmapset_list(const char *qlabel, List *bms_list,
+ ExplainState *es);
static void overexplain_intlist(const char *qlabel, List *list,
ExplainState *es);
@@ -191,6 +193,8 @@ overexplain_per_node_hook(PlanState *planstate, List *ancestors,
*/
if (options->range_table)
{
+ bool opened_elided_nodes = false;
+
switch (nodeTag(plan))
{
case T_SeqScan:
@@ -230,11 +234,17 @@ overexplain_per_node_hook(PlanState *planstate, List *ancestors,
overexplain_bitmapset("Append RTIs",
((Append *) plan)->apprelids,
es);
+ overexplain_bitmapset_list("Child Append RTIs",
+ ((Append *) plan)->child_append_relid_sets,
+ es);
break;
case T_MergeAppend:
overexplain_bitmapset("Append RTIs",
((MergeAppend *) plan)->apprelids,
es);
+ overexplain_bitmapset_list("Child Append RTIs",
+ ((MergeAppend *) plan)->child_append_relid_sets,
+ es);
break;
case T_Result:
@@ -251,6 +261,43 @@ overexplain_per_node_hook(PlanState *planstate, List *ancestors,
default:
break;
}
+
+ foreach_node(ElidedNode, n, es->pstmt->elidedNodes)
+ {
+ char *elidednodetag;
+
+ if (n->plan_node_id != plan->plan_node_id)
+ continue;
+
+ if (!opened_elided_nodes)
+ {
+ ExplainOpenGroup("Elided Nodes", "Elided Nodes", false, es);
+ opened_elided_nodes = true;
+ }
+
+ switch (n->elided_type)
+ {
+ case T_Append:
+ elidednodetag = "Append";
+ break;
+ case T_MergeAppend:
+ elidednodetag = "MergeAppend";
+ break;
+ case T_SubqueryScan:
+ elidednodetag = "SubqueryScan";
+ break;
+ default:
+ elidednodetag = psprintf("%d", n->elided_type);
+ break;
+ }
+
+ ExplainOpenGroup("Elided Node", NULL, true, es);
+ ExplainPropertyText("Elided Node Type", elidednodetag, es);
+ overexplain_bitmapset("Elided Node RTIs", n->relids, es);
+ ExplainCloseGroup("Elided Node", NULL, true, es);
+ }
+ if (opened_elided_nodes)
+ ExplainCloseGroup("Elided Nodes", "Elided Nodes", false, es);
}
}
@@ -395,6 +442,8 @@ static void
overexplain_range_table(PlannedStmt *plannedstmt, ExplainState *es)
{
Index rti;
+ ListCell *lc_subrtinfo = list_head(plannedstmt->subrtinfos);
+ SubPlanRTInfo *rtinfo = NULL;
/* Open group, one entry per RangeTblEntry */
ExplainOpenGroup("Range Table", "Range Table", false, es);
@@ -405,6 +454,18 @@ overexplain_range_table(PlannedStmt *plannedstmt, ExplainState *es)
RangeTblEntry *rte = rt_fetch(rti, plannedstmt->rtable);
char *kind = NULL;
char *relkind;
+ SubPlanRTInfo *next_rtinfo;
+
+		/* Advance to the next SubPlanRTInfo, if it's time. */
+ if (lc_subrtinfo != NULL)
+ {
+ next_rtinfo = lfirst(lc_subrtinfo);
+ if (rti > next_rtinfo->rtoffset)
+ {
+ rtinfo = next_rtinfo;
+ lc_subrtinfo = lnext(plannedstmt->subrtinfos, lc_subrtinfo);
+ }
+ }
/* NULL entries are possible; skip them */
if (rte == NULL)
@@ -469,6 +530,28 @@ overexplain_range_table(PlannedStmt *plannedstmt, ExplainState *es)
ExplainPropertyBool("In From Clause", rte->inFromCl, es);
}
+ /*
+ * Indicate which subplan is the origin of which RTE. Note dummy
+ * subplans. Here again, we crunch more onto one line in text format.
+ */
+ if (rtinfo != NULL)
+ {
+ if (es->format == EXPLAIN_FORMAT_TEXT)
+ {
+ if (!rtinfo->dummy)
+ ExplainPropertyText("Subplan", rtinfo->plan_name, es);
+ else
+ ExplainPropertyText("Subplan",
+ psprintf("%s (dummy)",
+ rtinfo->plan_name), es);
+ }
+ else
+ {
+ ExplainPropertyText("Subplan", rtinfo->plan_name, es);
+ ExplainPropertyBool("Subplan Is Dummy", rtinfo->dummy, es);
+ }
+ }
+
/* rte->alias is optional; rte->eref is requested */
if (rte->alias != NULL)
overexplain_alias("Alias", rte->alias, es);
@@ -740,6 +823,54 @@ overexplain_bitmapset(const char *qlabel, Bitmapset *bms, ExplainState *es)
pfree(buf.data);
}
+/*
+ * Emit a text property describing the contents of a list of bitmapsets.
+ * If a bitmapset contains exactly 1 member, we just print an integer;
+ * otherwise, we surround the list of members by parentheses.
+ *
+ * If there are no bitmapsets in the list, we print the word "none".
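+ *
+ * For example, a list containing the singleton set {2} followed by the
+ * set {4, 5} would be printed as "2 (4 5)".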
+ */
+static void
+overexplain_bitmapset_list(const char *qlabel, List *bms_list,
+ ExplainState *es)
+{
+ StringInfoData buf;
+
+ initStringInfo(&buf);
+
+ foreach_node(Bitmapset, bms, bms_list)
+ {
+ if (bms_membership(bms) == BMS_SINGLETON)
+ appendStringInfo(&buf, " %d", bms_singleton_member(bms));
+ else
+ {
+ int x = -1;
+ bool first = true;
+
+ appendStringInfoString(&buf, " (");
+ while ((x = bms_next_member(bms, x)) >= 0)
+ {
+ if (first)
+ first = false;
+ else
+ appendStringInfoChar(&buf, ' ');
+ appendStringInfo(&buf, "%d", x);
+ }
+ appendStringInfoChar(&buf, ')');
+ }
+ }
+
+ if (buf.len == 0)
+ {
+ ExplainPropertyText(qlabel, "none", es);
+ return;
+ }
+
+ Assert(buf.data[0] == ' ');
+ ExplainPropertyText(qlabel, buf.data + 1, es);
+ pfree(buf.data);
+}
+
/*
* Emit a text property describing the contents of a list of integers, OIDs,
* or XIDs -- either a space-separated list of integer members, or the word
diff --git a/contrib/pg_overexplain/sql/pg_overexplain.sql b/contrib/pg_overexplain/sql/pg_overexplain.sql
index 42e275ac2f9..34a957cbed3 100644
--- a/contrib/pg_overexplain/sql/pg_overexplain.sql
+++ b/contrib/pg_overexplain/sql/pg_overexplain.sql
@@ -110,3 +110,13 @@ SELECT * FROM vegetables WHERE genus = 'daucus';
-- Also test a case that involves a write.
EXPLAIN (RANGE_TABLE, COSTS OFF)
INSERT INTO vegetables (name, genus) VALUES ('broccoflower', 'brassica');
+
+-- should show "Subplan: sub"
+EXPLAIN (RANGE_TABLE, COSTS OFF)
+SELECT * FROM vegetables v,
+ (SELECT * FROM vegetables WHERE genus = 'daucus' OFFSET 0) sub;
+
+-- should show "Subplan: unnamed_subquery"
+EXPLAIN (RANGE_TABLE, COSTS OFF)
+SELECT * FROM vegetables v,
+ (SELECT * FROM vegetables WHERE genus = 'daucus' OFFSET 0);
diff --git a/contrib/pg_plan_advice/.gitignore b/contrib/pg_plan_advice/.gitignore
new file mode 100644
index 00000000000..19a14253019
--- /dev/null
+++ b/contrib/pg_plan_advice/.gitignore
@@ -0,0 +1,3 @@
+/pgpa_parser.h
+/pgpa_parser.c
+/pgpa_scanner.c
diff --git a/contrib/pg_plan_advice/Makefile b/contrib/pg_plan_advice/Makefile
new file mode 100644
index 00000000000..1d4c559aed8
--- /dev/null
+++ b/contrib/pg_plan_advice/Makefile
@@ -0,0 +1,50 @@
+# contrib/pg_plan_advice/Makefile
+
+MODULE_big = pg_plan_advice
+OBJS = \
+ $(WIN32RES) \
+ pg_plan_advice.o \
+ pgpa_ast.o \
+ pgpa_collector.o \
+ pgpa_identifier.o \
+ pgpa_join.o \
+ pgpa_output.o \
+ pgpa_parser.o \
+ pgpa_planner.o \
+ pgpa_scan.o \
+ pgpa_scanner.o \
+ pgpa_trove.o \
+ pgpa_walker.o
+
+EXTENSION = pg_plan_advice
+DATA = pg_plan_advice--1.0.sql
+PGFILEDESC = "pg_plan_advice - help the planner get the right plan"
+
+REGRESS = gather join_order join_strategy partitionwise scan
+TAP_TESTS = 1
+
+EXTRA_CLEAN = pgpa_parser.h pgpa_parser.c pgpa_scanner.c
+
+# required for 001_regress.pl
+REGRESS_SHLIB=$(abs_top_builddir)/src/test/regress/regress$(DLSUFFIX)
+export REGRESS_SHLIB
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/pg_plan_advice
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
+
+# See notes in src/backend/parser/Makefile about the following two rules
+pgpa_parser.h: pgpa_parser.c
+ touch $@
+
+pgpa_parser.c: BISONFLAGS += -d
+
+# Force these dependencies to be known even without dependency info built:
+pgpa_parser.o pgpa_scanner.o: pgpa_parser.h
diff --git a/contrib/pg_plan_advice/README b/contrib/pg_plan_advice/README
new file mode 100644
index 00000000000..0b888fd82f2
--- /dev/null
+++ b/contrib/pg_plan_advice/README
@@ -0,0 +1,260 @@
+contrib/pg_plan_advice/README
+
+Plan Advice
+===========
+
+This module implements a mini-language for "plan advice" that allows for
+control of certain key planner decisions. Goals include (1) enforcing plan
+stability (my previous plan was good and I would like to keep getting a
+similar one) and (2) allowing users to experiment with plans other than
+the one preferred by the optimizer. Non-goals include (1) controlling
+every possible planner decision and (2) forcing consideration of plans
+that the optimizer rejects for reasons other than cost. (There is some
+room for bikeshedding about what exactly this non-goal means: what if
+we skip path generation entirely for a certain case on the theory that
+we know it cannot win on cost? Does that count as a cost-based rejection
+even though no cost was ever computed?)
+
+Generally, plan advice is a series of whitespace-separated advice items,
+each of which applies an advice tag to a list of advice targets. For
+example, "SEQ_SCAN(foo) HASH_JOIN(bar@ss)" contains two items of advice,
+the first of which applies the SEQ_SCAN tag to "foo" and the second of
+which applies the HASH_JOIN tag to "bar@ss". In this simple example, each
+target identifies a single relation; see "Relation Identifiers", below.
+Advice tags can also be applied to groups of relations; for example,
+"HASH_JOIN(baz (bletch quux))" applies the HASH_JOIN tag to the single
+relation identifier "baz" as well as to the 2-item list containing
+"bletch" and "quux".
+
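+For example, advice can be supplied via the pg_plan_advice.advice setting
+and displayed using the PLAN_ADVICE option to EXPLAIN, as this module's
+regression tests do. A minimal sketch, with purely illustrative table and
+alias names:
+
+    LOAD 'pg_plan_advice';
+    SET pg_plan_advice.advice = 'SEQ_SCAN(f) HASH_JOIN(d)';
+    EXPLAIN (COSTS OFF, PLAN_ADVICE)
+        SELECT * FROM some_fact f JOIN some_dim d ON f.dim_id = d.id;
+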
+Critically, this module knows both how to generate plan advice from an
+already-existing plan, and also how to enforce it during future planning
+cycles. Everything it does is intended to be "round-trip safe": if you
+generate advice from a plan and then feed that back into a future planning
+cycle, each piece of advice should be guaranteed to apply to exactly the
+same part of the query from which it was generated without ambiguity or
+guesswork, and it should successfully enforce the same planning decision that
+led to it being generated in the first place. Note that there is no
+intention that these guarantees hold in the presence of intervening DDL;
+e.g. if you change the properties of a function so that a subquery is no
+longer inlined, or if you drop an index named in the plan advice, the advice
+isn't going to work any more. That's expected.
+
+This module aims to force the planner to follow any provided advice without
+regard to whether it appears to be good advice or bad advice. If the
+user provides bad advice, whether derived from a previously-generated plan
+or manually written, they may get a bad plan. We regard this as user error,
+not a defect in this module. It seems likely that applying advice
+judiciously and only when truly required to avoid problems will be a more
+successful strategy than applying it with a broad brush, but users are free
+to experiment with whatever strategies they think best.
+
+Relation Identifiers
+====================
+
+Uniquely identifying the part of a query to which a certain piece of
+advice applies is harder than it sounds. Our basic approach is to use
+relation aliases as a starting point, and then disambiguate. There are
+three ways that the same relation alias can occur multiple times:
+
+1. It can appear in more than one subquery.
+
+2. It can appear more than once in the same subquery,
+ e.g. (foo JOIN bar) x JOIN foo.
+
+3. The table can be partitioned.
+
+Any combination of these things can occur simultaneously. Therefore, our
+general syntax for a relation identifier is:
+
+alias_name#occurrence_number/partition_schema.partition_name@plan_name
+
+All components except for the alias_name are optional and included only
+when required. When a component is omitted, the associated punctuation
+must also be omitted. Occurrence numbers are counted ignoring children of
+partitioned tables. When the generated occurrence number is 1, we omit
+the occurrence number. The partition schema and partition name are included
+only for children of partitioned tables. In generated advice, the
+partition_schema is always included whenever there is a partition_name,
+but user-written advice may mention the name and omit the schema. The
+plan_name is omitted for the top-level PlannerInfo.
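+
+As an illustration (all of these names are hypothetical), "foo" identifies
+the only occurrence of alias foo in the top-level query; "foo#2" its
+second occurrence; "foo/public.foo_p1" the partition public.foo_p1 scanned
+as a child of partitioned table foo; and "foo@sub" the occurrence of foo
+in the subplan named sub. Combining every component yields
+"foo#2/public.foo_p1@sub".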
+
+Scan Advice
+===========
+
+For many types of scan, no advice is generated or possible; for instance,
+a subquery is always scanned using a subquery scan. While that scan may be
+elided via setrefs processing, this doesn't change the fact that only one
+basic approach exists. Hence, scan advice applies mostly to relations, which
+can be scanned in multiple ways.
+
+We tend to think of a scan as targeting a single relation, and that's
+normally the case, but it doesn't have to be. For instance, if a join is
+proven empty, the whole thing may be replaced with a single Result node
+which, in effect, is a degenerate scan of every relation in the collapsed
+portion of the join tree. Similarly, it's possible to inject a custom scan
+in such a way that it replaces an entire join. If we ever emit advice
+for these cases, it would target sets of relation identifiers surrounded
+by parentheses, e.g. SOME_SORT_OF_SCAN(foo (bar baz)) would mean that the
+given scan type would be used for foo as a single relation and also the
+combination of bar and baz as a join product. We have no such cases at
+present.
+
+For index and index-only scans, both the relation being scanned and the
+index or indexes being used must be specified. For example, INDEX_SCAN(foo
+foo_a_idx bar bar_b_idx) indicates that an index scan (not an index-only
+scan) should be used on foo_a_idx when scanning foo, and that an index scan
+should be used on bar_b_idx when scanning bar.
+
+Bitmap heap scans currently do not allow for an index specification:
+BITMAP_HEAP_SCAN(foo bar) simply means that each of foo and bar should use
+some sort of bitmap heap scan.
+
+Join Order Advice
+=================
+
+The JOIN_ORDER tag specifies the order in which several tables that are
+part of the same join problem should be joined. Each subquery (except for
+those that are inlined) is a separate join problem. Within a subquery,
+partitionwise joins can create additional, separate join problems. Hence,
+queries involving partitionwise joins may use JOIN_ORDER() many times.
+
+We take the canonical join structure to be an outer-deep tree, so
+JOIN_ORDER(t1 t2 t3) says that t1 is the driving table and should be joined
+first to t2 and then to t3. If the join problem involves additional tables,
+they can be joined in any order after the join between t1, t2, and t3 has
+been constructed. Generated join advice always mentions all tables
+in the join problem, but manually written join advice need not do so.
+
+For trees which are not outer-deep, parentheses can be used. For example,
+JOIN_ORDER(t1 (t2 t3)) says that the top-level join should have t1 on the
+outer side and a join between t2 and t3 on the inner side. That join should
+be constructed so that t2 is on the outer side and t3 is on the inner side.
+
+In some cases, it's not possible to fully specify the join order in this way.
+For example, if t2 and t3 are being scanned by a single custom scan or foreign
+scan, or if a partitionwise join is being performed between those tables, then
+it's impossible to say that t2 is the outer table and t3 is the inner table,
+or the other way around; it's just undefined. In such cases, we generate
+join advice that uses curly braces, intending to indicate a lack of ordering:
+JOIN_ORDER(t1 {t2 t3}) says that the uppermost join should have t1 on the outer
+side and some kind of join between t2 and t3 on the inner side, but without
+saying how that join must be performed or anything about which relation should
+appear on which side of the join, or even whether this kind of join has sides.
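+
+As a usage sketch, borrowed from this module's regression tests, a
+non-outer-deep join order can be forced like this:
+
+    SET LOCAL pg_plan_advice.advice = 'join_order(f (d1 d2))';
+    EXPLAIN (COSTS OFF, PLAN_ADVICE)
+    SELECT * FROM jo_fact f
+        LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id
+        LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id
+        WHERE val1 = 1 AND val2 = 1;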
+
+Join Strategy Advice
+====================
+
+Tags such as NESTED_LOOP_PLAIN specify the method that should be used to
+perform a certain join. More specifically, NESTED_LOOP_PLAIN(x (y z)) says
+that the plan should put the relation whose identifier is "x" on the inner
+side of a plain nested loop (one without materialization or memoization)
+and that it should also put a join between the relation whose identifier is
+"y" and the relation whose identifier is "z" on the inner side of a nested
+loop. Hence, for an N-table join problem, there will be N-1 pieces of join
+strategy advice; no join strategy advice is required for the outermost
+table in the join problem.
+
+Considering that we have both join order advice and join strategy advice,
+it might seem natural to say that NESTED_LOOP_PLAIN(x) should be redefined
+to mean that x should appear by itself on one side or the other of a nested
+loop, rather than specifically on the inner side, but this definition appears
+useless in practice. It gives the planner too much freedom to do things that
+bear little resemblance to what the user probably had in mind. This makes
+only a limited amount of practical difference in the case of a merge join or
+unparameterized nested loop, but for a parameterized nested loop or a hash
+join, the two sides are treated very differently and saying that a certain
+relation should be involved in one of those operations without saying which
+role it should take isn't saying much.
+
+This choice of definition implies that join strategy advice also imposes some
+join order constraints. For example, given a join between foo and bar,
+HASH_JOIN(bar) implies that foo is the driving table. Otherwise, it would
+be impossible to put bar beneath the inner side of a Hash Join.
+
+Note that, given this definition, it's reasonable to consider deleting the
+join order advice but applying the join strategy advice. For example,
+consider a star schema with tables fact, dim1, dim2, dim3, dim4, and dim5.
+The automatically generated advice might specify JOIN_ORDER(fact dim1 dim3
+dim4 dim2 dim5) HASH_JOIN(dim2 dim4) NESTED_LOOP_PLAIN(dim1 dim3 dim5).
+Deleting the JOIN_ORDER advice allows the planner to reorder the joins
+however it likes while still forcing the same choice of join method. This
+seems potentially useful, and is one reason why a unified syntax that controls
+both join order and join method in a single locution was not chosen.
+
+Advice Completeness
+===================
+
+An essential guiding principle is that no inference may be made on the basis
+of the absence of advice. The user is entitled to remove any portion of the
+generated advice which they deem unsuitable or counterproductive and the
+result should only be to increase the flexibility afforded to the planner.
+This means that if advice can say that a certain optimization or technique
+should be used, it should also be able to say that the optimization or
+technique should not be used. We should never assume that the absence of an
+instruction to do a certain thing means that it should not be done; all
+instructions must be explicit.
+
+Semijoin Uniqueness
+===================
+
+Faced with a semijoin, the planner considers both a direct implementation
+and a plan where one side is made unique and then an inner join is
+performed. We emit SEMIJOIN_UNIQUE() advice when this transformation occurs
+and SEMIJOIN_NON_UNIQUE() advice when it doesn't. These items work like
+join strategy advice: the inner side of the relevant join is named, and the
+chosen join order must be compatible with the advice having some effect.
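+
+For example (with a hypothetical alias), SEMIJOIN_UNIQUE(bar) says that the
+semijoin whose inner side is bar should be implemented by making bar unique
+and then performing an inner join, while SEMIJOIN_NON_UNIQUE(bar) says that
+the semijoin should be implemented directly.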
+
+Partitionwise
+=============
+
+PARTITIONWISE() advice can be used to specify both those partitionwise joins
+which should be performed and those which should not be performed; the idea
+is that each argument to PARTITIONWISE specifies a set of relations that
+should be scanned partitionwise after being joined to each other and to
+nothing else. Hence, for example, PARTITIONWISE((t1 t2) t3) specifies that the
+query should contain a partitionwise join between t1 and t2 and that t3
+should not be part of any partitionwise join. If there are no other rels
+in the query, specifying just PARTITIONWISE((t1 t2)) would have the same
+effect, since there would be no other rels to which t3 could be joined in
+a partitionwise fashion.
+
+Parallel Query (Gather, etc.)
+=============================
+
+Each argument to GATHER() or GATHER_MERGE() is a single relation or an
+exact set of relations on top of which a Gather or Gather Merge node,
+respectively, should be placed. Each argument to NO_GATHER() is a single
+relation that should not appear beneath any Gather or Gather Merge node;
+that is, parallelism should not be used.
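+
+For example, using aliases from this module's regression tests, the advice
+'gather_merge(f) no_gather(d)' places a Gather Merge above the scan of f
+while leaving everything involving d entirely non-parallel.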
+
+Implicit Join Order Constraints
+===============================
+
+When JOIN_ORDER() advice is not provided for a particular join problem,
+other pieces of advice may still incidentally constrain the join order.
+For example, a user who specifies HASH_JOIN((foo bar)) is explicitly saying
+that there should be a hash join with exactly foo and bar on the inner
+side of it, but that also implies that foo and bar must be joined to
+each other before either of them is joined to anything else. Otherwise,
+the join the user is attempting to constrain won't actually occur in the
+query, which ends up looking like the system has just decided to ignore
+the advice altogether.
+
+Future Work
+===========
+
+We don't handle choice of aggregation: it would be nice to be able to force
+sorted or hashed aggregation. I'm guessing this can be left to future work.
+
+More seriously, we don't know anything about eager aggregation, which could
+have a large impact on the shape of the plan tree. XXX: This needs some study
+to determine how large a problem it is, and might need to be fixed sooner
+rather than later.
+
+We don't offer any control over estimates, only outcomes. It seems like a
+good idea to incorporate that ability at some future point, as pg_hint_plan
+does. However, since the primary goal of the initial development work is to be
+able to induce the planner to recreate a desired plan that worked well in
+the past, this has not been included in the initial development effort.
+
+XXX Need to investigate whether and how well supplying advice works with GEQO
diff --git a/contrib/pg_plan_advice/expected/gather.out b/contrib/pg_plan_advice/expected/gather.out
new file mode 100644
index 00000000000..0cc0dedf859
--- /dev/null
+++ b/contrib/pg_plan_advice/expected/gather.out
@@ -0,0 +1,371 @@
+LOAD 'pg_plan_advice';
+SET max_parallel_workers_per_gather = 1;
+SET parallel_setup_cost = 0;
+SET parallel_tuple_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET debug_parallel_query = off;
+CREATE TABLE gt_dim (id serial primary key, dim text)
+ WITH (autovacuum_enabled = false);
+INSERT INTO gt_dim (dim) SELECT random()::text FROM generate_series(1,100) g;
+VACUUM ANALYZE gt_dim;
+CREATE TABLE gt_fact (
+ id int not null,
+ dim_id integer not null references gt_dim (id)
+) WITH (autovacuum_enabled = false);
+INSERT INTO gt_fact
+ SELECT g, (g%3)+1 FROM generate_series(1,100000) g;
+VACUUM ANALYZE gt_fact;
+-- By default, we expect Gather Merge with a parallel hash join.
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+ QUERY PLAN
+-------------------------------------------------------
+ Gather Merge
+ Workers Planned: 1
+ -> Sort
+ Sort Key: f.dim_id
+ -> Parallel Hash Join
+ Hash Cond: (f.dim_id = d.id)
+ -> Parallel Seq Scan on gt_fact f
+ -> Parallel Hash
+ -> Parallel Seq Scan on gt_dim d
+ Generated Plan Advice:
+ JOIN_ORDER(f d)
+ HASH_JOIN(d)
+ SEQ_SCAN(f d)
+ GATHER_MERGE((f d))
+(14 rows)
+
+-- Force Gather or Gather Merge of both relations together.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'gather_merge((f d))';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+ QUERY PLAN
+-------------------------------------------------------
+ Gather Merge
+ Workers Planned: 1
+ -> Sort
+ Sort Key: f.dim_id
+ -> Parallel Hash Join
+ Hash Cond: (f.dim_id = d.id)
+ -> Parallel Seq Scan on gt_fact f
+ -> Parallel Hash
+ -> Parallel Seq Scan on gt_dim d
+ Supplied Plan Advice:
+ GATHER_MERGE((f d)) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(f d)
+ HASH_JOIN(d)
+ SEQ_SCAN(f d)
+ GATHER_MERGE((f d))
+(16 rows)
+
+SET LOCAL pg_plan_advice.advice = 'gather((f d))';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+ QUERY PLAN
+-------------------------------------------------------
+ Sort
+ Sort Key: f.dim_id
+ -> Gather
+ Workers Planned: 1
+ -> Parallel Hash Join
+ Hash Cond: (f.dim_id = d.id)
+ -> Parallel Seq Scan on gt_fact f
+ -> Parallel Hash
+ -> Parallel Seq Scan on gt_dim d
+ Supplied Plan Advice:
+ GATHER((f d)) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(f d)
+ HASH_JOIN(d)
+ SEQ_SCAN(f d)
+ GATHER((f d))
+(16 rows)
+
+COMMIT;
+-- Force a separate Gather or Gather Merge operation for each relation.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'gather_merge(f d)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+ QUERY PLAN
+--------------------------------------------------
+ Merge Join
+ Merge Cond: (f.dim_id = d.id)
+ -> Gather Merge
+ Workers Planned: 1
+ -> Sort
+ Sort Key: f.dim_id
+ -> Parallel Seq Scan on gt_fact f
+ -> Gather Merge
+ Workers Planned: 1
+ -> Sort
+ Sort Key: d.id
+ -> Parallel Seq Scan on gt_dim d
+ Supplied Plan Advice:
+ GATHER_MERGE(f) /* matched */
+ GATHER_MERGE(d) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(f d)
+ MERGE_JOIN_PLAIN(d)
+ SEQ_SCAN(f d)
+ GATHER_MERGE(f d)
+(20 rows)
+
+SET LOCAL pg_plan_advice.advice = 'gather(f d)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+ QUERY PLAN
+--------------------------------------------------
+ Merge Join
+ Merge Cond: (f.dim_id = d.id)
+ -> Sort
+ Sort Key: f.dim_id
+ -> Gather
+ Workers Planned: 1
+ -> Parallel Seq Scan on gt_fact f
+ -> Sort
+ Sort Key: d.id
+ -> Gather
+ Workers Planned: 1
+ -> Parallel Seq Scan on gt_dim d
+ Supplied Plan Advice:
+ GATHER(f) /* matched */
+ GATHER(d) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(f d)
+ MERGE_JOIN_PLAIN(d)
+ SEQ_SCAN(f d)
+ GATHER(f d)
+(20 rows)
+
+SET LOCAL pg_plan_advice.advice = 'gather((d d/d.d))';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+ QUERY PLAN
+--------------------------------------------------
+ Merge Join
+ Merge Cond: (f.dim_id = d.id)
+ -> Gather Merge
+ Workers Planned: 1
+ -> Sort
+ Sort Key: f.dim_id
+ -> Parallel Seq Scan on gt_fact f
+ -> Index Scan using gt_dim_pkey on gt_dim d
+ Supplied Plan Advice:
+ GATHER((d d/d.d)) /* partially matched */
+ Generated Plan Advice:
+ JOIN_ORDER(f d)
+ MERGE_JOIN_PLAIN(d)
+ SEQ_SCAN(f)
+ INDEX_SCAN(d public.gt_dim_pkey)
+ GATHER_MERGE(f)
+ NO_GATHER(d)
+(17 rows)
+
+COMMIT;
+-- Force a Gather or Gather Merge on one relation but no parallelism on other.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'gather_merge(f) no_gather(d)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+ QUERY PLAN
+--------------------------------------------------
+ Merge Join
+ Merge Cond: (f.dim_id = d.id)
+ -> Gather Merge
+ Workers Planned: 1
+ -> Sort
+ Sort Key: f.dim_id
+ -> Parallel Seq Scan on gt_fact f
+ -> Index Scan using gt_dim_pkey on gt_dim d
+ Supplied Plan Advice:
+ GATHER_MERGE(f) /* matched */
+ NO_GATHER(d) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(f d)
+ MERGE_JOIN_PLAIN(d)
+ SEQ_SCAN(f)
+ INDEX_SCAN(d public.gt_dim_pkey)
+ GATHER_MERGE(f)
+ NO_GATHER(d)
+(18 rows)
+
+SET LOCAL pg_plan_advice.advice = 'gather_merge(d) no_gather(f)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+ QUERY PLAN
+-------------------------------------------------
+ Merge Join
+ Merge Cond: (f.dim_id = d.id)
+ -> Sort
+ Sort Key: f.dim_id
+ -> Seq Scan on gt_fact f
+ -> Gather Merge
+ Workers Planned: 1
+ -> Sort
+ Sort Key: d.id
+ -> Parallel Seq Scan on gt_dim d
+ Supplied Plan Advice:
+ GATHER_MERGE(d) /* matched */
+ NO_GATHER(f) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(f d)
+ MERGE_JOIN_PLAIN(d)
+ SEQ_SCAN(f d)
+ GATHER_MERGE(d)
+ NO_GATHER(f)
+(19 rows)
+
+SET LOCAL pg_plan_advice.advice = 'gather(f) no_gather(d)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+ QUERY PLAN
+--------------------------------------------------
+ Merge Join
+ Merge Cond: (d.id = f.dim_id)
+ -> Index Scan using gt_dim_pkey on gt_dim d
+ -> Sort
+ Sort Key: f.dim_id
+ -> Gather
+ Workers Planned: 1
+ -> Parallel Seq Scan on gt_fact f
+ Supplied Plan Advice:
+ GATHER(f) /* matched */
+ NO_GATHER(d) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(d f)
+ MERGE_JOIN_PLAIN(f)
+ SEQ_SCAN(f)
+ INDEX_SCAN(d public.gt_dim_pkey)
+ GATHER(f)
+ NO_GATHER(d)
+(18 rows)
+
+SET LOCAL pg_plan_advice.advice = 'gather(d) no_gather(f)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+ QUERY PLAN
+-------------------------------------------------
+ Merge Join
+ Merge Cond: (f.dim_id = d.id)
+ -> Sort
+ Sort Key: f.dim_id
+ -> Seq Scan on gt_fact f
+ -> Sort
+ Sort Key: d.id
+ -> Gather
+ Workers Planned: 1
+ -> Parallel Seq Scan on gt_dim d
+ Supplied Plan Advice:
+ GATHER(d) /* matched */
+ NO_GATHER(f) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(f d)
+ MERGE_JOIN_PLAIN(d)
+ SEQ_SCAN(f d)
+ GATHER(d)
+ NO_GATHER(f)
+(19 rows)
+
+COMMIT;
+-- Force no Gather or Gather Merge use at all.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'no_gather(f d)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+ QUERY PLAN
+------------------------------------------------
+ Merge Join
+ Merge Cond: (d.id = f.dim_id)
+ -> Index Scan using gt_dim_pkey on gt_dim d
+ -> Sort
+ Sort Key: f.dim_id
+ -> Seq Scan on gt_fact f
+ Supplied Plan Advice:
+ NO_GATHER(f) /* matched */
+ NO_GATHER(d) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(d f)
+ MERGE_JOIN_PLAIN(f)
+ SEQ_SCAN(f)
+ INDEX_SCAN(d public.gt_dim_pkey)
+ NO_GATHER(f d)
+(15 rows)
+
+COMMIT;
+-- Can't force Gather Merge without the ORDER BY clause, but just Gather is OK.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'gather_merge((f d))';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id;
+ QUERY PLAN
+-------------------------------------------------
+ Gather
+ Disabled: true
+ Workers Planned: 1
+ -> Parallel Hash Join
+ Hash Cond: (f.dim_id = d.id)
+ -> Parallel Seq Scan on gt_fact f
+ -> Parallel Hash
+ -> Parallel Seq Scan on gt_dim d
+ Supplied Plan Advice:
+ GATHER_MERGE((f d)) /* matched, failed */
+ Generated Plan Advice:
+ JOIN_ORDER(f d)
+ HASH_JOIN(d)
+ SEQ_SCAN(f d)
+ GATHER((f d))
+(15 rows)
+
+SET LOCAL pg_plan_advice.advice = 'gather((f d))';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id;
+ QUERY PLAN
+-------------------------------------------------
+ Gather
+ Workers Planned: 1
+ -> Parallel Hash Join
+ Hash Cond: (f.dim_id = d.id)
+ -> Parallel Seq Scan on gt_fact f
+ -> Parallel Hash
+ -> Parallel Seq Scan on gt_dim d
+ Supplied Plan Advice:
+ GATHER((f d)) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(f d)
+ HASH_JOIN(d)
+ SEQ_SCAN(f d)
+ GATHER((f d))
+(14 rows)
+
+COMMIT;
+-- Test conflicting advice.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'gather((f d)) no_gather(f)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+ QUERY PLAN
+-------------------------------------------------------
+ Gather Merge
+ Workers Planned: 1
+ -> Sort
+ Sort Key: f.dim_id
+ -> Parallel Hash Join
+ Hash Cond: (f.dim_id = d.id)
+ -> Parallel Seq Scan on gt_fact f
+ -> Parallel Hash
+ -> Parallel Seq Scan on gt_dim d
+ Supplied Plan Advice:
+ GATHER((f d)) /* matched, conflicting, failed */
+ NO_GATHER(f) /* matched, conflicting, failed */
+ Generated Plan Advice:
+ JOIN_ORDER(f d)
+ HASH_JOIN(d)
+ SEQ_SCAN(f d)
+ GATHER_MERGE((f d))
+(17 rows)
+
+COMMIT;
diff --git a/contrib/pg_plan_advice/expected/join_order.out b/contrib/pg_plan_advice/expected/join_order.out
new file mode 100644
index 00000000000..db0dcef7012
--- /dev/null
+++ b/contrib/pg_plan_advice/expected/join_order.out
@@ -0,0 +1,509 @@
+LOAD 'pg_plan_advice';
+SET max_parallel_workers_per_gather = 0;
+CREATE TABLE jo_dim1 (id integer primary key, dim1 text, val1 int)
+ WITH (autovacuum_enabled = false);
+INSERT INTO jo_dim1 (id, dim1, val1)
+ SELECT g, 'some filler text ' || g, (g % 3) + 1
+ FROM generate_series(1,100) g;
+VACUUM ANALYZE jo_dim1;
+CREATE TABLE jo_dim2 (id integer primary key, dim2 text, val2 int)
+ WITH (autovacuum_enabled = false);
+INSERT INTO jo_dim2 (id, dim2, val2)
+ SELECT g, 'some filler text ' || g, (g % 7) + 1
+ FROM generate_series(1,1000) g;
+VACUUM ANALYZE jo_dim2;
+CREATE TABLE jo_fact (
+ id int primary key,
+ dim1_id integer not null references jo_dim1 (id),
+ dim2_id integer not null references jo_dim2 (id)
+) WITH (autovacuum_enabled = false);
+INSERT INTO jo_fact
+ SELECT g, (g%100)+1, (g%100)+1 FROM generate_series(1,100000) g;
+VACUUM ANALYZE jo_fact;
+-- We expect to join to d2 first and then d1, since the condition on d2
+-- is more selective.
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_fact f
+ LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id
+ LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id
+ WHERE val1 = 1 AND val2 = 1;
+ QUERY PLAN
+------------------------------------------
+ Hash Join
+ Hash Cond: (f.dim1_id = d1.id)
+ -> Hash Join
+ Hash Cond: (f.dim2_id = d2.id)
+ -> Seq Scan on jo_fact f
+ -> Hash
+ -> Seq Scan on jo_dim2 d2
+ Filter: (val2 = 1)
+ -> Hash
+ -> Seq Scan on jo_dim1 d1
+ Filter: (val1 = 1)
+ Generated Plan Advice:
+ JOIN_ORDER(f d2 d1)
+ HASH_JOIN(d2 d1)
+ SEQ_SCAN(f d2 d1)
+ NO_GATHER(f d1 d2)
+(16 rows)
+
+-- Force a few different join orders. Some of these are very inefficient,
+-- but the planner considers them all viable.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'join_order(f d1 d2)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_fact f
+ LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id
+ LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id
+ WHERE val1 = 1 AND val2 = 1;
+ QUERY PLAN
+------------------------------------------
+ Hash Join
+ Hash Cond: (f.dim2_id = d2.id)
+ -> Hash Join
+ Hash Cond: (f.dim1_id = d1.id)
+ -> Seq Scan on jo_fact f
+ -> Hash
+ -> Seq Scan on jo_dim1 d1
+ Filter: (val1 = 1)
+ -> Hash
+ -> Seq Scan on jo_dim2 d2
+ Filter: (val2 = 1)
+ Supplied Plan Advice:
+ JOIN_ORDER(f d1 d2) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(f d1 d2)
+ HASH_JOIN(d1 d2)
+ SEQ_SCAN(f d1 d2)
+ NO_GATHER(f d1 d2)
+(18 rows)
+
+SET LOCAL pg_plan_advice.advice = 'join_order(f d2 d1)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_fact f
+ LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id
+ LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id
+ WHERE val1 = 1 AND val2 = 1;
+ QUERY PLAN
+------------------------------------------
+ Hash Join
+ Hash Cond: (f.dim1_id = d1.id)
+ -> Hash Join
+ Hash Cond: (f.dim2_id = d2.id)
+ -> Seq Scan on jo_fact f
+ -> Hash
+ -> Seq Scan on jo_dim2 d2
+ Filter: (val2 = 1)
+ -> Hash
+ -> Seq Scan on jo_dim1 d1
+ Filter: (val1 = 1)
+ Supplied Plan Advice:
+ JOIN_ORDER(f d2 d1) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(f d2 d1)
+ HASH_JOIN(d2 d1)
+ SEQ_SCAN(f d2 d1)
+ NO_GATHER(f d1 d2)
+(18 rows)
+
+SET LOCAL pg_plan_advice.advice = 'join_order(d1 f d2)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_fact f
+ LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id
+ LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id
+ WHERE val1 = 1 AND val2 = 1;
+ QUERY PLAN
+-----------------------------------------
+ Hash Join
+ Hash Cond: (f.dim2_id = d2.id)
+ -> Hash Join
+ Hash Cond: (d1.id = f.dim1_id)
+ -> Seq Scan on jo_dim1 d1
+ Filter: (val1 = 1)
+ -> Hash
+ -> Seq Scan on jo_fact f
+ -> Hash
+ -> Seq Scan on jo_dim2 d2
+ Filter: (val2 = 1)
+ Supplied Plan Advice:
+ JOIN_ORDER(d1 f d2) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(d1 f d2)
+ HASH_JOIN(f d2)
+ SEQ_SCAN(d1 f d2)
+ NO_GATHER(f d1 d2)
+(18 rows)
+
+SET LOCAL pg_plan_advice.advice = 'join_order(f (d1 d2))';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_fact f
+ LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id
+ LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id
+ WHERE val1 = 1 AND val2 = 1;
+ QUERY PLAN
+-------------------------------------------------------------
+ Merge Join
+ Merge Cond: ((f.dim2_id = d2.id) AND (f.dim1_id = d1.id))
+ -> Sort
+ Sort Key: f.dim2_id, f.dim1_id
+ -> Seq Scan on jo_fact f
+ -> Sort
+ Sort Key: d2.id, d1.id
+ -> Nested Loop
+ -> Seq Scan on jo_dim1 d1
+ Filter: (val1 = 1)
+ -> Materialize
+ -> Seq Scan on jo_dim2 d2
+ Filter: (val2 = 1)
+ Supplied Plan Advice:
+ JOIN_ORDER(f (d1 d2)) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(f (d1 d2))
+ MERGE_JOIN_PLAIN((d1 d2))
+ NESTED_LOOP_MATERIALIZE(d2)
+ SEQ_SCAN(f d1 d2)
+ NO_GATHER(f d1 d2)
+(21 rows)
+
+SET LOCAL pg_plan_advice.advice = 'join_order(f {d1 d2})';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_fact f
+ LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id
+ LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id
+ WHERE val1 = 1 AND val2 = 1;
+ QUERY PLAN
+-------------------------------------------------------------
+ Merge Join
+ Merge Cond: ((f.dim2_id = d2.id) AND (f.dim1_id = d1.id))
+ -> Sort
+ Sort Key: f.dim2_id, f.dim1_id
+ -> Seq Scan on jo_fact f
+ -> Sort
+ Sort Key: d2.id, d1.id
+ -> Nested Loop
+ -> Seq Scan on jo_dim2 d2
+ Filter: (val2 = 1)
+ -> Materialize
+ -> Seq Scan on jo_dim1 d1
+ Filter: (val1 = 1)
+ Supplied Plan Advice:
+ JOIN_ORDER(f {d1 d2}) /* matched, failed */
+ Generated Plan Advice:
+ JOIN_ORDER(f (d2 d1))
+ MERGE_JOIN_PLAIN((d1 d2))
+ NESTED_LOOP_MATERIALIZE(d1)
+ SEQ_SCAN(f d2 d1)
+ NO_GATHER(f d1 d2)
+(21 rows)
+
+COMMIT;
+-- Force a join order by mentioning just a prefix of the join list.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'join_order(d2)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_fact f
+ LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id
+ LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id
+ WHERE val1 = 1 AND val2 = 1;
+ QUERY PLAN
+------------------------------------------------
+ Hash Join
+ Hash Cond: (d2.id = f.dim2_id)
+ -> Seq Scan on jo_dim2 d2
+ Filter: (val2 = 1)
+ -> Hash
+ -> Hash Join
+ Hash Cond: (f.dim1_id = d1.id)
+ -> Seq Scan on jo_fact f
+ -> Hash
+ -> Seq Scan on jo_dim1 d1
+ Filter: (val1 = 1)
+ Supplied Plan Advice:
+ JOIN_ORDER(d2) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(d2 (f d1))
+ HASH_JOIN(d1 (f d1))
+ SEQ_SCAN(d2 f d1)
+ NO_GATHER(f d1 d2)
+(18 rows)
+
+SET LOCAL pg_plan_advice.advice = 'join_order(d2 d1)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_fact f
+ LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id
+ LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id
+ WHERE val1 = 1 AND val2 = 1;
+ QUERY PLAN
+-------------------------------------------------------------
+ Merge Join
+ Merge Cond: ((d2.id = f.dim2_id) AND (d1.id = f.dim1_id))
+ -> Sort
+ Sort Key: d2.id, d1.id
+ -> Nested Loop
+ -> Seq Scan on jo_dim2 d2
+ Filter: (val2 = 1)
+ -> Materialize
+ -> Seq Scan on jo_dim1 d1
+ Filter: (val1 = 1)
+ -> Sort
+ Sort Key: f.dim2_id, f.dim1_id
+ -> Seq Scan on jo_fact f
+ Supplied Plan Advice:
+ JOIN_ORDER(d2 d1) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(d2 d1 f)
+ MERGE_JOIN_PLAIN(f)
+ NESTED_LOOP_MATERIALIZE(d1)
+ SEQ_SCAN(d2 d1 f)
+ NO_GATHER(f d1 d2)
+(21 rows)
+
+COMMIT;
+-- jo_fact is not partitioned, but let's try pretending that it is and
+-- verifying that the advice does not apply.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'join_order(f/d1 d1 d2)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_fact f
+ LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id
+ LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id
+ WHERE val1 = 1 AND val2 = 1;
+ QUERY PLAN
+-------------------------------------------------------------
+ Nested Loop
+ Disabled: true
+ -> Nested Loop
+ Disabled: true
+ -> Seq Scan on jo_fact f
+ -> Index Scan using jo_dim1_pkey on jo_dim1 d1
+ Index Cond: (id = f.dim1_id)
+ Filter: (val1 = 1)
+ -> Index Scan using jo_dim2_pkey on jo_dim2 d2
+ Index Cond: (id = f.dim2_id)
+ Filter: (val2 = 1)
+ Supplied Plan Advice:
+ JOIN_ORDER(f/d1 d1 d2) /* partially matched */
+ Generated Plan Advice:
+ JOIN_ORDER(f d1 d2)
+ NESTED_LOOP_PLAIN(d1 d2)
+ SEQ_SCAN(f)
+ INDEX_SCAN(d1 public.jo_dim1_pkey d2 public.jo_dim2_pkey)
+ NO_GATHER(f d1 d2)
+(19 rows)
+
+SET LOCAL pg_plan_advice.advice = 'join_order(f/d1 (d1 d2))';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_fact f
+ LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id
+ LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id
+ WHERE val1 = 1 AND val2 = 1;
+ QUERY PLAN
+--------------------------------------------------------------
+ Nested Loop
+ Disabled: true
+ Join Filter: ((d1.id = f.dim1_id) AND (d2.id = f.dim2_id))
+ -> Nested Loop
+ -> Seq Scan on jo_dim1 d1
+ Filter: (val1 = 1)
+ -> Materialize
+ -> Seq Scan on jo_dim2 d2
+ Filter: (val2 = 1)
+ -> Seq Scan on jo_fact f
+ Supplied Plan Advice:
+ JOIN_ORDER(f/d1 (d1 d2)) /* partially matched */
+ Generated Plan Advice:
+ JOIN_ORDER(d1 d2 f)
+ NESTED_LOOP_PLAIN(f)
+ NESTED_LOOP_MATERIALIZE(d2)
+ SEQ_SCAN(d1 d2 f)
+ NO_GATHER(f d1 d2)
+(18 rows)
+
+COMMIT;
+-- The unusual formulation of this query is intended to prevent the query
+-- planner from reducing the FULL JOIN to some other join type, so that we
+-- can test what happens with a join type that cannot be reordered.
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_dim1 d1
+ INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0)
+ ON d1.id = f.dim1_id OR f.dim1_id IS NULL;
+ QUERY PLAN
+-------------------------------------------------------------
+ Nested Loop
+ Join Filter: ((d1.id = f.dim1_id) OR (f.dim1_id IS NULL))
+ -> Merge Full Join
+ Merge Cond: (((d2.id + 0)) = ((f.dim2_id + 0)))
+ -> Sort
+ Sort Key: ((d2.id + 0))
+ -> Seq Scan on jo_dim2 d2
+ -> Sort
+ Sort Key: ((f.dim2_id + 0))
+ -> Seq Scan on jo_fact f
+ -> Materialize
+ -> Seq Scan on jo_dim1 d1
+ Generated Plan Advice:
+ JOIN_ORDER(d2 f d1)
+ MERGE_JOIN_PLAIN(f)
+ NESTED_LOOP_MATERIALIZE(d1)
+ SEQ_SCAN(d2 f d1)
+ NO_GATHER(d1 f d2)
+(18 rows)
+
+-- We should not be able to force the planner to join f to d1 first, because
+-- that is not a valid join order, but we should be able to force the planner
+-- to make either d2 or f the driving table.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'join_order(f d1 d2)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_dim1 d1
+ INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0)
+ ON d1.id = f.dim1_id OR f.dim1_id IS NULL;
+ QUERY PLAN
+-------------------------------------------------------------
+ Nested Loop
+ Disabled: true
+ Join Filter: ((d1.id = f.dim1_id) OR (f.dim1_id IS NULL))
+ -> Merge Full Join
+ Disabled: true
+ Merge Cond: (((d2.id + 0)) = ((f.dim2_id + 0)))
+ -> Sort
+ Sort Key: ((d2.id + 0))
+ -> Seq Scan on jo_dim2 d2
+ -> Sort
+ Sort Key: ((f.dim2_id + 0))
+ -> Seq Scan on jo_fact f
+ -> Seq Scan on jo_dim1 d1
+ Supplied Plan Advice:
+ JOIN_ORDER(f d1 d2) /* partially matched */
+ Generated Plan Advice:
+ JOIN_ORDER(d2 f d1)
+ MERGE_JOIN_PLAIN(f)
+ NESTED_LOOP_PLAIN(d1)
+ SEQ_SCAN(d2 f d1)
+ NO_GATHER(d1 f d2)
+(21 rows)
+
+SET LOCAL pg_plan_advice.advice = 'join_order(f d2 d1)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_dim1 d1
+ INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0)
+ ON d1.id = f.dim1_id OR f.dim1_id IS NULL;
+ QUERY PLAN
+-------------------------------------------------------------
+ Nested Loop
+ Join Filter: ((d1.id = f.dim1_id) OR (f.dim1_id IS NULL))
+ -> Merge Full Join
+ Merge Cond: (((f.dim2_id + 0)) = ((d2.id + 0)))
+ -> Sort
+ Sort Key: ((f.dim2_id + 0))
+ -> Seq Scan on jo_fact f
+ -> Sort
+ Sort Key: ((d2.id + 0))
+ -> Seq Scan on jo_dim2 d2
+ -> Materialize
+ -> Seq Scan on jo_dim1 d1
+ Supplied Plan Advice:
+ JOIN_ORDER(f d2 d1) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(f d2 d1)
+ MERGE_JOIN_PLAIN(d2)
+ NESTED_LOOP_MATERIALIZE(d1)
+ SEQ_SCAN(f d2 d1)
+ NO_GATHER(d1 f d2)
+(20 rows)
+
+SET LOCAL pg_plan_advice.advice = 'join_order(d2 f d1)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_dim1 d1
+ INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0)
+ ON d1.id = f.dim1_id OR f.dim1_id IS NULL;
+ QUERY PLAN
+-------------------------------------------------------------
+ Nested Loop
+ Join Filter: ((d1.id = f.dim1_id) OR (f.dim1_id IS NULL))
+ -> Merge Full Join
+ Merge Cond: (((d2.id + 0)) = ((f.dim2_id + 0)))
+ -> Sort
+ Sort Key: ((d2.id + 0))
+ -> Seq Scan on jo_dim2 d2
+ -> Sort
+ Sort Key: ((f.dim2_id + 0))
+ -> Seq Scan on jo_fact f
+ -> Materialize
+ -> Seq Scan on jo_dim1 d1
+ Supplied Plan Advice:
+ JOIN_ORDER(d2 f d1) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(d2 f d1)
+ MERGE_JOIN_PLAIN(f)
+ NESTED_LOOP_MATERIALIZE(d1)
+ SEQ_SCAN(d2 f d1)
+ NO_GATHER(d1 f d2)
+(20 rows)
+
+COMMIT;
+-- Two incompatible join orders should conflict. In the second case,
+-- the conflict is implicit: if d1 is on the inner side of a join of any
+-- type, it cannot also be the driving table.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'join_order(f) join_order(d1)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_dim1 d1
+ INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0)
+ ON d1.id = f.dim1_id OR f.dim1_id IS NULL;
+ QUERY PLAN
+-------------------------------------------------------------
+ Nested Loop
+ Join Filter: ((d1.id = f.dim1_id) OR (f.dim1_id IS NULL))
+ -> Merge Full Join
+ Merge Cond: (((f.dim2_id + 0)) = ((d2.id + 0)))
+ -> Sort
+ Sort Key: ((f.dim2_id + 0))
+ -> Seq Scan on jo_fact f
+ -> Sort
+ Sort Key: ((d2.id + 0))
+ -> Seq Scan on jo_dim2 d2
+ -> Materialize
+ -> Seq Scan on jo_dim1 d1
+ Supplied Plan Advice:
+ JOIN_ORDER(f) /* matched, conflicting */
+ JOIN_ORDER(d1) /* matched, conflicting, failed */
+ Generated Plan Advice:
+ JOIN_ORDER(f d2 d1)
+ MERGE_JOIN_PLAIN(d2)
+ NESTED_LOOP_MATERIALIZE(d1)
+ SEQ_SCAN(f d2 d1)
+ NO_GATHER(d1 f d2)
+(21 rows)
+
+SET LOCAL pg_plan_advice.advice = 'join_order(d1) hash_join(d1)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_dim1 d1
+ INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0)
+ ON d1.id = f.dim1_id OR f.dim1_id IS NULL;
+ QUERY PLAN
+---------------------------------------------------------------
+ Nested Loop
+ Join Filter: ((d1.id = f.dim1_id) OR (f.dim1_id IS NULL))
+ -> Seq Scan on jo_dim1 d1
+ -> Materialize
+ -> Merge Full Join
+ Merge Cond: (((d2.id + 0)) = ((f.dim2_id + 0)))
+ -> Sort
+ Sort Key: ((d2.id + 0))
+ -> Seq Scan on jo_dim2 d2
+ -> Sort
+ Sort Key: ((f.dim2_id + 0))
+ -> Seq Scan on jo_fact f
+ Supplied Plan Advice:
+ JOIN_ORDER(d1) /* matched, conflicting */
+ HASH_JOIN(d1) /* matched, conflicting, failed */
+ Generated Plan Advice:
+ JOIN_ORDER(d1 (d2 f))
+ MERGE_JOIN_PLAIN(f)
+ NESTED_LOOP_MATERIALIZE((f d2))
+ SEQ_SCAN(d1 d2 f)
+ NO_GATHER(d1 f d2)
+(21 rows)
+
+COMMIT;
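Read together, the cases in join_order.out exercise the JOIN_ORDER grammar: a bare list fixes the driving table and the order in which the remaining relations are joined (a prefix suffices), while a parenthesized sublist asks for that group to be planned as a unit on the inner side. As a usage sketch (not one of the tested cases, but built only from constructs shown above), the same query could be driven from d2 with f and d1 grouped:

    LOAD 'pg_plan_advice';
    -- Drive from d2, then join the (f d1) group as a unit.
    SET pg_plan_advice.advice = 'JOIN_ORDER(d2 (f d1))';
    EXPLAIN (COSTS OFF, PLAN_ADVICE)
    SELECT * FROM jo_fact f
      LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id
      LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id
     WHERE val1 = 1 AND val2 = 1;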
diff --git a/contrib/pg_plan_advice/expected/join_strategy.out b/contrib/pg_plan_advice/expected/join_strategy.out
new file mode 100644
index 00000000000..0f9db692190
--- /dev/null
+++ b/contrib/pg_plan_advice/expected/join_strategy.out
@@ -0,0 +1,339 @@
+LOAD 'pg_plan_advice';
+SET max_parallel_workers_per_gather = 0;
+CREATE TABLE join_dim (id serial primary key, dim text)
+ WITH (autovacuum_enabled = false);
+INSERT INTO join_dim (dim) SELECT random()::text FROM generate_series(1,100) g;
+VACUUM ANALYZE join_dim;
+CREATE TABLE join_fact (
+ id int primary key,
+ dim_id integer not null references join_dim (id)
+) WITH (autovacuum_enabled = false);
+INSERT INTO join_fact
+ SELECT g, (g%3)+1 FROM generate_series(1,100000) g;
+CREATE INDEX join_fact_dim_id ON join_fact (dim_id);
+VACUUM ANALYZE join_fact;
+-- We expect a hash join by default.
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+ QUERY PLAN
+------------------------------------
+ Hash Join
+ Hash Cond: (f.dim_id = d.id)
+ -> Seq Scan on join_fact f
+ -> Hash
+ -> Seq Scan on join_dim d
+ Generated Plan Advice:
+ JOIN_ORDER(f d)
+ HASH_JOIN(d)
+ SEQ_SCAN(f d)
+ NO_GATHER(f d)
+(10 rows)
+
+-- Try forcing each join method in turn with join_dim as the inner table.
+-- All of these should work except for MERGE_JOIN_MATERIALIZE; that will
+-- fail, because the planner knows that join_dim (id) is unique, so no
+-- mark/restore is needed and a Materialize node would be pure overhead.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'HASH_JOIN(d)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+ QUERY PLAN
+------------------------------------
+ Hash Join
+ Hash Cond: (f.dim_id = d.id)
+ -> Seq Scan on join_fact f
+ -> Hash
+ -> Seq Scan on join_dim d
+ Supplied Plan Advice:
+ HASH_JOIN(d) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(f d)
+ HASH_JOIN(d)
+ SEQ_SCAN(f d)
+ NO_GATHER(f d)
+(12 rows)
+
+SET LOCAL pg_plan_advice.advice = 'MERGE_JOIN_MATERIALIZE(d)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+ QUERY PLAN
+----------------------------------------------------------------
+ Merge Join
+ Disabled: true
+ Merge Cond: (f.dim_id = d.id)
+ -> Index Scan using join_fact_dim_id on join_fact f
+ -> Index Scan using join_dim_pkey on join_dim d
+ Supplied Plan Advice:
+ MERGE_JOIN_MATERIALIZE(d) /* matched, failed */
+ Generated Plan Advice:
+ JOIN_ORDER(f d)
+ MERGE_JOIN_PLAIN(d)
+ INDEX_SCAN(f public.join_fact_dim_id d public.join_dim_pkey)
+ NO_GATHER(f d)
+(12 rows)
+
+SET LOCAL pg_plan_advice.advice = 'MERGE_JOIN_PLAIN(d)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+ QUERY PLAN
+----------------------------------------------------------------
+ Merge Join
+ Merge Cond: (f.dim_id = d.id)
+ -> Index Scan using join_fact_dim_id on join_fact f
+ -> Index Scan using join_dim_pkey on join_dim d
+ Supplied Plan Advice:
+ MERGE_JOIN_PLAIN(d) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(f d)
+ MERGE_JOIN_PLAIN(d)
+ INDEX_SCAN(f public.join_fact_dim_id d public.join_dim_pkey)
+ NO_GATHER(f d)
+(11 rows)
+
+SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_MATERIALIZE(d)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+ QUERY PLAN
+--------------------------------------------
+ Nested Loop
+ Join Filter: (f.dim_id = d.id)
+ -> Seq Scan on join_fact f
+ -> Materialize
+ -> Seq Scan on join_dim d
+ Supplied Plan Advice:
+ NESTED_LOOP_MATERIALIZE(d) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(f d)
+ NESTED_LOOP_MATERIALIZE(d)
+ SEQ_SCAN(f d)
+ NO_GATHER(f d)
+(12 rows)
+
+SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_MEMOIZE(d)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+ QUERY PLAN
+----------------------------------------------------------
+ Nested Loop
+ -> Seq Scan on join_fact f
+ -> Memoize
+ Cache Key: f.dim_id
+ Cache Mode: logical
+ -> Index Scan using join_dim_pkey on join_dim d
+ Index Cond: (id = f.dim_id)
+ Supplied Plan Advice:
+ NESTED_LOOP_MEMOIZE(d) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(f d)
+ NESTED_LOOP_MEMOIZE(d)
+ SEQ_SCAN(f)
+ INDEX_SCAN(d public.join_dim_pkey)
+ NO_GATHER(f d)
+(15 rows)
+
+SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_PLAIN(d)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+ QUERY PLAN
+----------------------------------------------------
+ Nested Loop
+ -> Seq Scan on join_fact f
+ -> Index Scan using join_dim_pkey on join_dim d
+ Index Cond: (id = f.dim_id)
+ Supplied Plan Advice:
+ NESTED_LOOP_PLAIN(d) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(f d)
+ NESTED_LOOP_PLAIN(d)
+ SEQ_SCAN(f)
+ INDEX_SCAN(d public.join_dim_pkey)
+ NO_GATHER(f d)
+(12 rows)
+
+COMMIT;
+-- Now try forcing each join method in turn with join_fact as the inner
+-- table. All of these should work.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'HASH_JOIN(f)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+ QUERY PLAN
+-------------------------------------
+ Hash Join
+ Hash Cond: (d.id = f.dim_id)
+ -> Seq Scan on join_dim d
+ -> Hash
+ -> Seq Scan on join_fact f
+ Supplied Plan Advice:
+ HASH_JOIN(f) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(d f)
+ HASH_JOIN(f)
+ SEQ_SCAN(d f)
+ NO_GATHER(f d)
+(12 rows)
+
+SET LOCAL pg_plan_advice.advice = 'MERGE_JOIN_MATERIALIZE(f)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+ QUERY PLAN
+----------------------------------------------------------------
+ Merge Join
+ Merge Cond: (d.id = f.dim_id)
+ -> Index Scan using join_dim_pkey on join_dim d
+ -> Materialize
+ -> Index Scan using join_fact_dim_id on join_fact f
+ Supplied Plan Advice:
+ MERGE_JOIN_MATERIALIZE(f) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(d f)
+ MERGE_JOIN_MATERIALIZE(f)
+ INDEX_SCAN(d public.join_dim_pkey f public.join_fact_dim_id)
+ NO_GATHER(f d)
+(12 rows)
+
+SET LOCAL pg_plan_advice.advice = 'MERGE_JOIN_PLAIN(f)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+ QUERY PLAN
+----------------------------------------------------------------
+ Merge Join
+ Merge Cond: (d.id = f.dim_id)
+ -> Index Scan using join_dim_pkey on join_dim d
+ -> Index Scan using join_fact_dim_id on join_fact f
+ Supplied Plan Advice:
+ MERGE_JOIN_PLAIN(f) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(d f)
+ MERGE_JOIN_PLAIN(f)
+ INDEX_SCAN(d public.join_dim_pkey f public.join_fact_dim_id)
+ NO_GATHER(f d)
+(11 rows)
+
+SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_MATERIALIZE(f)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+ QUERY PLAN
+--------------------------------------------
+ Nested Loop
+ Join Filter: (f.dim_id = d.id)
+ -> Seq Scan on join_dim d
+ -> Materialize
+ -> Seq Scan on join_fact f
+ Supplied Plan Advice:
+ NESTED_LOOP_MATERIALIZE(f) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(d f)
+ NESTED_LOOP_MATERIALIZE(f)
+ SEQ_SCAN(d f)
+ NO_GATHER(f d)
+(12 rows)
+
+SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_MEMOIZE(f)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+ QUERY PLAN
+--------------------------------------------------------------
+ Nested Loop
+ -> Seq Scan on join_dim d
+ -> Memoize
+ Cache Key: d.id
+ Cache Mode: logical
+ -> Index Scan using join_fact_dim_id on join_fact f
+ Index Cond: (dim_id = d.id)
+ Supplied Plan Advice:
+ NESTED_LOOP_MEMOIZE(f) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(d f)
+ NESTED_LOOP_MEMOIZE(f)
+ SEQ_SCAN(d)
+ INDEX_SCAN(f public.join_fact_dim_id)
+ NO_GATHER(f d)
+(15 rows)
+
+SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_PLAIN(f)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+ QUERY PLAN
+--------------------------------------------------------
+ Nested Loop
+ -> Seq Scan on join_dim d
+ -> Index Scan using join_fact_dim_id on join_fact f
+ Index Cond: (dim_id = d.id)
+ Supplied Plan Advice:
+ NESTED_LOOP_PLAIN(f) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(d f)
+ NESTED_LOOP_PLAIN(f)
+ SEQ_SCAN(d)
+ INDEX_SCAN(f public.join_fact_dim_id)
+ NO_GATHER(f d)
+(12 rows)
+
+COMMIT;
+-- Non-working cases. We can't force a foreign join between these tables,
+-- because they aren't foreign tables. We also can't use two different
+-- strategies on the same table, nor can we put both tables on the inner
+-- side of the same join.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'FOREIGN_JOIN((f d))';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+ QUERY PLAN
+----------------------------------------------------
+ Nested Loop
+ Disabled: true
+ -> Seq Scan on join_fact f
+ -> Index Scan using join_dim_pkey on join_dim d
+ Index Cond: (id = f.dim_id)
+ Supplied Plan Advice:
+ FOREIGN_JOIN((f d)) /* matched, failed */
+ Generated Plan Advice:
+ JOIN_ORDER(f d)
+ NESTED_LOOP_PLAIN(d)
+ SEQ_SCAN(f)
+ INDEX_SCAN(d public.join_dim_pkey)
+ NO_GATHER(f d)
+(13 rows)
+
+SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_PLAIN(f) NESTED_LOOP_MATERIALIZE(f)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+ QUERY PLAN
+-----------------------------------------------------------------
+ Merge Join
+ Merge Cond: (d.id = f.dim_id)
+ -> Index Scan using join_dim_pkey on join_dim d
+ -> Index Scan using join_fact_dim_id on join_fact f
+ Supplied Plan Advice:
+ NESTED_LOOP_PLAIN(f) /* matched, conflicting, failed */
+ NESTED_LOOP_MATERIALIZE(f) /* matched, conflicting, failed */
+ Generated Plan Advice:
+ JOIN_ORDER(d f)
+ MERGE_JOIN_PLAIN(f)
+ INDEX_SCAN(d public.join_dim_pkey f public.join_fact_dim_id)
+ NO_GATHER(f d)
+(12 rows)
+
+SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_PLAIN(f d)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+ QUERY PLAN
+----------------------------------------------------
+ Nested Loop
+ Disabled: true
+ -> Seq Scan on join_fact f
+ -> Index Scan using join_dim_pkey on join_dim d
+ Index Cond: (id = f.dim_id)
+ Supplied Plan Advice:
+ NESTED_LOOP_PLAIN(f) /* matched, failed */
+ NESTED_LOOP_PLAIN(d) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(f d)
+ NESTED_LOOP_PLAIN(d)
+ SEQ_SCAN(f)
+ INDEX_SCAN(d public.join_dim_pkey)
+ NO_GATHER(f d)
+(14 rows)
+
+COMMIT;
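Each join-method keyword names the relation placed on the inner side of its join, and join_order.out already combined JOIN_ORDER with HASH_JOIN in one advice string, so method advice should likewise compose with scan advice. A sketch under that assumption, reusing the schema above:

    SET pg_plan_advice.advice =
        'MERGE_JOIN_PLAIN(f) INDEX_SCAN(d public.join_dim_pkey)';
    EXPLAIN (COSTS OFF, PLAN_ADVICE)
      SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;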
diff --git a/contrib/pg_plan_advice/expected/local_collector.out b/contrib/pg_plan_advice/expected/local_collector.out
new file mode 100644
index 00000000000..f2adef39ed8
--- /dev/null
+++ b/contrib/pg_plan_advice/expected/local_collector.out
@@ -0,0 +1,67 @@
+CREATE EXTENSION pg_plan_advice;
+SET debug_parallel_query = off;
+-- Try clearing advice before we've collected any.
+SELECT pg_clear_collected_local_advice();
+ pg_clear_collected_local_advice
+---------------------------------
+
+(1 row)
+
+-- Set a small advice collection limit so that we'll exceed it.
+SET pg_plan_advice.local_collection_limit = 2;
+-- Enable the collector.
+SET pg_plan_advice.local_collector = on;
+-- Set up a dummy table.
+CREATE TABLE dummy_table (a int primary key, b text)
+ WITH (autovacuum_enabled = false, parallel_workers = 0);
+-- Test queries.
+SELECT * FROM dummy_table a, dummy_table b;
+ a | b | a | b
+---+---+---+---
+(0 rows)
+
+SELECT * FROM dummy_table;
+ a | b
+---+---
+(0 rows)
+
+-- Should return the advice from the second test query.
+SELECT advice FROM pg_get_collected_local_advice() ORDER BY id DESC LIMIT 1;
+ advice
+------------------------
+ SEQ_SCAN(dummy_table) +
+ NO_GATHER(dummy_table)
+(1 row)
+
+-- Now try clearing advice again.
+SELECT pg_clear_collected_local_advice();
+ pg_clear_collected_local_advice
+---------------------------------
+
+(1 row)
+
+-- Raise the collection limit so that the collector uses multiple chunks.
+SET pg_plan_advice.local_collection_limit = 2000;
+-- Push a bunch of queries through the collector.
+DO $$
+BEGIN
+ FOR x IN 1..2000 LOOP
+ EXECUTE 'SELECT * FROM dummy_table';
+ END LOOP;
+END
+$$;
+-- Check that the collector worked.
+SELECT COUNT(*) FROM pg_get_collected_local_advice();
+ count
+-------
+ 2000
+(1 row)
+
+-- And clear one more time, to verify that this doesn't cause a problem
+-- even with a larger number of entries.
+SELECT pg_clear_collected_local_advice();
+ pg_clear_collected_local_advice
+---------------------------------
+
+(1 row)
+
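These tests imply a capture-and-replay workflow: switch the collector on, run the workload, read the stored advice back, and hand the string to pg_plan_advice.advice to pin the plan. A sketch using only the functions and GUCs exercised above:

    SET pg_plan_advice.local_collector = on;
    SELECT * FROM dummy_table;                        -- workload to capture
    SELECT id, advice FROM pg_get_collected_local_advice() ORDER BY id;
    -- Feed a collected string back to pin the plan on later runs.
    SET pg_plan_advice.advice = 'SEQ_SCAN(dummy_table) NO_GATHER(dummy_table)';
    SELECT pg_clear_collected_local_advice();         -- discard captured entries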
diff --git a/contrib/pg_plan_advice/expected/partitionwise.out b/contrib/pg_plan_advice/expected/partitionwise.out
new file mode 100644
index 00000000000..2b3d0a82443
--- /dev/null
+++ b/contrib/pg_plan_advice/expected/partitionwise.out
@@ -0,0 +1,426 @@
+LOAD 'pg_plan_advice';
+SET max_parallel_workers_per_gather = 0;
+SET enable_partitionwise_join = true;
+CREATE TABLE pt1 (id integer primary key, dim1 text, val1 int)
+ PARTITION BY RANGE (id);
+CREATE TABLE pt1a PARTITION OF pt1 FOR VALUES FROM (1) to (1001)
+ WITH (autovacuum_enabled = false);
+CREATE TABLE pt1b PARTITION OF pt1 FOR VALUES FROM (1001) to (2001)
+ WITH (autovacuum_enabled = false);
+CREATE TABLE pt1c PARTITION OF pt1 FOR VALUES FROM (2001) to (3001)
+ WITH (autovacuum_enabled = false);
+INSERT INTO pt1 (id, dim1, val1)
+ SELECT g, 'some filler text ' || g, (g % 3) + 1
+ FROM generate_series(1,3000) g;
+VACUUM ANALYZE pt1;
+CREATE TABLE pt2 (id integer primary key, dim2 text, val2 int)
+ PARTITION BY RANGE (id);
+CREATE TABLE pt2a PARTITION OF pt2 FOR VALUES FROM (1) to (1001)
+ WITH (autovacuum_enabled = false);
+CREATE TABLE pt2b PARTITION OF pt2 FOR VALUES FROM (1001) to (2001)
+ WITH (autovacuum_enabled = false);
+CREATE TABLE pt2c PARTITION OF pt2 FOR VALUES FROM (2001) to (3001)
+ WITH (autovacuum_enabled = false);
+INSERT INTO pt2 (id, dim2, val2)
+ SELECT g, 'some other text ' || g, (g % 5) + 1
+ FROM generate_series(1,3000,2) g;
+VACUUM ANALYZE pt2;
+CREATE TABLE pt3 (id integer primary key, dim3 text, val3 int)
+ PARTITION BY RANGE (id);
+CREATE TABLE pt3a PARTITION OF pt3 FOR VALUES FROM (1) to (1001)
+ WITH (autovacuum_enabled = false);
+CREATE TABLE pt3b PARTITION OF pt3 FOR VALUES FROM (1001) to (2001)
+ WITH (autovacuum_enabled = false);
+CREATE TABLE pt3c PARTITION OF pt3 FOR VALUES FROM (2001) to (3001)
+ WITH (autovacuum_enabled = false);
+INSERT INTO pt3 (id, dim3, val3)
+ SELECT g, 'a third random text ' || g, (g % 7) + 1
+ FROM generate_series(1,3000,3) g;
+VACUUM ANALYZE pt3;
+CREATE TABLE ptmismatch (id integer primary key, dimm text, valm int)
+ PARTITION BY RANGE (id);
+CREATE TABLE ptmismatcha PARTITION OF ptmismatch
+ FOR VALUES FROM (1) to (1501)
+ WITH (autovacuum_enabled = false);
+CREATE TABLE ptmismatchb PARTITION OF ptmismatch
+ FOR VALUES FROM (1501) to (3001)
+ WITH (autovacuum_enabled = false);
+INSERT INTO ptmismatch (id, dimm, valm)
+ SELECT g, 'yet another text ' || g, (g % 2) + 1
+ FROM generate_series(1,3000) g;
+VACUUM ANALYZE ptmismatch;
+EXPLAIN (PLAN_ADVICE, COSTS OFF)
+SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id
+ AND val1 = 1 AND val2 = 1 AND val3 = 1;
+ QUERY PLAN
+-------------------------------------------------------------------------------------
+ Append
+ -> Nested Loop
+ -> Hash Join
+ Hash Cond: (pt2_1.id = pt3_1.id)
+ -> Seq Scan on pt2a pt2_1
+ Filter: (val2 = 1)
+ -> Hash
+ -> Seq Scan on pt3a pt3_1
+ Filter: (val3 = 1)
+ -> Index Scan using pt1a_pkey on pt1a pt1_1
+ Index Cond: (id = pt2_1.id)
+ Filter: (val1 = 1)
+ -> Nested Loop
+ -> Hash Join
+ Hash Cond: (pt2_2.id = pt3_2.id)
+ -> Seq Scan on pt2b pt2_2
+ Filter: (val2 = 1)
+ -> Hash
+ -> Seq Scan on pt3b pt3_2
+ Filter: (val3 = 1)
+ -> Index Scan using pt1b_pkey on pt1b pt1_2
+ Index Cond: (id = pt2_2.id)
+ Filter: (val1 = 1)
+ -> Nested Loop
+ -> Hash Join
+ Hash Cond: (pt2_3.id = pt3_3.id)
+ -> Seq Scan on pt2c pt2_3
+ Filter: (val2 = 1)
+ -> Hash
+ -> Seq Scan on pt3c pt3_3
+ Filter: (val3 = 1)
+ -> Index Scan using pt1c_pkey on pt1c pt1_3
+ Index Cond: (id = pt2_3.id)
+ Filter: (val1 = 1)
+ Generated Plan Advice:
+ JOIN_ORDER(pt2/public.pt2a pt3/public.pt3a pt1/public.pt1a)
+ JOIN_ORDER(pt2/public.pt2b pt3/public.pt3b pt1/public.pt1b)
+ JOIN_ORDER(pt2/public.pt2c pt3/public.pt3c pt1/public.pt1c)
+ NESTED_LOOP_PLAIN(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c)
+ HASH_JOIN(pt3/public.pt3a pt3/public.pt3b pt3/public.pt3c)
+ SEQ_SCAN(pt2/public.pt2a pt3/public.pt3a pt2/public.pt2b pt3/public.pt3b
+ pt2/public.pt2c pt3/public.pt3c)
+ INDEX_SCAN(pt1/public.pt1a public.pt1a_pkey pt1/public.pt1b public.pt1b_pkey
+ pt1/public.pt1c public.pt1c_pkey)
+ PARTITIONWISE((pt1 pt2 pt3))
+ NO_GATHER(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c pt2/public.pt2a
+ pt2/public.pt2b pt2/public.pt2c pt3/public.pt3a pt3/public.pt3b pt3/public.pt3c)
+(47 rows)
+
+-- Suppress partitionwise join, or do it just partially.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'PARTITIONWISE(pt1 pt2 pt3)';
+EXPLAIN (PLAN_ADVICE, COSTS OFF)
+SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id
+ AND val1 = 1 AND val2 = 1 AND val3 = 1;
+ QUERY PLAN
+-------------------------------------------------------------------------------------
+ Nested Loop
+ -> Hash Join
+ Hash Cond: (pt2.id = pt3.id)
+ -> Append
+ -> Seq Scan on pt2a pt2_1
+ Filter: (val2 = 1)
+ -> Seq Scan on pt2b pt2_2
+ Filter: (val2 = 1)
+ -> Seq Scan on pt2c pt2_3
+ Filter: (val2 = 1)
+ -> Hash
+ -> Append
+ -> Seq Scan on pt3a pt3_1
+ Filter: (val3 = 1)
+ -> Seq Scan on pt3b pt3_2
+ Filter: (val3 = 1)
+ -> Seq Scan on pt3c pt3_3
+ Filter: (val3 = 1)
+ -> Append
+ -> Index Scan using pt1a_pkey on pt1a pt1_1
+ Index Cond: (id = pt2.id)
+ Filter: (val1 = 1)
+ -> Index Scan using pt1b_pkey on pt1b pt1_2
+ Index Cond: (id = pt2.id)
+ Filter: (val1 = 1)
+ -> Index Scan using pt1c_pkey on pt1c pt1_3
+ Index Cond: (id = pt2.id)
+ Filter: (val1 = 1)
+ Supplied Plan Advice:
+ PARTITIONWISE(pt1) /* matched */
+ PARTITIONWISE(pt2) /* matched */
+ PARTITIONWISE(pt3) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(pt2 pt3 pt1)
+ NESTED_LOOP_PLAIN(pt1)
+ HASH_JOIN(pt3)
+ SEQ_SCAN(pt2/public.pt2a pt2/public.pt2b pt2/public.pt2c pt3/public.pt3a
+ pt3/public.pt3b pt3/public.pt3c)
+ INDEX_SCAN(pt1/public.pt1a public.pt1a_pkey pt1/public.pt1b public.pt1b_pkey
+ pt1/public.pt1c public.pt1c_pkey)
+ PARTITIONWISE(pt2 pt3 pt1)
+ NO_GATHER(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c pt2/public.pt2a
+ pt2/public.pt2b pt2/public.pt2c pt3/public.pt3a pt3/public.pt3b pt3/public.pt3c)
+(43 rows)
+
+SET LOCAL pg_plan_advice.advice = 'PARTITIONWISE((pt1 pt2) pt3)';
+EXPLAIN (PLAN_ADVICE, COSTS OFF)
+SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id
+ AND val1 = 1 AND val2 = 1 AND val3 = 1;
+ QUERY PLAN
+-------------------------------------------------------------------------------------
+ Hash Join
+ Hash Cond: (pt1.id = pt3.id)
+ -> Append
+ -> Hash Join
+ Hash Cond: (pt1_1.id = pt2_1.id)
+ -> Seq Scan on pt1a pt1_1
+ Filter: (val1 = 1)
+ -> Hash
+ -> Seq Scan on pt2a pt2_1
+ Filter: (val2 = 1)
+ -> Hash Join
+ Hash Cond: (pt1_2.id = pt2_2.id)
+ -> Seq Scan on pt1b pt1_2
+ Filter: (val1 = 1)
+ -> Hash
+ -> Seq Scan on pt2b pt2_2
+ Filter: (val2 = 1)
+ -> Hash Join
+ Hash Cond: (pt1_3.id = pt2_3.id)
+ -> Seq Scan on pt1c pt1_3
+ Filter: (val1 = 1)
+ -> Hash
+ -> Seq Scan on pt2c pt2_3
+ Filter: (val2 = 1)
+ -> Hash
+ -> Append
+ -> Seq Scan on pt3a pt3_1
+ Filter: (val3 = 1)
+ -> Seq Scan on pt3b pt3_2
+ Filter: (val3 = 1)
+ -> Seq Scan on pt3c pt3_3
+ Filter: (val3 = 1)
+ Supplied Plan Advice:
+ PARTITIONWISE((pt1 pt2)) /* matched */
+ PARTITIONWISE(pt3) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(pt1/public.pt1a pt2/public.pt2a)
+ JOIN_ORDER(pt1/public.pt1b pt2/public.pt2b)
+ JOIN_ORDER(pt1/public.pt1c pt2/public.pt2c)
+ JOIN_ORDER({pt1 pt2} pt3)
+ HASH_JOIN(pt2/public.pt2a pt2/public.pt2b pt2/public.pt2c pt3)
+ SEQ_SCAN(pt1/public.pt1a pt2/public.pt2a pt1/public.pt1b pt2/public.pt2b
+ pt1/public.pt1c pt2/public.pt2c pt3/public.pt3a pt3/public.pt3b
+ pt3/public.pt3c)
+ PARTITIONWISE((pt1 pt2) pt3)
+ NO_GATHER(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c pt2/public.pt2a
+ pt2/public.pt2b pt2/public.pt2c pt3/public.pt3a pt3/public.pt3b pt3/public.pt3c)
+(47 rows)
+
+COMMIT;
+-- Test conflicting advice.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'PARTITIONWISE((pt1 pt2) (pt1 pt3))';
+EXPLAIN (PLAN_ADVICE, COSTS OFF)
+SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id
+ AND val1 = 1 AND val2 = 1 AND val3 = 1;
+ QUERY PLAN
+-------------------------------------------------------------------------------------
+ Append
+ Disabled: true
+ -> Nested Loop
+ -> Hash Join
+ Hash Cond: (pt2_1.id = pt3_1.id)
+ -> Seq Scan on pt2a pt2_1
+ Filter: (val2 = 1)
+ -> Hash
+ -> Seq Scan on pt3a pt3_1
+ Filter: (val3 = 1)
+ -> Index Scan using pt1a_pkey on pt1a pt1_1
+ Index Cond: (id = pt2_1.id)
+ Filter: (val1 = 1)
+ -> Nested Loop
+ -> Hash Join
+ Hash Cond: (pt2_2.id = pt3_2.id)
+ -> Seq Scan on pt2b pt2_2
+ Filter: (val2 = 1)
+ -> Hash
+ -> Seq Scan on pt3b pt3_2
+ Filter: (val3 = 1)
+ -> Index Scan using pt1b_pkey on pt1b pt1_2
+ Index Cond: (id = pt2_2.id)
+ Filter: (val1 = 1)
+ -> Nested Loop
+ -> Hash Join
+ Hash Cond: (pt2_3.id = pt3_3.id)
+ -> Seq Scan on pt2c pt2_3
+ Filter: (val2 = 1)
+ -> Hash
+ -> Seq Scan on pt3c pt3_3
+ Filter: (val3 = 1)
+ -> Index Scan using pt1c_pkey on pt1c pt1_3
+ Index Cond: (id = pt2_3.id)
+ Filter: (val1 = 1)
+ Supplied Plan Advice:
+ PARTITIONWISE((pt1 pt2)) /* matched, conflicting, failed */
+ PARTITIONWISE((pt1 pt3)) /* matched, conflicting, failed */
+ Generated Plan Advice:
+ JOIN_ORDER(pt2/public.pt2a pt3/public.pt3a pt1/public.pt1a)
+ JOIN_ORDER(pt2/public.pt2b pt3/public.pt3b pt1/public.pt1b)
+ JOIN_ORDER(pt2/public.pt2c pt3/public.pt3c pt1/public.pt1c)
+ NESTED_LOOP_PLAIN(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c)
+ HASH_JOIN(pt3/public.pt3a pt3/public.pt3b pt3/public.pt3c)
+ SEQ_SCAN(pt2/public.pt2a pt3/public.pt3a pt2/public.pt2b pt3/public.pt3b
+ pt2/public.pt2c pt3/public.pt3c)
+ INDEX_SCAN(pt1/public.pt1a public.pt1a_pkey pt1/public.pt1b public.pt1b_pkey
+ pt1/public.pt1c public.pt1c_pkey)
+ PARTITIONWISE((pt1 pt2 pt3))
+ NO_GATHER(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c pt2/public.pt2a
+ pt2/public.pt2b pt2/public.pt2c pt3/public.pt3a pt3/public.pt3b pt3/public.pt3c)
+(51 rows)
+
+COMMIT;
+-- Can't force a partitionwise join with a mismatched table.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'PARTITIONWISE((pt1 ptmismatch))';
+EXPLAIN (PLAN_ADVICE, COSTS OFF)
+SELECT * FROM pt1, ptmismatch WHERE pt1.id = ptmismatch.id;
+ QUERY PLAN
+---------------------------------------------------------------------------
+ Nested Loop
+ Disabled: true
+ -> Append
+ -> Seq Scan on pt1a pt1_1
+ -> Seq Scan on pt1b pt1_2
+ -> Seq Scan on pt1c pt1_3
+ -> Append
+ -> Index Scan using ptmismatcha_pkey on ptmismatcha ptmismatch_1
+ Index Cond: (id = pt1.id)
+ -> Index Scan using ptmismatchb_pkey on ptmismatchb ptmismatch_2
+ Index Cond: (id = pt1.id)
+ Supplied Plan Advice:
+ PARTITIONWISE((pt1 ptmismatch)) /* matched, failed */
+ Generated Plan Advice:
+ JOIN_ORDER(pt1 ptmismatch)
+ NESTED_LOOP_PLAIN(ptmismatch)
+ SEQ_SCAN(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c)
+ INDEX_SCAN(ptmismatch/public.ptmismatcha public.ptmismatcha_pkey
+ ptmismatch/public.ptmismatchb public.ptmismatchb_pkey)
+ PARTITIONWISE(pt1 ptmismatch)
+ NO_GATHER(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c
+ ptmismatch/public.ptmismatcha ptmismatch/public.ptmismatchb)
+(22 rows)
+
+COMMIT;
+-- Force join order for a particular branch of the partitionwise join with
+-- and without mentioning the schema name.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'JOIN_ORDER(pt3/public.pt3a pt2/public.pt2a pt1/public.pt1a)';
+EXPLAIN (PLAN_ADVICE, COSTS OFF)
+SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id
+ AND val1 = 1 AND val2 = 1 AND val3 = 1;
+ QUERY PLAN
+-------------------------------------------------------------------------------------
+ Append
+ -> Nested Loop
+ -> Hash Join
+ Hash Cond: (pt3_1.id = pt2_1.id)
+ -> Seq Scan on pt3a pt3_1
+ Filter: (val3 = 1)
+ -> Hash
+ -> Seq Scan on pt2a pt2_1
+ Filter: (val2 = 1)
+ -> Index Scan using pt1a_pkey on pt1a pt1_1
+ Index Cond: (id = pt2_1.id)
+ Filter: (val1 = 1)
+ -> Nested Loop
+ -> Hash Join
+ Hash Cond: (pt2_2.id = pt3_2.id)
+ -> Seq Scan on pt2b pt2_2
+ Filter: (val2 = 1)
+ -> Hash
+ -> Seq Scan on pt3b pt3_2
+ Filter: (val3 = 1)
+ -> Index Scan using pt1b_pkey on pt1b pt1_2
+ Index Cond: (id = pt2_2.id)
+ Filter: (val1 = 1)
+ -> Nested Loop
+ -> Hash Join
+ Hash Cond: (pt2_3.id = pt3_3.id)
+ -> Seq Scan on pt2c pt2_3
+ Filter: (val2 = 1)
+ -> Hash
+ -> Seq Scan on pt3c pt3_3
+ Filter: (val3 = 1)
+ -> Index Scan using pt1c_pkey on pt1c pt1_3
+ Index Cond: (id = pt2_3.id)
+ Filter: (val1 = 1)
+ Supplied Plan Advice:
+ JOIN_ORDER(pt3/public.pt3a pt2/public.pt2a pt1/public.pt1a) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(pt3/public.pt3a pt2/public.pt2a pt1/public.pt1a)
+ JOIN_ORDER(pt2/public.pt2b pt3/public.pt3b pt1/public.pt1b)
+ JOIN_ORDER(pt2/public.pt2c pt3/public.pt3c pt1/public.pt1c)
+ NESTED_LOOP_PLAIN(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c)
+ HASH_JOIN(pt2/public.pt2a pt3/public.pt3b pt3/public.pt3c)
+ SEQ_SCAN(pt3/public.pt3a pt2/public.pt2a pt2/public.pt2b pt3/public.pt3b
+ pt2/public.pt2c pt3/public.pt3c)
+ INDEX_SCAN(pt1/public.pt1a public.pt1a_pkey pt1/public.pt1b public.pt1b_pkey
+ pt1/public.pt1c public.pt1c_pkey)
+ PARTITIONWISE((pt1 pt2 pt3))
+ NO_GATHER(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c pt2/public.pt2a
+ pt2/public.pt2b pt2/public.pt2c pt3/public.pt3a pt3/public.pt3b pt3/public.pt3c)
+(49 rows)
+
+SET LOCAL pg_plan_advice.advice = 'JOIN_ORDER(pt3/pt3a pt2/pt2a pt1/pt1a)';
+EXPLAIN (PLAN_ADVICE, COSTS OFF)
+SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id
+ AND val1 = 1 AND val2 = 1 AND val3 = 1;
+ QUERY PLAN
+-------------------------------------------------------------------------------------
+ Append
+ -> Nested Loop
+ -> Hash Join
+ Hash Cond: (pt3_1.id = pt2_1.id)
+ -> Seq Scan on pt3a pt3_1
+ Filter: (val3 = 1)
+ -> Hash
+ -> Seq Scan on pt2a pt2_1
+ Filter: (val2 = 1)
+ -> Index Scan using pt1a_pkey on pt1a pt1_1
+ Index Cond: (id = pt2_1.id)
+ Filter: (val1 = 1)
+ -> Nested Loop
+ -> Hash Join
+ Hash Cond: (pt2_2.id = pt3_2.id)
+ -> Seq Scan on pt2b pt2_2
+ Filter: (val2 = 1)
+ -> Hash
+ -> Seq Scan on pt3b pt3_2
+ Filter: (val3 = 1)
+ -> Index Scan using pt1b_pkey on pt1b pt1_2
+ Index Cond: (id = pt2_2.id)
+ Filter: (val1 = 1)
+ -> Nested Loop
+ -> Hash Join
+ Hash Cond: (pt2_3.id = pt3_3.id)
+ -> Seq Scan on pt2c pt2_3
+ Filter: (val2 = 1)
+ -> Hash
+ -> Seq Scan on pt3c pt3_3
+ Filter: (val3 = 1)
+ -> Index Scan using pt1c_pkey on pt1c pt1_3
+ Index Cond: (id = pt2_3.id)
+ Filter: (val1 = 1)
+ Supplied Plan Advice:
+ JOIN_ORDER(pt3/pt3a pt2/pt2a pt1/pt1a) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(pt3/public.pt3a pt2/public.pt2a pt1/public.pt1a)
+ JOIN_ORDER(pt2/public.pt2b pt3/public.pt3b pt1/public.pt1b)
+ JOIN_ORDER(pt2/public.pt2c pt3/public.pt3c pt1/public.pt1c)
+ NESTED_LOOP_PLAIN(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c)
+ HASH_JOIN(pt2/public.pt2a pt3/public.pt3b pt3/public.pt3c)
+ SEQ_SCAN(pt3/public.pt3a pt2/public.pt2a pt2/public.pt2b pt3/public.pt3b
+ pt2/public.pt2c pt3/public.pt3c)
+ INDEX_SCAN(pt1/public.pt1a public.pt1a_pkey pt1/public.pt1b public.pt1b_pkey
+ pt1/public.pt1c public.pt1c_pkey)
+ PARTITIONWISE((pt1 pt2 pt3))
+ NO_GATHER(pt1/public.pt1a pt1/public.pt1b pt1/public.pt1c pt2/public.pt2a
+ pt2/public.pt2b pt2/public.pt2c pt3/public.pt3a pt3/public.pt3b pt3/public.pt3c)
+(49 rows)
+
+COMMIT;
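The last two cases show that per-branch advice uses rel/partition notation, and that the schema qualifier on the partition name is optional on input even though generated advice always emits it. By analogy, the middle branch of the same partitionwise join could presumably be steered the same way (a sketch, not a tested case):

    SET pg_plan_advice.advice = 'JOIN_ORDER(pt3/pt3b pt2/pt2b pt1/pt1b)';
    EXPLAIN (PLAN_ADVICE, COSTS OFF)
    SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id
      AND val1 = 1 AND val2 = 1 AND val3 = 1;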
diff --git a/contrib/pg_plan_advice/expected/prepared.out b/contrib/pg_plan_advice/expected/prepared.out
new file mode 100644
index 00000000000..07a7c623659
--- /dev/null
+++ b/contrib/pg_plan_advice/expected/prepared.out
@@ -0,0 +1,67 @@
+LOAD 'pg_plan_advice';
+SET max_parallel_workers_per_gather = 0;
+CREATE TABLE ptab (id integer, val text) WITH (autovacuum_enabled = false);
+SET pg_plan_advice.always_store_advice_details = false;
+-- Not prepared, so advice should be generated.
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM ptab;
+ QUERY PLAN
+------------------------
+ Seq Scan on ptab
+ Generated Plan Advice:
+ SEQ_SCAN(ptab)
+ NO_GATHER(ptab)
+(4 rows)
+
+-- Prepared, so advice should not be generated.
+PREPARE pt1 AS SELECT * FROM ptab;
+EXPLAIN (COSTS OFF, PLAN_ADVICE) EXECUTE pt1;
+ QUERY PLAN
+------------------
+ Seq Scan on ptab
+(1 row)
+
+SET pg_plan_advice.always_store_advice_details = true;
+-- Prepared, but always_store_advice_details = true, so should show advice.
+PREPARE pt2 AS SELECT * FROM ptab;
+EXPLAIN (COSTS OFF, PLAN_ADVICE) EXECUTE pt2;
+ QUERY PLAN
+------------------------
+ Seq Scan on ptab
+ Generated Plan Advice:
+ SEQ_SCAN(ptab)
+ NO_GATHER(ptab)
+(4 rows)
+
+-- Not prepared, so feedback should be generated.
+SET pg_plan_advice.always_store_advice_details = false;
+SET pg_plan_advice.advice = 'SEQ_SCAN(ptab)';
+EXPLAIN (COSTS OFF)
+SELECT * FROM ptab;
+ QUERY PLAN
+--------------------------------
+ Seq Scan on ptab
+ Supplied Plan Advice:
+ SEQ_SCAN(ptab) /* matched */
+(3 rows)
+
+-- Prepared, so feedback should not be generated.
+PREPARE pt3 AS SELECT * FROM ptab;
+EXPLAIN (COSTS OFF) EXECUTE pt1;
+ QUERY PLAN
+------------------
+ Seq Scan on ptab
+(1 row)
+
+SET pg_plan_advice.always_store_advice_details = true;
+-- Prepared, but always_store_advice_details = true, so should show feedback.
+PREPARE pt4 AS SELECT * FROM ptab;
+EXPLAIN (COSTS OFF, PLAN_ADVICE) EXECUTE pt2;
+ QUERY PLAN
+------------------------
+ Seq Scan on ptab
+ Generated Plan Advice:
+ SEQ_SCAN(ptab)
+ NO_GATHER(ptab)
+(4 rows)
+
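The pattern in this file suggests that advice details are captured when a statement is planned, not when it is executed: a statement prepared while always_store_advice_details is off has nothing to report later, whatever EXPLAIN options are used at EXECUTE time. Hence the working recipe (a sketch) is to enable the GUC before preparing:

    SET pg_plan_advice.always_store_advice_details = true;
    PREPARE q AS SELECT * FROM ptab;
    EXPLAIN (COSTS OFF, PLAN_ADVICE) EXECUTE q;   -- details stored at prepare time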
diff --git a/contrib/pg_plan_advice/expected/scan.out b/contrib/pg_plan_advice/expected/scan.out
new file mode 100644
index 00000000000..d05ead369b4
--- /dev/null
+++ b/contrib/pg_plan_advice/expected/scan.out
@@ -0,0 +1,757 @@
+LOAD 'pg_plan_advice';
+SET max_parallel_workers_per_gather = 0;
+SET seq_page_cost = 0.1;
+SET random_page_cost = 0.1;
+SET cpu_tuple_cost = 0;
+SET cpu_index_tuple_cost = 0;
+CREATE TABLE scan_table (a int primary key, b text)
+ WITH (autovacuum_enabled = false);
+INSERT INTO scan_table
+ SELECT g, 'some text ' || g FROM generate_series(1, 100000) g;
+CREATE INDEX scan_table_b ON scan_table USING brin (b);
+VACUUM ANALYZE scan_table;
+-- Sequential scan
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table;
+ QUERY PLAN
+-------------------------
+ Seq Scan on scan_table
+ Generated Plan Advice:
+ SEQ_SCAN(scan_table)
+ NO_GATHER(scan_table)
+(4 rows)
+
+-- Index scan
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+ QUERY PLAN
+-------------------------------------------------
+ Index Scan using scan_table_pkey on scan_table
+ Index Cond: (a = 1)
+ Generated Plan Advice:
+ INDEX_SCAN(scan_table public.scan_table_pkey)
+ NO_GATHER(scan_table)
+(5 rows)
+
+-- Index-only scan
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1;
+ QUERY PLAN
+------------------------------------------------------
+ Index Only Scan using scan_table_pkey on scan_table
+ Index Cond: (a = 1)
+ Generated Plan Advice:
+ INDEX_ONLY_SCAN(scan_table public.scan_table_pkey)
+ NO_GATHER(scan_table)
+(5 rows)
+
+-- Bitmap heap scan
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table
+ WHERE b > 'some text 8';
+ QUERY PLAN
+-----------------------------------------------
+ Bitmap Heap Scan on scan_table
+ Recheck Cond: (b > 'some text 8'::text)
+ -> Bitmap Index Scan on scan_table_b
+ Index Cond: (b > 'some text 8'::text)
+ Generated Plan Advice:
+ BITMAP_HEAP_SCAN(scan_table)
+ NO_GATHER(scan_table)
+(7 rows)
+
+-- TID scan
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE ctid = '(0,1)';
+ QUERY PLAN
+-----------------------------------
+ Tid Scan on scan_table
+ TID Cond: (ctid = '(0,1)'::tid)
+ Generated Plan Advice:
+ TID_SCAN(scan_table)
+ NO_GATHER(scan_table)
+(5 rows)
+
+-- TID range scan
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table
+ WHERE ctid > '(1,1)' AND ctid < '(2,1)';
+ QUERY PLAN
+---------------------------------------------------------------
+ Tid Range Scan on scan_table
+ TID Cond: ((ctid > '(1,1)'::tid) AND (ctid < '(2,1)'::tid))
+ Generated Plan Advice:
+ TID_SCAN(scan_table)
+ NO_GATHER(scan_table)
+(5 rows)
+
+-- Try forcing each of our test queries to use the scan type they
+-- wanted to use anyway. This should succeed.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(scan_table)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table;
+ QUERY PLAN
+--------------------------------------
+ Seq Scan on scan_table
+ Supplied Plan Advice:
+ SEQ_SCAN(scan_table) /* matched */
+ Generated Plan Advice:
+ SEQ_SCAN(scan_table)
+ NO_GATHER(scan_table)
+(6 rows)
+
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+ QUERY PLAN
+--------------------------------------------------------
+ Index Scan using scan_table_pkey on scan_table
+ Index Cond: (a = 1)
+ Supplied Plan Advice:
+ INDEX_SCAN(scan_table scan_table_pkey) /* matched */
+ Generated Plan Advice:
+ INDEX_SCAN(scan_table public.scan_table_pkey)
+ NO_GATHER(scan_table)
+(7 rows)
+
+SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(scan_table scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1;
+ QUERY PLAN
+-------------------------------------------------------------
+ Index Only Scan using scan_table_pkey on scan_table
+ Index Cond: (a = 1)
+ Supplied Plan Advice:
+ INDEX_ONLY_SCAN(scan_table scan_table_pkey) /* matched */
+ Generated Plan Advice:
+ INDEX_ONLY_SCAN(scan_table public.scan_table_pkey)
+ NO_GATHER(scan_table)
+(7 rows)
+
+SET LOCAL pg_plan_advice.advice = 'BITMAP_HEAP_SCAN(scan_table)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table
+ WHERE b > 'some text 8';
+ QUERY PLAN
+-----------------------------------------------
+ Bitmap Heap Scan on scan_table
+ Recheck Cond: (b > 'some text 8'::text)
+ -> Bitmap Index Scan on scan_table_b
+ Index Cond: (b > 'some text 8'::text)
+ Supplied Plan Advice:
+ BITMAP_HEAP_SCAN(scan_table) /* matched */
+ Generated Plan Advice:
+ BITMAP_HEAP_SCAN(scan_table)
+ NO_GATHER(scan_table)
+(9 rows)
+
+SET LOCAL pg_plan_advice.advice = 'TID_SCAN(scan_table)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE ctid = '(0,1)';
+ QUERY PLAN
+--------------------------------------
+ Tid Scan on scan_table
+ TID Cond: (ctid = '(0,1)'::tid)
+ Supplied Plan Advice:
+ TID_SCAN(scan_table) /* matched */
+ Generated Plan Advice:
+ TID_SCAN(scan_table)
+ NO_GATHER(scan_table)
+(7 rows)
+
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table
+ WHERE ctid > '(1,1)' AND ctid < '(2,1)';
+ QUERY PLAN
+---------------------------------------------------------------
+ Tid Range Scan on scan_table
+ TID Cond: ((ctid > '(1,1)'::tid) AND (ctid < '(2,1)'::tid))
+ Supplied Plan Advice:
+ TID_SCAN(scan_table) /* matched */
+ Generated Plan Advice:
+ TID_SCAN(scan_table)
+ NO_GATHER(scan_table)
+(7 rows)
+
+COMMIT;
+-- Try to force a full scan of the table to use some other scan type. All
+-- of these will fail. An index scan or bitmap heap scan could potentially
+-- generate the correct answer, but the planner does not even consider these
+-- possibilities due to the lack of a WHERE clause.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table;
+ QUERY PLAN
+----------------------------------------------------------------
+ Seq Scan on scan_table
+ Disabled: true
+ Supplied Plan Advice:
+ INDEX_SCAN(scan_table scan_table_pkey) /* matched, failed */
+ Generated Plan Advice:
+ SEQ_SCAN(scan_table)
+ NO_GATHER(scan_table)
+(7 rows)
+
+SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(scan_table scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table;
+ QUERY PLAN
+---------------------------------------------------------------------
+ Seq Scan on scan_table
+ Disabled: true
+ Supplied Plan Advice:
+ INDEX_ONLY_SCAN(scan_table scan_table_pkey) /* matched, failed */
+ Generated Plan Advice:
+ SEQ_SCAN(scan_table)
+ NO_GATHER(scan_table)
+(7 rows)
+
+SET LOCAL pg_plan_advice.advice = 'BITMAP_HEAP_SCAN(scan_table)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table;
+ QUERY PLAN
+------------------------------------------------------
+ Seq Scan on scan_table
+ Disabled: true
+ Supplied Plan Advice:
+ BITMAP_HEAP_SCAN(scan_table) /* matched, failed */
+ Generated Plan Advice:
+ SEQ_SCAN(scan_table)
+ NO_GATHER(scan_table)
+(7 rows)
+
+SET LOCAL pg_plan_advice.advice = 'TID_SCAN(scan_table)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table;
+ QUERY PLAN
+----------------------------------------------
+ Seq Scan on scan_table
+ Disabled: true
+ Supplied Plan Advice:
+ TID_SCAN(scan_table) /* matched, failed */
+ Generated Plan Advice:
+ SEQ_SCAN(scan_table)
+ NO_GATHER(scan_table)
+(7 rows)
+
+COMMIT;
+-- Try again to force index use. This should now succeed for the INDEX_SCAN
+-- and BITMAP_HEAP_SCAN, but the INDEX_ONLY_SCAN can't be forced because the
+-- query fetches columns not included in the index.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a > 0;
+ QUERY PLAN
+--------------------------------------------------------
+ Index Scan using scan_table_pkey on scan_table
+ Index Cond: (a > 0)
+ Supplied Plan Advice:
+ INDEX_SCAN(scan_table scan_table_pkey) /* matched */
+ Generated Plan Advice:
+ INDEX_SCAN(scan_table public.scan_table_pkey)
+ NO_GATHER(scan_table)
+(7 rows)
+
+SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(scan_table scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a > 0;
+ QUERY PLAN
+---------------------------------------------------------------------
+ Seq Scan on scan_table
+ Disabled: true
+ Filter: (a > 0)
+ Supplied Plan Advice:
+ INDEX_ONLY_SCAN(scan_table scan_table_pkey) /* matched, failed */
+ Generated Plan Advice:
+ SEQ_SCAN(scan_table)
+ NO_GATHER(scan_table)
+(8 rows)
+
+SET LOCAL pg_plan_advice.advice = 'BITMAP_HEAP_SCAN(scan_table)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a > 0;
+ QUERY PLAN
+----------------------------------------------
+ Bitmap Heap Scan on scan_table
+ Recheck Cond: (a > 0)
+ -> Bitmap Index Scan on scan_table_pkey
+ Index Cond: (a > 0)
+ Supplied Plan Advice:
+ BITMAP_HEAP_SCAN(scan_table) /* matched */
+ Generated Plan Advice:
+ BITMAP_HEAP_SCAN(scan_table)
+ NO_GATHER(scan_table)
+(9 rows)
+
+COMMIT;
+-- We can force a primary key lookup to use a sequential scan, but we
+-- can't force it to use an index-only scan (due to the column list)
+-- or a TID scan (due to the absence of a TID qual).
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(scan_table)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+ QUERY PLAN
+--------------------------------------
+ Seq Scan on scan_table
+ Filter: (a = 1)
+ Supplied Plan Advice:
+ SEQ_SCAN(scan_table) /* matched */
+ Generated Plan Advice:
+ SEQ_SCAN(scan_table)
+ NO_GATHER(scan_table)
+(7 rows)
+
+SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(scan_table scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+ QUERY PLAN
+---------------------------------------------------------------------
+ Index Scan using scan_table_pkey on scan_table
+ Disabled: true
+ Index Cond: (a = 1)
+ Supplied Plan Advice:
+ INDEX_ONLY_SCAN(scan_table scan_table_pkey) /* matched, failed */
+ Generated Plan Advice:
+ INDEX_SCAN(scan_table public.scan_table_pkey)
+ NO_GATHER(scan_table)
+(8 rows)
+
+SET LOCAL pg_plan_advice.advice = 'TID_SCAN(scan_table)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+ QUERY PLAN
+-------------------------------------------------
+ Index Scan using scan_table_pkey on scan_table
+ Disabled: true
+ Index Cond: (a = 1)
+ Supplied Plan Advice:
+ TID_SCAN(scan_table) /* matched, failed */
+ Generated Plan Advice:
+ INDEX_SCAN(scan_table public.scan_table_pkey)
+ NO_GATHER(scan_table)
+(8 rows)
+
+COMMIT;
+-- We can forcibly downgrade an index-only scan to an index scan, but we can't
+-- force the use of an index that the planner thinks is inapplicable.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+ QUERY PLAN
+--------------------------------------------------------
+ Index Scan using scan_table_pkey on scan_table
+ Index Cond: (a = 1)
+ Supplied Plan Advice:
+ INDEX_SCAN(scan_table scan_table_pkey) /* matched */
+ Generated Plan Advice:
+ INDEX_SCAN(scan_table public.scan_table_pkey)
+ NO_GATHER(scan_table)
+(7 rows)
+
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table public.scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+ QUERY PLAN
+---------------------------------------------------------------
+ Index Scan using scan_table_pkey on scan_table
+ Index Cond: (a = 1)
+ Supplied Plan Advice:
+ INDEX_SCAN(scan_table public.scan_table_pkey) /* matched */
+ Generated Plan Advice:
+ INDEX_SCAN(scan_table public.scan_table_pkey)
+ NO_GATHER(scan_table)
+(7 rows)
+
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_b)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+ QUERY PLAN
+-------------------------------------------------------------
+ Index Scan using scan_table_pkey on scan_table
+ Disabled: true
+ Index Cond: (a = 1)
+ Supplied Plan Advice:
+ INDEX_SCAN(scan_table scan_table_b) /* matched, failed */
+ Generated Plan Advice:
+ INDEX_SCAN(scan_table public.scan_table_pkey)
+ NO_GATHER(scan_table)
+(8 rows)
+
+COMMIT;
+-- We can force the use of a sequential scan in place of a bitmap heap scan,
+-- but a plain index scan on a BRIN index is not possible.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(scan_table)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table
+ WHERE b > 'some text 8';
+ QUERY PLAN
+--------------------------------------
+ Seq Scan on scan_table
+ Filter: (b > 'some text 8'::text)
+ Supplied Plan Advice:
+ SEQ_SCAN(scan_table) /* matched */
+ Generated Plan Advice:
+ SEQ_SCAN(scan_table)
+ NO_GATHER(scan_table)
+(7 rows)
+
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_b)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+ QUERY PLAN
+-------------------------------------------------------------
+ Index Scan using scan_table_pkey on scan_table
+ Disabled: true
+ Index Cond: (a = 1)
+ Supplied Plan Advice:
+ INDEX_SCAN(scan_table scan_table_b) /* matched, failed */
+ Generated Plan Advice:
+ INDEX_SCAN(scan_table public.scan_table_pkey)
+ NO_GATHER(scan_table)
+(8 rows)
+
+COMMIT;
+-- We can force the use of a sequential scan rather than a TID scan or
+-- TID range scan.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(scan_table)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE ctid = '(0,1)';
+ QUERY PLAN
+--------------------------------------
+ Seq Scan on scan_table
+ Filter: (ctid = '(0,1)'::tid)
+ Supplied Plan Advice:
+ SEQ_SCAN(scan_table) /* matched */
+ Generated Plan Advice:
+ SEQ_SCAN(scan_table)
+ NO_GATHER(scan_table)
+(7 rows)
+
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table
+ WHERE ctid > '(1,1)' AND ctid < '(2,1)';
+ QUERY PLAN
+-------------------------------------------------------------
+ Seq Scan on scan_table
+ Filter: ((ctid > '(1,1)'::tid) AND (ctid < '(2,1)'::tid))
+ Supplied Plan Advice:
+ SEQ_SCAN(scan_table) /* matched */
+ Generated Plan Advice:
+ SEQ_SCAN(scan_table)
+ NO_GATHER(scan_table)
+(7 rows)
+
+COMMIT;
+-- Test more complex scenarios with index scans.
+BEGIN;
+-- Should still work if we mention the schema.
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table public.scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+ QUERY PLAN
+---------------------------------------------------------------
+ Index Scan using scan_table_pkey on scan_table
+ Index Cond: (a = 1)
+ Supplied Plan Advice:
+ INDEX_SCAN(scan_table public.scan_table_pkey) /* matched */
+ Generated Plan Advice:
+ INDEX_SCAN(scan_table public.scan_table_pkey)
+ NO_GATHER(scan_table)
+(7 rows)
+
+-- But not if we mention the wrong schema.
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table cilbup.scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+ QUERY PLAN
+-------------------------------------------------------------------------------------
+ Index Scan using scan_table_pkey on scan_table
+ Index Cond: (a = 1)
+ Supplied Plan Advice:
+ INDEX_SCAN(scan_table cilbup.scan_table_pkey) /* matched, inapplicable, failed */
+ Generated Plan Advice:
+ INDEX_SCAN(scan_table public.scan_table_pkey)
+ NO_GATHER(scan_table)
+(7 rows)
+
+-- It's OK to repeat the same advice.
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey scan_table scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+ QUERY PLAN
+--------------------------------------------------------
+ Index Scan using scan_table_pkey on scan_table
+ Index Cond: (a = 1)
+ Supplied Plan Advice:
+ INDEX_SCAN(scan_table scan_table_pkey) /* matched */
+ INDEX_SCAN(scan_table scan_table_pkey) /* matched */
+ Generated Plan Advice:
+ INDEX_SCAN(scan_table public.scan_table_pkey)
+ NO_GATHER(scan_table)
+(8 rows)
+
+-- But it doesn't work if the index target is even notionally different.
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey scan_table public.scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+ QUERY PLAN
+----------------------------------------------------------------------------
+ Index Scan using scan_table_pkey on scan_table
+ Index Cond: (a = 1)
+ Supplied Plan Advice:
+ INDEX_SCAN(scan_table scan_table_pkey) /* matched, conflicting */
+ INDEX_SCAN(scan_table public.scan_table_pkey) /* matched, conflicting */
+ Generated Plan Advice:
+ INDEX_SCAN(scan_table public.scan_table_pkey)
+ NO_GATHER(scan_table)
+(8 rows)
+
+COMMIT;
+-- Test assorted incorrect advice.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(nothing)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1;
+ QUERY PLAN
+------------------------------------------------------
+ Index Only Scan using scan_table_pkey on scan_table
+ Index Cond: (a = 1)
+ Supplied Plan Advice:
+ SEQ_SCAN(nothing) /* not matched */
+ Generated Plan Advice:
+ INDEX_ONLY_SCAN(scan_table public.scan_table_pkey)
+ NO_GATHER(scan_table)
+(7 rows)
+
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(nothing whatsoever)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1;
+ QUERY PLAN
+------------------------------------------------------
+ Index Only Scan using scan_table_pkey on scan_table
+ Index Cond: (a = 1)
+ Supplied Plan Advice:
+ INDEX_SCAN(nothing whatsoever) /* not matched */
+ Generated Plan Advice:
+ INDEX_ONLY_SCAN(scan_table public.scan_table_pkey)
+ NO_GATHER(scan_table)
+(7 rows)
+
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table bogus)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1;
+ QUERY PLAN
+--------------------------------------------------------------------
+ Index Only Scan using scan_table_pkey on scan_table
+ Index Cond: (a = 1)
+ Supplied Plan Advice:
+ INDEX_SCAN(scan_table bogus) /* matched, inapplicable, failed */
+ Generated Plan Advice:
+ INDEX_ONLY_SCAN(scan_table public.scan_table_pkey)
+ NO_GATHER(scan_table)
+(7 rows)
+
+SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(nothing whatsoever)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1;
+ QUERY PLAN
+---------------------------------------------------------
+ Index Only Scan using scan_table_pkey on scan_table
+ Index Cond: (a = 1)
+ Supplied Plan Advice:
+ INDEX_ONLY_SCAN(nothing whatsoever) /* not matched */
+ Generated Plan Advice:
+ INDEX_ONLY_SCAN(scan_table public.scan_table_pkey)
+ NO_GATHER(scan_table)
+(7 rows)
+
+SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(scan_table bogus)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1;
+ QUERY PLAN
+-------------------------------------------------------------------------
+ Index Only Scan using scan_table_pkey on scan_table
+ Index Cond: (a = 1)
+ Supplied Plan Advice:
+ INDEX_ONLY_SCAN(scan_table bogus) /* matched, inapplicable, failed */
+ Generated Plan Advice:
+ INDEX_ONLY_SCAN(scan_table public.scan_table_pkey)
+ NO_GATHER(scan_table)
+(7 rows)
+
+COMMIT;
+-- Test our ability to refer to multiple instances of the same alias.
+BEGIN;
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM (generate_series(1,10) g LEFT JOIN scan_table s ON g = s.a) x
+ LEFT JOIN scan_table s ON g = s.a;
+ QUERY PLAN
+-------------------------------------------------------------------
+ Nested Loop Left Join
+ -> Nested Loop Left Join
+ -> Function Scan on generate_series g
+ -> Index Scan using scan_table_pkey on scan_table s
+ Index Cond: (a = g.g)
+ -> Index Scan using scan_table_pkey on scan_table s_1
+ Index Cond: (a = g.g)
+ Generated Plan Advice:
+ JOIN_ORDER(g s s#2)
+ NESTED_LOOP_PLAIN(s s#2)
+ INDEX_SCAN(s public.scan_table_pkey s#2 public.scan_table_pkey)
+ NO_GATHER(s s#2)
+(12 rows)
+
+SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM (generate_series(1,10) g LEFT JOIN scan_table s ON g = s.a) x
+ LEFT JOIN scan_table s ON g = s.a;
+ QUERY PLAN
+----------------------------------------------------------
+ Nested Loop Left Join
+ -> Hash Left Join
+ Hash Cond: (g.g = s.a)
+ -> Function Scan on generate_series g
+ -> Hash
+ -> Seq Scan on scan_table s
+ -> Index Scan using scan_table_pkey on scan_table s_1
+ Index Cond: (a = g.g)
+ Supplied Plan Advice:
+ SEQ_SCAN(s) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(g s s#2)
+ NESTED_LOOP_PLAIN(s#2)
+ HASH_JOIN(s)
+ SEQ_SCAN(s)
+ INDEX_SCAN(s#2 public.scan_table_pkey)
+ NO_GATHER(s s#2)
+(17 rows)
+
+SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s#2)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM (generate_series(1,10) g LEFT JOIN scan_table s ON g = s.a) x
+ LEFT JOIN scan_table s ON g = s.a;
+ QUERY PLAN
+--------------------------------------------------------------
+ Hash Left Join
+ Hash Cond: (g.g = s_1.a)
+ -> Nested Loop Left Join
+ -> Function Scan on generate_series g
+ -> Index Scan using scan_table_pkey on scan_table s
+ Index Cond: (a = g.g)
+ -> Hash
+ -> Seq Scan on scan_table s_1
+ Supplied Plan Advice:
+ SEQ_SCAN(s#2) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(g s s#2)
+ NESTED_LOOP_PLAIN(s)
+ HASH_JOIN(s#2)
+ SEQ_SCAN(s#2)
+ INDEX_SCAN(s public.scan_table_pkey)
+ NO_GATHER(s s#2)
+(17 rows)
+
+SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s) SEQ_SCAN(s#2)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM (generate_series(1,10) g LEFT JOIN scan_table s ON g = s.a) x
+ LEFT JOIN scan_table s ON g = s.a;
+ QUERY PLAN
+------------------------------------------------
+ Hash Left Join
+ Hash Cond: (g.g = s_1.a)
+ -> Hash Left Join
+ Hash Cond: (g.g = s.a)
+ -> Function Scan on generate_series g
+ -> Hash
+ -> Seq Scan on scan_table s
+ -> Hash
+ -> Seq Scan on scan_table s_1
+ Supplied Plan Advice:
+ SEQ_SCAN(s) /* matched */
+ SEQ_SCAN(s#2) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(g s s#2)
+ HASH_JOIN(s s#2)
+ SEQ_SCAN(s s#2)
+ NO_GATHER(s s#2)
+(17 rows)
+
+COMMIT;
+-- Test our ability to refer to scans within a subquery.
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0) x;
+ QUERY PLAN
+--------------------------------------------------
+ Index Scan using scan_table_pkey on scan_table s
+ Index Cond: (a = 1)
+ Generated Plan Advice:
+ INDEX_SCAN(s@x public.scan_table_pkey)
+ NO_GATHER(s@x)
+(5 rows)
+
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0);
+ QUERY PLAN
+---------------------------------------------------------
+ Index Scan using scan_table_pkey on scan_table s
+ Index Cond: (a = 1)
+ Generated Plan Advice:
+ INDEX_SCAN(s@unnamed_subquery public.scan_table_pkey)
+ NO_GATHER(s@unnamed_subquery)
+(5 rows)
+
+BEGIN;
+-- Should not match.
+SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0) x;
+ QUERY PLAN
+--------------------------------------------------
+ Index Scan using scan_table_pkey on scan_table s
+ Index Cond: (a = 1)
+ Supplied Plan Advice:
+ SEQ_SCAN(s) /* not matched */
+ Generated Plan Advice:
+ INDEX_SCAN(s@x public.scan_table_pkey)
+ NO_GATHER(s@x)
+(7 rows)
+
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0);
+ QUERY PLAN
+---------------------------------------------------------
+ Index Scan using scan_table_pkey on scan_table s
+ Index Cond: (a = 1)
+ Supplied Plan Advice:
+ SEQ_SCAN(s) /* not matched */
+ Generated Plan Advice:
+ INDEX_SCAN(s@unnamed_subquery public.scan_table_pkey)
+ NO_GATHER(s@unnamed_subquery)
+(7 rows)
+
+-- Should match first query only.
+SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s@x)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0) x;
+ QUERY PLAN
+-------------------------------
+ Seq Scan on scan_table s
+ Filter: (a = 1)
+ Supplied Plan Advice:
+ SEQ_SCAN(s@x) /* matched */
+ Generated Plan Advice:
+ SEQ_SCAN(s@x)
+ NO_GATHER(s@x)
+(7 rows)
+
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0);
+ QUERY PLAN
+---------------------------------------------------------
+ Index Scan using scan_table_pkey on scan_table s
+ Index Cond: (a = 1)
+ Supplied Plan Advice:
+ SEQ_SCAN(s@x) /* not matched */
+ Generated Plan Advice:
+ INDEX_SCAN(s@unnamed_subquery public.scan_table_pkey)
+ NO_GATHER(s@unnamed_subquery)
+(7 rows)
+
+-- Should match second query only.
+SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s@unnamed_subquery)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0) x;
+ QUERY PLAN
+--------------------------------------------------
+ Index Scan using scan_table_pkey on scan_table s
+ Index Cond: (a = 1)
+ Supplied Plan Advice:
+ SEQ_SCAN(s@unnamed_subquery) /* not matched */
+ Generated Plan Advice:
+ INDEX_SCAN(s@x public.scan_table_pkey)
+ NO_GATHER(s@x)
+(7 rows)
+
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0);
+ QUERY PLAN
+----------------------------------------------
+ Seq Scan on scan_table s
+ Filter: (a = 1)
+ Supplied Plan Advice:
+ SEQ_SCAN(s@unnamed_subquery) /* matched */
+ Generated Plan Advice:
+ SEQ_SCAN(s@unnamed_subquery)
+ NO_GATHER(s@unnamed_subquery)
+(7 rows)
+
+COMMIT;
diff --git a/contrib/pg_plan_advice/expected/semijoin.out b/contrib/pg_plan_advice/expected/semijoin.out
new file mode 100644
index 00000000000..6f203c5a68e
--- /dev/null
+++ b/contrib/pg_plan_advice/expected/semijoin.out
@@ -0,0 +1,377 @@
+LOAD 'pg_plan_advice';
+SET max_parallel_workers_per_gather = 0;
+CREATE TABLE sj_wide (
+ id integer primary key,
+ val1 integer,
+ padding text storage plain
+) WITH (autovacuum_enabled = false);
+INSERT INTO sj_wide
+ SELECT g, g%10+1, repeat(' ', 300) FROM generate_series(1, 1000) g;
+CREATE INDEX ON sj_wide (val1);
+VACUUM ANALYZE sj_wide;
+CREATE TABLE sj_narrow (
+ id integer primary key,
+ val1 integer
+) WITH (autovacuum_enabled = false);
+INSERT INTO sj_narrow
+ SELECT g, g%10+1 FROM generate_series(1, 1000) g;
+CREATE INDEX ON sj_narrow (val1);
+VACUUM ANALYZE sj_narrow;
+-- We expect this to make the VALUES list unique and use index lookups to
+-- find the rows in sj_wide, so as to avoid a full scan of sj_wide.
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM sj_wide
+ WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
+ QUERY PLAN
+-----------------------------------------------------------
+ Nested Loop
+ -> HashAggregate
+ Group Key: "*VALUES*".column1, "*VALUES*".column2
+ -> Values Scan on "*VALUES*"
+ -> Index Scan using sj_wide_pkey on sj_wide
+ Index Cond: (id = "*VALUES*".column1)
+ Filter: (val1 = "*VALUES*".column2)
+ Generated Plan Advice:
+ JOIN_ORDER("*VALUES*" sj_wide)
+ NESTED_LOOP_PLAIN(sj_wide)
+ INDEX_SCAN(sj_wide public.sj_wide_pkey)
+ SEMIJOIN_UNIQUE("*VALUES*")
+ NO_GATHER(sj_wide)
+(13 rows)
+
+-- If we ask for a unique semijoin, we should get the same plan as with
+-- no advice. If we ask for a non-unique semijoin, we should see a Semi
+-- Join operation in the plan tree.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'semijoin_unique("*VALUES*")';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM sj_wide
+ WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
+ QUERY PLAN
+-----------------------------------------------------------
+ Nested Loop
+ -> HashAggregate
+ Group Key: "*VALUES*".column1, "*VALUES*".column2
+ -> Values Scan on "*VALUES*"
+ -> Index Scan using sj_wide_pkey on sj_wide
+ Index Cond: (id = "*VALUES*".column1)
+ Filter: (val1 = "*VALUES*".column2)
+ Supplied Plan Advice:
+ SEMIJOIN_UNIQUE("*VALUES*") /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER("*VALUES*" sj_wide)
+ NESTED_LOOP_PLAIN(sj_wide)
+ INDEX_SCAN(sj_wide public.sj_wide_pkey)
+ SEMIJOIN_UNIQUE("*VALUES*")
+ NO_GATHER(sj_wide)
+(15 rows)
+
+SET LOCAL pg_plan_advice.advice = 'semijoin_non_unique("*VALUES*")';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM sj_wide
+ WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
+ QUERY PLAN
+------------------------------------------------------------------------------------------
+ Hash Semi Join
+ Hash Cond: ((sj_wide.id = "*VALUES*".column1) AND (sj_wide.val1 = "*VALUES*".column2))
+ -> Seq Scan on sj_wide
+ -> Hash
+ -> Values Scan on "*VALUES*"
+ Supplied Plan Advice:
+ SEMIJOIN_NON_UNIQUE("*VALUES*") /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(sj_wide "*VALUES*")
+ HASH_JOIN("*VALUES*")
+ SEQ_SCAN(sj_wide)
+ SEMIJOIN_NON_UNIQUE("*VALUES*")
+ NO_GATHER(sj_wide)
+(13 rows)
+
+COMMIT;
+-- Because this table is narrower than the previous one, a sequential scan
+-- is less expensive, and we choose a straightforward Semi Join plan by
+-- default. (Note that this is also very sensitive to the length of the IN
+-- list, which affects how many index lookups the alternative plan will need.)
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM sj_narrow
+ WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
+ QUERY PLAN
+----------------------------------------------------------------------------------------------
+ Hash Semi Join
+ Hash Cond: ((sj_narrow.id = "*VALUES*".column1) AND (sj_narrow.val1 = "*VALUES*".column2))
+ -> Seq Scan on sj_narrow
+ -> Hash
+ -> Values Scan on "*VALUES*"
+ Generated Plan Advice:
+ JOIN_ORDER(sj_narrow "*VALUES*")
+ HASH_JOIN("*VALUES*")
+ SEQ_SCAN(sj_narrow)
+ SEMIJOIN_NON_UNIQUE("*VALUES*")
+ NO_GATHER(sj_narrow)
+(11 rows)
+
+-- Here, we expect advising a unique semijoin to switch to the same plan that
+-- we got with sj_wide, and advising a non-unique semijoin should not change
+-- the plan.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'semijoin_unique("*VALUES*")';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM sj_narrow
+ WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
+ QUERY PLAN
+----------------------------------------------------------------------------------------------
+ Hash Join
+ Hash Cond: ((sj_narrow.id = "*VALUES*".column1) AND (sj_narrow.val1 = "*VALUES*".column2))
+ -> Seq Scan on sj_narrow
+ -> Hash
+ -> HashAggregate
+ Group Key: "*VALUES*".column1, "*VALUES*".column2
+ -> Values Scan on "*VALUES*"
+ Supplied Plan Advice:
+ SEMIJOIN_UNIQUE("*VALUES*") /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(sj_narrow "*VALUES*")
+ HASH_JOIN("*VALUES*")
+ SEQ_SCAN(sj_narrow)
+ SEMIJOIN_UNIQUE("*VALUES*")
+ NO_GATHER(sj_narrow)
+(15 rows)
+
+SET LOCAL pg_plan_advice.advice = 'semijoin_non_unique("*VALUES*")';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM sj_narrow
+ WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
+ QUERY PLAN
+----------------------------------------------------------------------------------------------
+ Hash Semi Join
+ Hash Cond: ((sj_narrow.id = "*VALUES*".column1) AND (sj_narrow.val1 = "*VALUES*".column2))
+ -> Seq Scan on sj_narrow
+ -> Hash
+ -> Values Scan on "*VALUES*"
+ Supplied Plan Advice:
+ SEMIJOIN_NON_UNIQUE("*VALUES*") /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(sj_narrow "*VALUES*")
+ HASH_JOIN("*VALUES*")
+ SEQ_SCAN(sj_narrow)
+ SEMIJOIN_NON_UNIQUE("*VALUES*")
+ NO_GATHER(sj_narrow)
+(13 rows)
+
+COMMIT;
+-- In the above example, we made the outer side of the join unique, but here,
+-- we should make the inner side unique.
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM generate_series(1,1000) g
+ WHERE g in (select val1 from sj_narrow);
+ QUERY PLAN
+------------------------------------------
+ Hash Join
+ Hash Cond: (g.g = sj_narrow.val1)
+ -> Function Scan on generate_series g
+ -> Hash
+ -> HashAggregate
+ Group Key: sj_narrow.val1
+ -> Seq Scan on sj_narrow
+ Generated Plan Advice:
+ JOIN_ORDER(g sj_narrow)
+ HASH_JOIN(sj_narrow)
+ SEQ_SCAN(sj_narrow)
+ SEMIJOIN_UNIQUE(sj_narrow)
+ NO_GATHER(sj_narrow)
+(13 rows)
+
+-- We should be able to force a plan with or without the make-unique strategy,
+-- with either side as the driving table.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'semijoin_unique(sj_narrow)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM generate_series(1,1000) g
+ WHERE g in (select val1 from sj_narrow);
+ QUERY PLAN
+--------------------------------------------
+ Hash Join
+ Hash Cond: (g.g = sj_narrow.val1)
+ -> Function Scan on generate_series g
+ -> Hash
+ -> HashAggregate
+ Group Key: sj_narrow.val1
+ -> Seq Scan on sj_narrow
+ Supplied Plan Advice:
+ SEMIJOIN_UNIQUE(sj_narrow) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(g sj_narrow)
+ HASH_JOIN(sj_narrow)
+ SEQ_SCAN(sj_narrow)
+ SEMIJOIN_UNIQUE(sj_narrow)
+ NO_GATHER(sj_narrow)
+(15 rows)
+
+SET LOCAL pg_plan_advice.advice = 'semijoin_non_unique(sj_narrow)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM generate_series(1,1000) g
+ WHERE g in (select val1 from sj_narrow);
+ QUERY PLAN
+------------------------------------------------
+ Hash Semi Join
+ Hash Cond: (g.g = sj_narrow.val1)
+ -> Function Scan on generate_series g
+ -> Hash
+ -> Seq Scan on sj_narrow
+ Supplied Plan Advice:
+ SEMIJOIN_NON_UNIQUE(sj_narrow) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(g sj_narrow)
+ HASH_JOIN(sj_narrow)
+ SEQ_SCAN(sj_narrow)
+ SEMIJOIN_NON_UNIQUE(sj_narrow)
+ NO_GATHER(sj_narrow)
+(13 rows)
+
+SET LOCAL pg_plan_advice.advice = 'semijoin_unique(sj_narrow) join_order(sj_narrow)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM generate_series(1,1000) g
+ WHERE g in (select val1 from sj_narrow);
+ QUERY PLAN
+------------------------------------------------
+ Hash Join
+ Hash Cond: (sj_narrow.val1 = g.g)
+ -> HashAggregate
+ Group Key: sj_narrow.val1
+ -> Seq Scan on sj_narrow
+ -> Hash
+ -> Function Scan on generate_series g
+ Supplied Plan Advice:
+ SEMIJOIN_UNIQUE(sj_narrow) /* matched */
+ JOIN_ORDER(sj_narrow) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(sj_narrow g)
+ HASH_JOIN(g)
+ SEQ_SCAN(sj_narrow)
+ SEMIJOIN_UNIQUE(sj_narrow)
+ NO_GATHER(sj_narrow)
+(16 rows)
+
+SET LOCAL pg_plan_advice.advice = 'semijoin_non_unique(sj_narrow) join_order(sj_narrow)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM generate_series(1,1000) g
+ WHERE g in (select val1 from sj_narrow);
+ QUERY PLAN
+------------------------------------------------
+ Hash Right Semi Join
+ Hash Cond: (sj_narrow.val1 = g.g)
+ -> Seq Scan on sj_narrow
+ -> Hash
+ -> Function Scan on generate_series g
+ Supplied Plan Advice:
+ SEMIJOIN_NON_UNIQUE(sj_narrow) /* matched */
+ JOIN_ORDER(sj_narrow) /* matched */
+ Generated Plan Advice:
+ JOIN_ORDER(sj_narrow g)
+ HASH_JOIN(g)
+ SEQ_SCAN(sj_narrow)
+ SEMIJOIN_NON_UNIQUE(sj_narrow)
+ NO_GATHER(sj_narrow)
+(14 rows)
+
+COMMIT;
+-- However, mentioning the wrong side of the join should result in an advice
+-- failure.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'semijoin_unique(g)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM generate_series(1,1000) g
+ WHERE g in (select val1 from sj_narrow);
+ QUERY PLAN
+--------------------------------------------
+ Nested Loop
+ Disabled: true
+ Join Filter: (g.g = sj_narrow.val1)
+ -> HashAggregate
+ Group Key: sj_narrow.val1
+ -> Seq Scan on sj_narrow
+ -> Function Scan on generate_series g
+ Supplied Plan Advice:
+ SEMIJOIN_UNIQUE(g) /* matched, failed */
+ Generated Plan Advice:
+ JOIN_ORDER(sj_narrow g)
+ NESTED_LOOP_PLAIN(g)
+ SEQ_SCAN(sj_narrow)
+ SEMIJOIN_UNIQUE(sj_narrow)
+ NO_GATHER(sj_narrow)
+(15 rows)
+
+SET LOCAL pg_plan_advice.advice = 'semijoin_non_unique(g)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM generate_series(1,1000) g
+ WHERE g in (select val1 from sj_narrow);
+ QUERY PLAN
+------------------------------------------------
+ Nested Loop
+ Disabled: true
+ Join Filter: (g.g = sj_narrow.val1)
+ -> HashAggregate
+ Group Key: sj_narrow.val1
+ -> Seq Scan on sj_narrow
+ -> Function Scan on generate_series g
+ Supplied Plan Advice:
+ SEMIJOIN_NON_UNIQUE(g) /* matched, failed */
+ Generated Plan Advice:
+ JOIN_ORDER(sj_narrow g)
+ NESTED_LOOP_PLAIN(g)
+ SEQ_SCAN(sj_narrow)
+ SEMIJOIN_UNIQUE(sj_narrow)
+ NO_GATHER(sj_narrow)
+(15 rows)
+
+COMMIT;
+-- Test conflicting advice.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'semijoin_unique(sj_narrow) semijoin_non_unique(sj_narrow)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM generate_series(1,1000) g
+ WHERE g in (select val1 from sj_narrow);
+ QUERY PLAN
+---------------------------------------------------------------------
+ Hash Join
+ Hash Cond: (g.g = sj_narrow.val1)
+ -> Function Scan on generate_series g
+ -> Hash
+ -> HashAggregate
+ Group Key: sj_narrow.val1
+ -> Seq Scan on sj_narrow
+ Supplied Plan Advice:
+ SEMIJOIN_UNIQUE(sj_narrow) /* matched, conflicting */
+ SEMIJOIN_NON_UNIQUE(sj_narrow) /* matched, conflicting, failed */
+ Generated Plan Advice:
+ JOIN_ORDER(g sj_narrow)
+ HASH_JOIN(sj_narrow)
+ SEQ_SCAN(sj_narrow)
+ SEMIJOIN_UNIQUE(sj_narrow)
+ NO_GATHER(sj_narrow)
+(16 rows)
+
+COMMIT;
+-- Try applying SEMIJOIN_UNIQUE() to a non-semijoin.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'semijoin_unique(g)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM generate_series(1,1000) g, sj_narrow s WHERE g = s.val1;
+ QUERY PLAN
+----------------------------------------------------------
+ Merge Join
+ Merge Cond: (s.val1 = g.g)
+ -> Index Scan using sj_narrow_val1_idx on sj_narrow s
+ -> Sort
+ Sort Key: g.g
+ -> Function Scan on generate_series g
+ Supplied Plan Advice:
+ SEMIJOIN_UNIQUE(g) /* matched, inapplicable, failed */
+ Generated Plan Advice:
+ JOIN_ORDER(s g)
+ MERGE_JOIN_PLAIN(g)
+ INDEX_SCAN(s public.sj_narrow_val1_idx)
+ NO_GATHER(s)
+(13 rows)
+
+COMMIT;
diff --git a/contrib/pg_plan_advice/expected/syntax.out b/contrib/pg_plan_advice/expected/syntax.out
new file mode 100644
index 00000000000..be61402b569
--- /dev/null
+++ b/contrib/pg_plan_advice/expected/syntax.out
@@ -0,0 +1,192 @@
+LOAD 'pg_plan_advice';
+-- An empty string is allowed. Empty target lists are allowed for most advice
+-- tags, but not for JOIN_ORDER. "Supplied Plan Advice" should be omitted in
+-- text format when there is no actual advice, but not in non-text format.
+SET pg_plan_advice.advice = '';
+EXPLAIN (COSTS OFF) SELECT 1;
+ QUERY PLAN
+------------
+ Result
+(1 row)
+
+SET pg_plan_advice.advice = 'SEQ_SCAN()';
+EXPLAIN (COSTS OFF) SELECT 1;
+ QUERY PLAN
+------------
+ Result
+(1 row)
+
+SET pg_plan_advice.advice = 'NESTED_LOOP_PLAIN()';
+EXPLAIN (COSTS OFF, FORMAT JSON) SELECT 1;
+ QUERY PLAN
+--------------------------------
+ [ +
+ { +
+ "Plan": { +
+ "Node Type": "Result", +
+ "Parallel Aware": false,+
+ "Async Capable": false, +
+ "Disabled": false +
+ }, +
+ "Supplied Plan Advice": ""+
+ } +
+ ]
+(1 row)
+
+SET pg_plan_advice.advice = 'JOIN_ORDER()';
+ERROR: invalid value for parameter "pg_plan_advice.advice": "JOIN_ORDER()"
+DETAIL: Could not parse advice: JOIN_ORDER must have at least one target at or near ")"
+-- Test assorted variations in capitalization, whitespace, and which parts of
+-- the relation identifier are included. These should all work.
+SET pg_plan_advice.advice = 'SEQ_SCAN(x)';
+EXPLAIN (COSTS OFF) SELECT 1;
+ QUERY PLAN
+---------------------------------
+ Result
+ Supplied Plan Advice:
+ SEQ_SCAN(x) /* not matched */
+(3 rows)
+
+SET pg_plan_advice.advice = 'seq_scan(x@y)';
+EXPLAIN (COSTS OFF) SELECT 1;
+ QUERY PLAN
+-----------------------------------
+ Result
+ Supplied Plan Advice:
+ SEQ_SCAN(x@y) /* not matched */
+(3 rows)
+
+SET pg_plan_advice.advice = 'SEQ_scan(x#2)';
+EXPLAIN (COSTS OFF) SELECT 1;
+ QUERY PLAN
+-----------------------------------
+ Result
+ Supplied Plan Advice:
+ SEQ_SCAN(x#2) /* not matched */
+(3 rows)
+
+SET pg_plan_advice.advice = 'SEQ_SCAN (x/y)';
+EXPLAIN (COSTS OFF) SELECT 1;
+ QUERY PLAN
+-----------------------------------
+ Result
+ Supplied Plan Advice:
+ SEQ_SCAN(x/y) /* not matched */
+(3 rows)
+
+SET pg_plan_advice.advice = ' SEQ_SCAN ( x / y . z ) ';
+EXPLAIN (COSTS OFF) SELECT 1;
+ QUERY PLAN
+-------------------------------------
+ Result
+ Supplied Plan Advice:
+ SEQ_SCAN(x/y.z) /* not matched */
+(3 rows)
+
+SET pg_plan_advice.advice = 'SEQ_SCAN("x"#2/"y"."z"@"t")';
+EXPLAIN (COSTS OFF) SELECT 1;
+ QUERY PLAN
+-----------------------------------------
+ Result
+ Supplied Plan Advice:
+ SEQ_SCAN(x#2/y.z@t) /* not matched */
+(3 rows)
+
+-- Syntax errors.
+SET pg_plan_advice.advice = 'SEQUENTIAL_SCAN(x)';
+ERROR: invalid value for parameter "pg_plan_advice.advice": "SEQUENTIAL_SCAN(x)"
+DETAIL: Could not parse advice: syntax error at or near "SEQUENTIAL_SCAN"
+SET pg_plan_advice.advice = 'SEQ_SCAN';
+ERROR: invalid value for parameter "pg_plan_advice.advice": "SEQ_SCAN"
+DETAIL: Could not parse advice: syntax error at end of input
+SET pg_plan_advice.advice = 'SEQ_SCAN(';
+ERROR: invalid value for parameter "pg_plan_advice.advice": "SEQ_SCAN("
+DETAIL: Could not parse advice: syntax error at end of input
+SET pg_plan_advice.advice = 'SEQ_SCAN("';
+ERROR: invalid value for parameter "pg_plan_advice.advice": "SEQ_SCAN(""
+DETAIL: Could not parse advice: unterminated quoted identifier at end of input
+SET pg_plan_advice.advice = 'SEQ_SCAN("")';
+ERROR: invalid value for parameter "pg_plan_advice.advice": "SEQ_SCAN("")"
+DETAIL: Could not parse advice: zero-length delimited identifier at or near """
+SET pg_plan_advice.advice = 'SEQ_SCAN("a"';
+ERROR: invalid value for parameter "pg_plan_advice.advice": "SEQ_SCAN("a""
+DETAIL: Could not parse advice: syntax error at end of input
+SET pg_plan_advice.advice = 'SEQ_SCAN(#';
+ERROR: invalid value for parameter "pg_plan_advice.advice": "SEQ_SCAN(#"
+DETAIL: Could not parse advice: syntax error at or near "#"
+SET pg_plan_advice.advice = '()';
+ERROR: invalid value for parameter "pg_plan_advice.advice": "()"
+DETAIL: Could not parse advice: syntax error at or near "("
+SET pg_plan_advice.advice = '123';
+ERROR: invalid value for parameter "pg_plan_advice.advice": "123"
+DETAIL: Could not parse advice: syntax error at or near "123"
+-- Tags like SEQ_SCAN and NO_GATHER don't allow sublists at all; other tags,
+-- except for JOIN_ORDER, allow at most one level of sublist. Hence, these
+-- examples should error out.
+SET pg_plan_advice.advice = 'SEQ_SCAN((x))';
+ERROR: invalid value for parameter "pg_plan_advice.advice": "SEQ_SCAN((x))"
+DETAIL: Could not parse advice: syntax error at or near "("
+SET pg_plan_advice.advice = 'GATHER(((x)))';
+ERROR: invalid value for parameter "pg_plan_advice.advice": "GATHER(((x)))"
+DETAIL: Could not parse advice: syntax error at or near "("
+-- Legal comments.
+SET pg_plan_advice.advice = '/**/';
+EXPLAIN (COSTS OFF) SELECT 1;
+ QUERY PLAN
+------------
+ Result
+(1 row)
+
+SET pg_plan_advice.advice = 'HASH_JOIN(_)/***/';
+EXPLAIN (COSTS OFF) SELECT 1;
+ QUERY PLAN
+----------------------------------
+ Result
+ Supplied Plan Advice:
+ HASH_JOIN(_) /* not matched */
+(3 rows)
+
+SET pg_plan_advice.advice = '/* comment */ HASH_JOIN(/*x*/y)';
+EXPLAIN (COSTS OFF) SELECT 1;
+ QUERY PLAN
+----------------------------------
+ Result
+ Supplied Plan Advice:
+ HASH_JOIN(y) /* not matched */
+(3 rows)
+
+SET pg_plan_advice.advice = '/* comment */ HASH_JOIN(y//*x*/z)';
+EXPLAIN (COSTS OFF) SELECT 1;
+ QUERY PLAN
+------------------------------------
+ Result
+ Supplied Plan Advice:
+ HASH_JOIN(y/z) /* not matched */
+(3 rows)
+
+-- Unterminated comments.
+SET pg_plan_advice.advice = '/*';
+ERROR: invalid value for parameter "pg_plan_advice.advice": "/*"
+DETAIL: Could not parse advice: unterminated comment at end of input
+SET pg_plan_advice.advice = 'JOIN_ORDER("fOO") /* oops';
+ERROR: invalid value for parameter "pg_plan_advice.advice": "JOIN_ORDER("fOO") /* oops"
+DETAIL: Could not parse advice: unterminated comment at end of input
+-- Nested comments are not supported, so the first of these is legal and
+-- the second is not.
+SET pg_plan_advice.advice = '/*/*/';
+EXPLAIN (COSTS OFF) SELECT 1;
+ QUERY PLAN
+------------
+ Result
+(1 row)
+
+SET pg_plan_advice.advice = '/*/* stuff */*/';
+ERROR: invalid value for parameter "pg_plan_advice.advice": "/*/* stuff */*/"
+DETAIL: Could not parse advice: syntax error at or near "*"
+-- Foreign join requires multiple relation identifiers.
+SET pg_plan_advice.advice = 'FOREIGN_JOIN(a)';
+ERROR: invalid value for parameter "pg_plan_advice.advice": "FOREIGN_JOIN(a)"
+DETAIL: Could not parse advice: FOREIGN_JOIN targets must contain more than one relation identifier at or near ")"
+SET pg_plan_advice.advice = 'FOREIGN_JOIN((a))';
+ERROR: invalid value for parameter "pg_plan_advice.advice": "FOREIGN_JOIN((a))"
+DETAIL: Could not parse advice: FOREIGN_JOIN targets must contain more than one relation identifier at or near ")"
diff --git a/contrib/pg_plan_advice/meson.build b/contrib/pg_plan_advice/meson.build
new file mode 100644
index 00000000000..f7229dddcef
--- /dev/null
+++ b/contrib/pg_plan_advice/meson.build
@@ -0,0 +1,80 @@
+# Copyright (c) 2022-2024, PostgreSQL Global Development Group
+
+pg_plan_advice_sources = files(
+ 'pg_plan_advice.c',
+ 'pgpa_ast.c',
+ 'pgpa_collector.c',
+ 'pgpa_identifier.c',
+ 'pgpa_join.c',
+ 'pgpa_output.c',
+ 'pgpa_planner.c',
+ 'pgpa_scan.c',
+ 'pgpa_trove.c',
+ 'pgpa_walker.c',
+)
+
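+# Generate the advice scanner and parser from the flex and bison sources.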
+pgpa_scanner = custom_target('pgpa_scanner',
+ input: 'pgpa_scanner.l',
+ output: 'pgpa_scanner.c',
+ command: flex_cmd,
+)
+generated_sources += pgpa_scanner
+pg_plan_advice_sources += pgpa_scanner
+
+pgpa_parser = custom_target('pgpa_parser',
+ input: 'pgpa_parser.y',
+ kwargs: bison_kw,
+)
+generated_sources += pgpa_parser.to_list()
+pg_plan_advice_sources += pgpa_parser
+
+if host_system == 'windows'
+ pg_plan_advice_sources += rc_lib_gen.process(win32ver_rc, extra_args: [
+ '--NAME', 'pg_plan_advice',
+ '--FILEDESC', 'pg_plan_advice - help the planner get the right plan',])
+endif
+
+pg_plan_advice_inc = include_directories('.')
+
+pg_plan_advice = shared_module('pg_plan_advice',
+ pg_plan_advice_sources,
+ include_directories: pg_plan_advice_inc,
+ kwargs: contrib_mod_args,
+)
+contrib_targets += pg_plan_advice
+
+install_data(
+ 'pg_plan_advice--1.0.sql',
+ 'pg_plan_advice.control',
+ kwargs: contrib_data_args,
+)
+
+install_headers(
+ 'pg_plan_advice.h',
+ install_dir: dir_include_extension / 'pg_plan_advice',
+)
+
+tests += {
+ 'name': 'pg_plan_advice',
+ 'sd': meson.current_source_dir(),
+ 'bd': meson.current_build_dir(),
+ 'regress': {
+ 'sql': [
+ 'gather',
+ 'join_order',
+ 'join_strategy',
+ 'local_collector',
+ 'partitionwise',
+ 'prepared',
+ 'scan',
+ 'semijoin',
+ 'syntax',
+ ],
+ },
+ 'tap': {
+ 'tests': [
+ 't/001_regress.pl',
+ ],
+ },
+}
diff --git a/contrib/pg_plan_advice/pg_plan_advice--1.0.sql b/contrib/pg_plan_advice/pg_plan_advice--1.0.sql
new file mode 100644
index 00000000000..450c42040fd
--- /dev/null
+++ b/contrib/pg_plan_advice/pg_plan_advice--1.0.sql
@@ -0,0 +1,44 @@
+/* contrib/pg_plan_advice/pg_plan_advice--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION pg_plan_advice" to load this file. \quit
+
+CREATE FUNCTION pg_clear_collected_local_advice()
+RETURNS void
+AS 'MODULE_PATHNAME', 'pg_clear_collected_local_advice'
+LANGUAGE C STRICT;
+
+CREATE FUNCTION pg_clear_collected_shared_advice()
+RETURNS void
+AS 'MODULE_PATHNAME', 'pg_clear_collected_shared_advice'
+LANGUAGE C STRICT;
+
+CREATE FUNCTION pg_get_collected_local_advice(
+ OUT id bigint,
+ OUT userid oid,
+ OUT dbid oid,
+ OUT queryid bigint,
+ OUT collection_time timestamptz,
+ OUT query text,
+ OUT advice text
+)
+RETURNS SETOF record
+AS 'MODULE_PATHNAME', 'pg_get_collected_local_advice'
+LANGUAGE C STRICT;
+
+CREATE FUNCTION pg_get_collected_shared_advice(
+ OUT id bigint,
+ OUT userid oid,
+ OUT dbid oid,
+ OUT queryid bigint,
+ OUT collection_time timestamptz,
+ OUT query text,
+ OUT advice text
+)
+RETURNS SETOF record
+AS 'MODULE_PATHNAME', 'pg_get_collected_shared_advice'
+LANGUAGE C STRICT;
+
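+-- By default, only superusers may use the shared advice collector functions.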
+REVOKE ALL ON FUNCTION pg_clear_collected_shared_advice() FROM PUBLIC;
+REVOKE ALL ON FUNCTION pg_get_collected_shared_advice() FROM PUBLIC;
diff --git a/contrib/pg_plan_advice/pg_plan_advice.c b/contrib/pg_plan_advice/pg_plan_advice.c
new file mode 100644
index 00000000000..99b97843991
--- /dev/null
+++ b/contrib/pg_plan_advice/pg_plan_advice.c
@@ -0,0 +1,578 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_plan_advice.c
+ * main entrypoints for generating and applying planner advice
+ *
+ * Copyright (c) 2016-2024, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pg_plan_advice.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "pg_plan_advice.h"
+#include "pgpa_ast.h"
+#include "pgpa_collector.h"
+#include "pgpa_identifier.h"
+#include "pgpa_output.h"
+#include "pgpa_planner.h"
+#include "pgpa_trove.h"
+#include "pgpa_walker.h"
+
+#include "commands/defrem.h"
+#include "commands/explain.h"
+#include "commands/explain_format.h"
+#include "commands/explain_state.h"
+#include "funcapi.h"
+#include "optimizer/planner.h"
+#include "storage/dsm_registry.h"
+#include "utils/guc.h"
+
+PG_MODULE_MAGIC;
+
+static pgpa_shared_state *pgpa_state = NULL;
+static dsa_area *pgpa_dsa_area = NULL;
+static List *advisor_hook_list = NIL;
+
+/* GUC variables */
+char *pg_plan_advice_advice = NULL;
+bool pg_plan_advice_always_store_advice_details = false;
+static bool pg_plan_advice_always_explain_supplied_advice = true;
+bool pg_plan_advice_feedback_warnings = false;
+bool pg_plan_advice_local_collector = false;
+int pg_plan_advice_local_collection_limit = 0;
+bool pg_plan_advice_shared_collector = false;
+int pg_plan_advice_shared_collection_limit = 0;
+bool pg_plan_advice_trace_mask = false;
+
+/* Saved hook value */
+static explain_per_plan_hook_type prev_explain_per_plan = NULL;
+
+/* Other file-level globals */
+static int es_extension_id;
+static MemoryContext pgpa_memory_context = NULL;
+
+static void pgpa_init_shared_state(void *ptr, void *arg);
+static void pg_plan_advice_explain_option_handler(ExplainState *es,
+ DefElem *opt,
+ ParseState *pstate);
+static void pg_plan_advice_explain_per_plan_hook(PlannedStmt *plannedstmt,
+ IntoClause *into,
+ ExplainState *es,
+ const char *queryString,
+ ParamListInfo params,
+ QueryEnvironment *queryEnv);
+static bool pg_plan_advice_advice_check_hook(char **newval, void **extra,
+ GucSource source);
+static DefElem *find_defelem_by_defname(List *deflist, char *defname);
+
+/*
+ * Initialize this module.
+ */
+void
+_PG_init(void)
+{
+ DefineCustomStringVariable("pg_plan_advice.advice",
+							   "Advice to apply during query planning.",
+ NULL,
+ &pg_plan_advice_advice,
+ NULL,
+ PGC_USERSET,
+ 0,
+ pg_plan_advice_advice_check_hook,
+ NULL,
+ NULL);
+
+ DefineCustomBoolVariable("pg_plan_advice.always_explain_supplied_advice",
+							   "EXPLAIN output includes supplied advice even without EXPLAIN (PLAN_ADVICE).",
+ NULL,
+ &pg_plan_advice_always_explain_supplied_advice,
+ true,
+ PGC_USERSET,
+ 0,
+ NULL,
+ NULL,
+ NULL);
+
+ DefineCustomBoolVariable("pg_plan_advice.always_store_advice_details",
+							   "Generate advice strings even when seemingly not required.",
+ "Use this option to see generated advice for prepared queries.",
+ &pg_plan_advice_always_store_advice_details,
+ false,
+ PGC_USERSET,
+ 0,
+ NULL,
+ NULL,
+ NULL);
+
+ DefineCustomBoolVariable("pg_plan_advice.feedback_warnings",
+							   "Warn when supplied advice does not apply cleanly.",
+ NULL,
+ &pg_plan_advice_feedback_warnings,
+ false,
+ PGC_USERSET,
+ 0,
+ NULL,
+ NULL,
+ NULL);
+
+ DefineCustomBoolVariable("pg_plan_advice.local_collector",
+ "Enable the local advice collector.",
+ NULL,
+ &pg_plan_advice_local_collector,
+ false,
+ PGC_USERSET,
+ 0,
+ NULL,
+ NULL,
+ NULL);
+
+ DefineCustomIntVariable("pg_plan_advice.local_collection_limit",
+							"Number of advice entries to retain in per-backend memory.",
+ NULL,
+ &pg_plan_advice_local_collection_limit,
+ 0,
+ 0, INT_MAX,
+ PGC_USERSET,
+ 0,
+ NULL,
+ NULL,
+ NULL);
+
+ DefineCustomBoolVariable("pg_plan_advice.shared_collector",
+ "Enable the shared advice collector.",
+ NULL,
+ &pg_plan_advice_shared_collector,
+ false,
+ PGC_SUSET,
+ 0,
+ NULL,
+ NULL,
+ NULL);
+
+ DefineCustomIntVariable("pg_plan_advice.shared_collection_limit",
+							"Number of advice entries to retain in shared memory.",
+ NULL,
+ &pg_plan_advice_shared_collection_limit,
+ 0,
+ 0, INT_MAX,
+ PGC_SUSET,
+ 0,
+ NULL,
+ NULL,
+ NULL);
+
+ DefineCustomBoolVariable("pg_plan_advice.trace_mask",
+							 "Emit debugging messages showing the computed strategy mask for each relation.",
+ NULL,
+ &pg_plan_advice_trace_mask,
+ false,
+ PGC_USERSET,
+ 0,
+ NULL,
+ NULL,
+ NULL);
+
+ MarkGUCPrefixReserved("pg_plan_advice");
+
+ /* Get an ID that we can use to cache data in an ExplainState. */
+ es_extension_id = GetExplainExtensionId("pg_plan_advice");
+
+ /* Register the new EXPLAIN options implemented by this module. */
+ RegisterExtensionExplainOption("plan_advice",
+ pg_plan_advice_explain_option_handler);
+
+ /* Install hooks */
+ pgpa_planner_install_hooks();
+ prev_explain_per_plan = explain_per_plan_hook;
+ explain_per_plan_hook = pg_plan_advice_explain_per_plan_hook;
+}
+
+/*
+ * Initialize shared state when first created.
+ */
+static void
+pgpa_init_shared_state(void *ptr, void *arg)
+{
+ pgpa_shared_state *state = (pgpa_shared_state *) ptr;
+
+ LWLockInitialize(&state->lock, LWLockNewTrancheId("pg_plan_advice_lock"));
+ state->dsa_tranche = LWLockNewTrancheId("pg_plan_advice_dsa");
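+	/* The DSA area and shared collector are created lazily, on first use. */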
+ state->area = DSA_HANDLE_INVALID;
+ state->shared_collector = InvalidDsaPointer;
+}
+
+/*
+ * Return a pointer to a memory context where long-lived data managed by this
+ * module can be stored.
+ */
+MemoryContext
+pg_plan_advice_get_mcxt(void)
+{
+ if (pgpa_memory_context == NULL)
+ pgpa_memory_context = AllocSetContextCreate(TopMemoryContext,
+ "pg_plan_advice",
+ ALLOCSET_DEFAULT_SIZES);
+
+ return pgpa_memory_context;
+}
+
+/*
+ * Get a pointer to our shared state.
+ *
+ * If no shared state exists, create and initialize it. If it does exist but
+ * this backend has not yet accessed it, attach to it. Otherwise, just return
+ * our cached pointer.
+ *
+ * Along the way, make sure the relevant LWLock tranches are registered.
+ */
+pgpa_shared_state *
+pg_plan_advice_attach(void)
+{
+ if (pgpa_state == NULL)
+ {
+ bool found;
+
+ pgpa_state =
+ GetNamedDSMSegment("pg_plan_advice", sizeof(pgpa_shared_state),
+ pgpa_init_shared_state, &found, NULL);
+ }
+
+ return pgpa_state;
+}
+
+/*
+ * Return a pointer to pg_plan_advice's DSA area, creating it if needed.
+ */
+dsa_area *
+pg_plan_advice_dsa_area(void)
+{
+ if (pgpa_dsa_area == NULL)
+ {
+ pgpa_shared_state *state = pg_plan_advice_attach();
+ dsa_handle area_handle;
+ MemoryContext oldcontext;
+
+ oldcontext = MemoryContextSwitchTo(pg_plan_advice_get_mcxt());
+
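+		/*
+		 * The first backend to get here creates the DSA area; all others
+		 * attach to the existing one.
+		 */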
+ LWLockAcquire(&state->lock, LW_EXCLUSIVE);
+ area_handle = state->area;
+ if (area_handle == DSA_HANDLE_INVALID)
+ {
+ pgpa_dsa_area = dsa_create(state->dsa_tranche);
+ dsa_pin(pgpa_dsa_area);
+ state->area = dsa_get_handle(pgpa_dsa_area);
+ LWLockRelease(&state->lock);
+ }
+ else
+ {
+ LWLockRelease(&state->lock);
+ pgpa_dsa_area = dsa_attach(area_handle);
+ }
+
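+		/* Keep the area mapped for the rest of this backend's lifetime. */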
+ dsa_pin_mapping(pgpa_dsa_area);
+
+ MemoryContextSwitchTo(oldcontext);
+ }
+
+ return pgpa_dsa_area;
+}
+
+/*
+ * Was the PLAN_ADVICE option specified and not set to false?
+ */
+bool
+pg_plan_advice_should_explain(ExplainState *es)
+{
+ bool *plan_advice = NULL;
+
+ if (es != NULL)
+ plan_advice = GetExplainExtensionState(es, es_extension_id);
+ return plan_advice != NULL && *plan_advice;
+}
+
+/*
+ * Get the advice that should be used while planning a particular query.
+ */
+char *
+pg_plan_advice_get_supplied_query_advice(PlannerGlobal *glob,
+ Query *parse,
+ const char *query_string,
+ int cursorOptions,
+ ExplainState *es)
+{
+ ListCell *lc;
+
+ /*
+ * If any advisors are loaded, consult them. The first one that produces a
+ * non-NULL string wins.
+ */
+ foreach(lc, advisor_hook_list)
+ {
+ pg_plan_advice_advisor_hook hook = lfirst(lc);
+ char *advice_string;
+
+ advice_string = (*hook) (glob, parse, query_string, cursorOptions, es);
+ if (advice_string != NULL)
+ return advice_string;
+ }
+
+ /* Otherwise, just use the value of the GUC. */
+ return pg_plan_advice_advice;
+}
+
+/*
+ * Add an advisor, which can supply advice strings to be used during future
+ * query planning operations.
+ *
+ * The advisor should return NULL if it has no advice string to offer for a
+ * given query. If multiple advisors are added, they will be consulted in the
+ * order added until one of them returns a non-NULL value.
+ */
+void
+pg_plan_advice_add_advisor(pg_plan_advice_advisor_hook hook)
+{
+ MemoryContext oldcontext;
+
+ oldcontext = MemoryContextSwitchTo(pg_plan_advice_get_mcxt());
+ advisor_hook_list = lappend(advisor_hook_list, hook);
+ MemoryContextSwitchTo(oldcontext);
+}
+
+/*
+ * Remove an advisor.
+ */
+void
+pg_plan_advice_remove_advisor(pg_plan_advice_advisor_hook hook)
+{
+ MemoryContext oldcontext;
+
+ oldcontext = MemoryContextSwitchTo(pg_plan_advice_get_mcxt());
+ advisor_hook_list = list_delete_ptr(advisor_hook_list, hook);
+ MemoryContextSwitchTo(oldcontext);
+}
+
+/*
+ * Handler for EXPLAIN (PLAN_ADVICE).
+ */
+static void
+pg_plan_advice_explain_option_handler(ExplainState *es, DefElem *opt,
+ ParseState *pstate)
+{
+ bool *plan_advice;
+
+ plan_advice = GetExplainExtensionState(es, es_extension_id);
+
+ if (plan_advice == NULL)
+ {
+ plan_advice = palloc0_object(bool);
+ SetExplainExtensionState(es, es_extension_id, plan_advice);
+ }
+
+ *plan_advice = defGetBoolean(opt);
+}
+
+/*
+ * Display a string that is likely to consist of multiple lines in EXPLAIN
+ * output.
+ */
+static void
+pg_plan_advice_explain_text_multiline(ExplainState *es, char *qlabel,
+ char *value)
+{
+ char *s;
+
+ /* For non-text formats, it's best not to add any special handling. */
+ if (es->format != EXPLAIN_FORMAT_TEXT)
+ {
+ ExplainPropertyText(qlabel, value, es);
+ return;
+ }
+
+ /* In text format, if there is no data, display nothing. */
+ if (*value == '\0')
+ return;
+
+ /*
+ * It looks nicest to indent each line of the advice separately, beginning
+ * on the line below the label.
+ */
+ ExplainIndentText(es);
+ appendStringInfo(es->str, "%s:\n", qlabel);
+ es->indent++;
+ while ((s = strchr(value, '\n')) != NULL)
+ {
+ ExplainIndentText(es);
+ appendBinaryStringInfo(es->str, value, (s - value) + 1);
+ value = s + 1;
+ }
+
+ /* Don't interpret a terminal newline as a request for an empty line. */
+ if (*value != '\0')
+ {
+ ExplainIndentText(es);
+ appendStringInfo(es->str, "%s\n", value);
+ }
+
+ es->indent--;
+}
+
+/*
+ * Add advice feedback to the EXPLAIN output.
+ */
+static void
+pg_plan_advice_explain_feedback(ExplainState *es, List *feedback)
+{
+ StringInfoData buf;
+
+ initStringInfo(&buf);
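+
+	/* Emit one line per advice item, annotated with its outcome flags. */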
+ foreach_node(DefElem, item, feedback)
+ {
+ int flags = defGetInt32(item);
+
+ appendStringInfo(&buf, "%s /* ", item->defname);
+ pgpa_trove_append_flags(&buf, flags);
+ appendStringInfo(&buf, " */\n");
+ }
+
+ pg_plan_advice_explain_text_multiline(es, "Supplied Plan Advice",
+ buf.data);
+}
+
+/*
+ * Add relevant details, if any, to the EXPLAIN output for a single plan.
+ */
+static void
+pg_plan_advice_explain_per_plan_hook(PlannedStmt *plannedstmt,
+ IntoClause *into,
+ ExplainState *es,
+ const char *queryString,
+ ParamListInfo params,
+ QueryEnvironment *queryEnv)
+{
+ bool should_explain;
+ DefElem *pgpa_item;
+ List *pgpa_list;
+
+ if (prev_explain_per_plan)
+ prev_explain_per_plan(plannedstmt, into, es, queryString, params,
+ queryEnv);
+
+ /* Should an advice string be part of the EXPLAIN output? */
+ should_explain = pg_plan_advice_should_explain(es);
+
+	/* Find any data that pgpa_planner_shutdown stashed in the PlannedStmt. */
+ pgpa_item = find_defelem_by_defname(plannedstmt->extension_state,
+ "pg_plan_advice");
+ pgpa_list = pgpa_item == NULL ? NULL : (List *) pgpa_item->arg;
+
+ /*
+ * By default, if there is a record of attempting to apply advice during
+ * query planning, we always output that information, but the user can set
+ * pg_plan_advice.always_explain_supplied_advice = false to suppress that
+ * behavior. If they do, we'll only display it when the PLAN_ADVICE option
+ * was specified and not set to false.
+ *
+ * NB: If we're explaining a query planned beforehand -- i.e. a prepared
+ * statement -- the application of query advice may not have been
+ * recorded, and therefore this won't be able to show anything. Use
+ * pg_plan_advice.always_store_advice_details = true to work around this.
+ */
+ if (pgpa_list != NULL && (pg_plan_advice_always_explain_supplied_advice ||
+ should_explain))
+ {
+ DefElem *feedback;
+
+ feedback = find_defelem_by_defname(pgpa_list, "feedback");
+ if (feedback != NULL)
+ pg_plan_advice_explain_feedback(es, (List *) feedback->arg);
+ }
+
+ /*
+	 * If the PLAN_ADVICE option was specified -- and not set to FALSE --
+ * show generated advice.
+ */
+ if (should_explain)
+ {
+ DefElem *advice_string_item;
+ char *advice_string = NULL;
+
+ advice_string_item =
+ find_defelem_by_defname(pgpa_list, "advice_string");
+ if (advice_string_item != NULL)
+ {
+ advice_string = strVal(advice_string_item->arg);
+ pg_plan_advice_explain_text_multiline(es, "Generated Plan Advice",
+ advice_string);
+ }
+ }
+}
+
+/*
+ * Check hook for pg_plan_advice.advice
+ */
+static bool
+pg_plan_advice_advice_check_hook(char **newval, void **extra, GucSource source)
+{
+ MemoryContext oldcontext;
+ MemoryContext tmpcontext;
+ char *error;
+
+ if (*newval == NULL)
+ return true;
+
+ tmpcontext = AllocSetContextCreate(CurrentMemoryContext,
+ "pg_plan_advice.advice",
+ ALLOCSET_DEFAULT_SIZES);
+ oldcontext = MemoryContextSwitchTo(tmpcontext);
+
+ /*
+ * It would be nice to save the parse tree that we construct here for
+ * eventual use when planning with this advice, but *extra can only point
+ * to a single guc_malloc'd chunk, and our parse tree involves an
+ * arbitrary number of memory allocations.
+ */
+ (void) pgpa_parse(*newval, &error);
+
+	if (error != NULL)
+	{
+		GUC_check_errdetail("Could not parse advice: %s", error);
+
+		/*
+		 * The errdetail string was formatted into ErrorContext, so it is
+		 * safe to discard the temporary context before returning.
+		 */
+		MemoryContextSwitchTo(oldcontext);
+		MemoryContextDelete(tmpcontext);
+		return false;
+	}
+
+ MemoryContextSwitchTo(oldcontext);
+ MemoryContextDelete(tmpcontext);
+
+ return true;
+}
+
+/*
+ * Search a list of DefElem objects for a given defname.
+ */
+static DefElem *
+find_defelem_by_defname(List *deflist, char *defname)
+{
+ foreach_node(DefElem, item, deflist)
+ {
+ if (strcmp(item->defname, defname) == 0)
+ return item;
+ }
+
+ return NULL;
+}
diff --git a/contrib/pg_plan_advice/pg_plan_advice.control b/contrib/pg_plan_advice/pg_plan_advice.control
new file mode 100644
index 00000000000..aa6fdc9e7b2
--- /dev/null
+++ b/contrib/pg_plan_advice/pg_plan_advice.control
@@ -0,0 +1,5 @@
+# pg_plan_advice extension
+comment = 'help the planner get the right plan'
+default_version = '1.0'
+module_pathname = '$libdir/pg_plan_advice'
+relocatable = true
diff --git a/contrib/pg_plan_advice/pg_plan_advice.h b/contrib/pg_plan_advice/pg_plan_advice.h
new file mode 100644
index 00000000000..21f66092fa2
--- /dev/null
+++ b/contrib/pg_plan_advice/pg_plan_advice.h
@@ -0,0 +1,61 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_plan_advice.h
+ * main header file for pg_plan_advice contrib module
+ *
+ * Copyright (c) 2016-2024, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pg_plan_advice.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PG_PLAN_ADVICE_H
+#define PG_PLAN_ADVICE_H
+
+#include "commands/explain_state.h"
+#include "nodes/pathnodes.h"
+#include "nodes/plannodes.h"
+#include "storage/lwlock.h"
+#include "utils/dsa.h"
+
+typedef struct pgpa_shared_state
+{
+	LWLock		lock;			/* protects the fields below */
+	int			dsa_tranche;	/* LWLock tranche ID for the DSA area */
+	dsa_handle	area;			/* handle for the DSA area, once created */
+	dsa_pointer shared_collector;	/* shared advice collector, once created */
+} pgpa_shared_state;
+
+/* Hook for other plugins to supply advice strings */
+typedef char *(*pg_plan_advice_advisor_hook) (PlannerGlobal *glob,
+ Query *parse,
+ const char *query_string,
+ int cursorOptions,
+ ExplainState *es);
+
+/* GUC variables */
+extern char *pg_plan_advice_advice;
+extern bool pg_plan_advice_always_store_advice_details;
+extern bool pg_plan_advice_feedback_warnings;
+extern bool pg_plan_advice_local_collector;
+extern int pg_plan_advice_local_collection_limit;
+extern bool pg_plan_advice_shared_collector;
+extern int pg_plan_advice_shared_collection_limit;
+extern bool pg_plan_advice_trace_mask;
+
+/* Function prototypes (for use by pg_plan_advice itself) */
+extern MemoryContext pg_plan_advice_get_mcxt(void);
+extern pgpa_shared_state *pg_plan_advice_attach(void);
+extern dsa_area *pg_plan_advice_dsa_area(void);
+extern bool pg_plan_advice_should_explain(ExplainState *es);
+extern char *pg_plan_advice_get_supplied_query_advice(PlannerGlobal *glob,
+ Query *parse,
+ const char *query_string,
+ int cursorOptions,
+ ExplainState *es);
+
+/* Function prototypes (for use by other plugins) */
+extern PGDLLEXPORT void pg_plan_advice_add_advisor(pg_plan_advice_advisor_hook hook);
+extern PGDLLEXPORT void pg_plan_advice_remove_advisor(pg_plan_advice_advisor_hook hook);
+
+#endif							/* PG_PLAN_ADVICE_H */
diff --git a/contrib/pg_plan_advice/pgpa_ast.c b/contrib/pg_plan_advice/pgpa_ast.c
new file mode 100644
index 00000000000..85bd74859df
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_ast.c
@@ -0,0 +1,353 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgpa_ast.c
+ * additional supporting code related to plan advice parsing
+ *
+ * Copyright (c) 2016-2024, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_ast.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "pgpa_ast.h"
+
+#include "funcapi.h"
+#include "utils/array.h"
+#include "utils/builtins.h"
+
+static bool pgpa_identifiers_cover_target(int nrids, pgpa_identifier *rids,
+ pgpa_advice_target *target,
+ bool *rids_used);
+
+/*
+ * Get a C string that corresponds to the specified advice tag.
+ */
+char *
+pgpa_cstring_advice_tag(pgpa_advice_tag_type advice_tag)
+{
+ switch (advice_tag)
+ {
+ case PGPA_TAG_BITMAP_HEAP_SCAN:
+ return "BITMAP_HEAP_SCAN";
+ case PGPA_TAG_FOREIGN_JOIN:
+ return "FOREIGN_JOIN";
+ case PGPA_TAG_GATHER:
+ return "GATHER";
+ case PGPA_TAG_GATHER_MERGE:
+ return "GATHER_MERGE";
+ case PGPA_TAG_HASH_JOIN:
+ return "HASH_JOIN";
+ case PGPA_TAG_INDEX_ONLY_SCAN:
+ return "INDEX_ONLY_SCAN";
+ case PGPA_TAG_INDEX_SCAN:
+ return "INDEX_SCAN";
+ case PGPA_TAG_JOIN_ORDER:
+ return "JOIN_ORDER";
+ case PGPA_TAG_MERGE_JOIN_MATERIALIZE:
+ return "MERGE_JOIN_MATERIALIZE";
+ case PGPA_TAG_MERGE_JOIN_PLAIN:
+ return "MERGE_JOIN_PLAIN";
+ case PGPA_TAG_NESTED_LOOP_MATERIALIZE:
+ return "NESTED_LOOP_MATERIALIZE";
+ case PGPA_TAG_NESTED_LOOP_MEMOIZE:
+ return "NESTED_LOOP_MEMOIZE";
+ case PGPA_TAG_NESTED_LOOP_PLAIN:
+ return "NESTED_LOOP_PLAIN";
+ case PGPA_TAG_NO_GATHER:
+ return "NO_GATHER";
+ case PGPA_TAG_PARTITIONWISE:
+ return "PARTITIONWISE";
+ case PGPA_TAG_SEMIJOIN_NON_UNIQUE:
+ return "SEMIJOIN_NON_UNIQUE";
+ case PGPA_TAG_SEMIJOIN_UNIQUE:
+ return "SEMIJOIN_UNIQUE";
+ case PGPA_TAG_SEQ_SCAN:
+ return "SEQ_SCAN";
+ case PGPA_TAG_TID_SCAN:
+ return "TID_SCAN";
+ }
+
+ pg_unreachable();
+ return NULL;
+}
+
+/*
+ * Convert an advice tag, formatted as a string that has already been
+ * downcased as appropriate, to a pgpa_advice_tag_type.
+ *
+ * If we succeed, set *fail = false and return the result; if we fail,
+ * set *fail = true and return an arbitrary value.
+ */
+pgpa_advice_tag_type
+pgpa_parse_advice_tag(const char *tag, bool *fail)
+{
+ *fail = false;
+
+ switch (tag[0])
+ {
+ case 'b':
+ if (strcmp(tag, "bitmap_heap_scan") == 0)
+ return PGPA_TAG_BITMAP_HEAP_SCAN;
+ break;
+ case 'f':
+ if (strcmp(tag, "foreign_join") == 0)
+ return PGPA_TAG_FOREIGN_JOIN;
+ break;
+ case 'g':
+ if (strcmp(tag, "gather") == 0)
+ return PGPA_TAG_GATHER;
+ if (strcmp(tag, "gather_merge") == 0)
+ return PGPA_TAG_GATHER_MERGE;
+ break;
+ case 'h':
+ if (strcmp(tag, "hash_join") == 0)
+ return PGPA_TAG_HASH_JOIN;
+ break;
+ case 'i':
+ if (strcmp(tag, "index_scan") == 0)
+ return PGPA_TAG_INDEX_SCAN;
+ if (strcmp(tag, "index_only_scan") == 0)
+ return PGPA_TAG_INDEX_ONLY_SCAN;
+ break;
+ case 'j':
+ if (strcmp(tag, "join_order") == 0)
+ return PGPA_TAG_JOIN_ORDER;
+ break;
+ case 'm':
+ if (strcmp(tag, "merge_join_materialize") == 0)
+ return PGPA_TAG_MERGE_JOIN_MATERIALIZE;
+ if (strcmp(tag, "merge_join_plain") == 0)
+ return PGPA_TAG_MERGE_JOIN_PLAIN;
+ break;
+ case 'n':
+ if (strcmp(tag, "nested_loop_materialize") == 0)
+ return PGPA_TAG_NESTED_LOOP_MATERIALIZE;
+ if (strcmp(tag, "nested_loop_memoize") == 0)
+ return PGPA_TAG_NESTED_LOOP_MEMOIZE;
+ if (strcmp(tag, "nested_loop_plain") == 0)
+ return PGPA_TAG_NESTED_LOOP_PLAIN;
+ if (strcmp(tag, "no_gather") == 0)
+ return PGPA_TAG_NO_GATHER;
+ break;
+ case 'p':
+ if (strcmp(tag, "partitionwise") == 0)
+ return PGPA_TAG_PARTITIONWISE;
+ break;
+ case 's':
+ if (strcmp(tag, "semijoin_non_unique") == 0)
+ return PGPA_TAG_SEMIJOIN_NON_UNIQUE;
+ if (strcmp(tag, "semijoin_unique") == 0)
+ return PGPA_TAG_SEMIJOIN_UNIQUE;
+ if (strcmp(tag, "seq_scan") == 0)
+ return PGPA_TAG_SEQ_SCAN;
+ break;
+ case 't':
+ if (strcmp(tag, "tid_scan") == 0)
+ return PGPA_TAG_TID_SCAN;
+ break;
+ }
+
+ /* didn't work out */
+ *fail = true;
+
+ /* return an arbitrary value to unwind the call stack */
+ return PGPA_TAG_SEQ_SCAN;
+}
+
+/*
+ * Format a pgpa_advice_target as a string and append result to a StringInfo.
+ */
+void
+pgpa_format_advice_target(StringInfo str, pgpa_advice_target *target)
+{
+ if (target->ttype != PGPA_TARGET_IDENTIFIER)
+ {
+ bool first = true;
+ char *delims;
+
+ if (target->ttype == PGPA_TARGET_UNORDERED_LIST)
+ delims = "{}";
+ else
+ delims = "()";
+
+ appendStringInfoChar(str, delims[0]);
+ foreach_ptr(pgpa_advice_target, child_target, target->children)
+ {
+ if (first)
+ first = false;
+ else
+ appendStringInfoChar(str, ' ');
+ pgpa_format_advice_target(str, child_target);
+ }
+ appendStringInfoChar(str, delims[1]);
+ }
+ else
+ {
+ const char *rt_identifier;
+
+ rt_identifier = pgpa_identifier_string(&target->rid);
+ appendStringInfoString(str, rt_identifier);
+ }
+}
+
+/*
+ * Format a pgpa_index_target as a string and append result to a StringInfo.
+ */
+void
+pgpa_format_index_target(StringInfo str, pgpa_index_target *itarget)
+{
+ if (itarget->indnamespace != NULL)
+ appendStringInfo(str, "%s.",
+ quote_identifier(itarget->indnamespace));
+ appendStringInfoString(str, quote_identifier(itarget->indname));
+}
+
+/*
+ * Determine whether two pgpa_index_target objects are exactly identical.
+ */
+bool
+pgpa_index_targets_equal(pgpa_index_target *i1, pgpa_index_target *i2)
+{
+ /* indnamespace can be NULL, and two NULL values are equal */
+ if ((i1->indnamespace != NULL || i2->indnamespace != NULL) &&
+ (i1->indnamespace == NULL || i2->indnamespace == NULL ||
+ strcmp(i1->indnamespace, i2->indnamespace) != 0))
+ return false;
+ if (strcmp(i1->indname, i2->indname) != 0)
+ return false;
+
+ return true;
+}
+
+/*
+ * Check whether an identifier matches any part of an advice target.
+ */
+bool
+pgpa_identifier_matches_target(pgpa_identifier *rid, pgpa_advice_target *target)
+{
+	/* For non-identifiers, check all descendants. */
+ if (target->ttype != PGPA_TARGET_IDENTIFIER)
+ {
+ foreach_ptr(pgpa_advice_target, child_target, target->children)
+ {
+ if (pgpa_identifier_matches_target(rid, child_target))
+ return true;
+ }
+ return false;
+ }
+
+	/* Straightforward comparisons of alias name and occurrence number. */
+ if (strcmp(rid->alias_name, target->rid.alias_name) != 0)
+ return false;
+ if (rid->occurrence != target->rid.occurrence)
+ return false;
+
+ /*
+	 * If a relation identifier mentions a partition name, it should also
+ * specify a partition schema. But the target may leave the schema NULL to
+ * match anything.
+ */
+ Assert(rid->partnsp != NULL || rid->partrel == NULL);
+ if (rid->partnsp != NULL && target->rid.partnsp != NULL &&
+ strcmp(rid->partnsp, target->rid.partnsp) != 0)
+ return false;
+
+ /*
+ * These fields can be NULL on either side, but NULL only matches another
+ * NULL.
+ */
+ if (!strings_equal_or_both_null(rid->partrel, target->rid.partrel))
+ return false;
+ if (!strings_equal_or_both_null(rid->plan_name, target->rid.plan_name))
+ return false;
+
+ return true;
+}
+
+/*
+ * Match identifiers to advice targets and return an enum value indicating
+ * the relationship between the set of keys and the set of targets.
+ *
+ * See the comments for pgpa_itm_type.
+ */
+pgpa_itm_type
+pgpa_identifiers_match_target(int nrids, pgpa_identifier *rids,
+ pgpa_advice_target *target)
+{
+ bool all_rids_used = true;
+ bool any_rids_used = false;
+ bool all_targets_used;
+ bool *rids_used = palloc0_array(bool, nrids);
+
+ all_targets_used =
+ pgpa_identifiers_cover_target(nrids, rids, target, rids_used);
+
+ for (int i = 0; i < nrids; ++i)
+ {
+ if (rids_used[i])
+ any_rids_used = true;
+ else
+ all_rids_used = false;
+ }
+
+ if (all_rids_used)
+ {
+ if (all_targets_used)
+ return PGPA_ITM_EQUAL;
+ else
+ return PGPA_ITM_KEYS_ARE_SUBSET;
+ }
+ else
+ {
+ if (all_targets_used)
+ return PGPA_ITM_TARGETS_ARE_SUBSET;
+ else if (any_rids_used)
+ return PGPA_ITM_INTERSECTING;
+ else
+ return PGPA_ITM_DISJOINT;
+ }
+}
+
+/*
+ * Returns true if every target or sub-target is matched by at least one
+ * identifier, and false otherwise.
+ *
+ * Also sets rids_used[i] = true for each identifier that matches at least one
+ * target.
+ */
+static bool
+pgpa_identifiers_cover_target(int nrids, pgpa_identifier *rids,
+ pgpa_advice_target *target, bool *rids_used)
+{
+ bool result = false;
+
+ if (target->ttype != PGPA_TARGET_IDENTIFIER)
+ {
+ result = true;
+
+ foreach_ptr(pgpa_advice_target, child_target, target->children)
+ {
+ if (!pgpa_identifiers_cover_target(nrids, rids, child_target,
+ rids_used))
+ result = false;
+ }
+ }
+ else
+ {
+ for (int i = 0; i < nrids; ++i)
+ {
+ if (pgpa_identifier_matches_target(&rids[i], target))
+ {
+ rids_used[i] = true;
+ result = true;
+ }
+ }
+ }
+
+ return result;
+}
diff --git a/contrib/pg_plan_advice/pgpa_ast.h b/contrib/pg_plan_advice/pgpa_ast.h
new file mode 100644
index 00000000000..5d3f8d58a71
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_ast.h
@@ -0,0 +1,185 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgpa_ast.h
+ * abstract syntax trees for plan advice, plus parser/scanner support
+ *
+ * Copyright (c) 2016-2025, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_ast.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PGPA_AST_H
+#define PGPA_AST_H
+
+#include "pgpa_identifier.h"
+
+#include "nodes/pg_list.h"
+
+/*
+ * Advice items generally take the form SOME_TAG(item [...]), where an item
+ * can take various forms. The simplest case is a relation identifier, but
+ * some tags allow sublists, and JOIN_ORDER() allows both ordered and unordered
+ * sublists.
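+ *
+ * For example (with hypothetical relation identifiers), SEQ_SCAN(foo bar)
+ * applies one tag to two identifiers, while JOIN_ORDER((a b) {c d}) mixes
+ * an ordered sublist with an unordered one.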
+ */
+typedef enum
+{
+ PGPA_TARGET_IDENTIFIER, /* relation identifier */
+ PGPA_TARGET_ORDERED_LIST, /* (item ...) */
+ PGPA_TARGET_UNORDERED_LIST /* {item ...} */
+} pgpa_target_type;
+
+/*
+ * An index specification.
+ */
+typedef struct pgpa_index_target
+{
+ /* Index schema and name */
+ char *indnamespace;
+ char *indname;
+} pgpa_index_target;
+
+/*
+ * A single item about which advice is being given, which could be either
+ * a relation identifier that we want to break out into its constituent fields,
+ * or a sublist of some kind.
+ */
+typedef struct pgpa_advice_target
+{
+ pgpa_target_type ttype;
+
+ /*
+ * This field is meaningful when ttype is PGPA_TARGET_IDENTIFIER.
+ *
+ * All identifiers must have an alias name and an occurrence number; the
+ * remaining fields can be NULL. Note that it's possible to specify a
+ * partition name without a partition schema, but not the reverse.
+ */
+ pgpa_identifier rid;
+
+ /*
+	 * This field is set when ttype is PGPA_TARGET_IDENTIFIER and the advice
+ * tag is PGPA_TAG_INDEX_SCAN or PGPA_TAG_INDEX_ONLY_SCAN.
+ */
+ pgpa_index_target *itarget;
+
+ /*
+	 * When the ttype is PGPA_TARGET_ORDERED_LIST or PGPA_TARGET_UNORDERED_LIST,
+	 * this field contains a list of additional pgpa_advice_target objects.
+	 * Otherwise, it is unused.
+ */
+ List *children;
+} pgpa_advice_target;
+
+/*
+ * These are all the kinds of advice that we know how to parse. If a keyword
+ * is found at the top level, it must be in this list.
+ *
+ * If you change anything here, also update pgpa_parse_advice_tag and
+ * pgpa_cstring_advice_tag.
+ */
+typedef enum pgpa_advice_tag_type
+{
+ PGPA_TAG_BITMAP_HEAP_SCAN,
+ PGPA_TAG_FOREIGN_JOIN,
+ PGPA_TAG_GATHER,
+ PGPA_TAG_GATHER_MERGE,
+ PGPA_TAG_HASH_JOIN,
+ PGPA_TAG_INDEX_ONLY_SCAN,
+ PGPA_TAG_INDEX_SCAN,
+ PGPA_TAG_JOIN_ORDER,
+ PGPA_TAG_MERGE_JOIN_MATERIALIZE,
+ PGPA_TAG_MERGE_JOIN_PLAIN,
+ PGPA_TAG_NESTED_LOOP_MATERIALIZE,
+ PGPA_TAG_NESTED_LOOP_MEMOIZE,
+ PGPA_TAG_NESTED_LOOP_PLAIN,
+ PGPA_TAG_NO_GATHER,
+ PGPA_TAG_PARTITIONWISE,
+ PGPA_TAG_SEMIJOIN_NON_UNIQUE,
+ PGPA_TAG_SEMIJOIN_UNIQUE,
+ PGPA_TAG_SEQ_SCAN,
+ PGPA_TAG_TID_SCAN
+} pgpa_advice_tag_type;
+
+/*
+ * An item of advice, meaning a tag and the list of all targets to which
+ * it is being applied.
+ *
+ * "targets" is a list of pgpa_advice_target objects.
+ *
+ * The List returned from pgpa_yyparse is a list of pgpa_advice_item objects.
+ */
+typedef struct pgpa_advice_item
+{
+ pgpa_advice_tag_type tag;
+ List *targets;
+} pgpa_advice_item;
+
+/*
+ * Result of comparing an array of pgpa_identifier objects to a
+ * pgpa_advice_target.
+ *
+ * PGPA_ITM_EQUAL means all targets are matched by some identifier, and
+ * all identifiers were matched to a target.
+ *
+ * PGPA_ITM_KEYS_ARE_SUBSET means that all identifiers matched to a target,
+ * but there were leftover targets. Generally, this means that the advice is
+ * looking to apply to all of the rels we have plus some additional ones that
+ * we don't have.
+ *
+ * PGPA_ITM_TARGETS_ARE_SUBSET means that all targets are matched by an
+ * identifier, but there were leftover identifiers. Generally, this means
+ * that the advice is looking to apply to some but not all of the rels we have.
+ *
+ * PGPA_ITM_INTERSECTING means that some identifiers and targets were matched,
+ * but neither all identifiers nor all targets could be matched to items in
+ * the other set.
+ *
+ * PGPA_ITM_DISJOINT means that no matches between identifiers and targets were
+ * found.
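+ *
+ * For example, with hypothetical identifiers, matching the identifier set
+ * {a, b} against the target list (a b c) yields PGPA_ITM_KEYS_ARE_SUBSET,
+ * while matching {a, b, c} against (a b) yields PGPA_ITM_TARGETS_ARE_SUBSET.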
+ */
+typedef enum
+{
+ PGPA_ITM_EQUAL,
+ PGPA_ITM_KEYS_ARE_SUBSET,
+ PGPA_ITM_TARGETS_ARE_SUBSET,
+ PGPA_ITM_INTERSECTING,
+ PGPA_ITM_DISJOINT
+} pgpa_itm_type;
+
+/* for pgpa_scanner.l and pgpa_parser.y */
+union YYSTYPE;
+#ifndef YY_TYPEDEF_YY_SCANNER_T
+#define YY_TYPEDEF_YY_SCANNER_T
+typedef void *yyscan_t;
+#endif
+
+/* in pgpa_scanner.l */
+extern int pgpa_yylex(union YYSTYPE *yylval_param, List **result,
+ char **parse_error_msg_p, yyscan_t yyscanner);
+extern void pgpa_yyerror(List **result, char **parse_error_msg_p,
+ yyscan_t yyscanner,
+ const char *message);
+extern void pgpa_scanner_init(const char *str, yyscan_t *yyscannerp);
+extern void pgpa_scanner_finish(yyscan_t yyscanner);
+
+/* in pgpa_parser.y */
+extern int pgpa_yyparse(List **result, char **parse_error_msg_p,
+ yyscan_t yyscanner);
+extern List *pgpa_parse(const char *advice_string, char **error_p);
+
+/* in pgpa_ast.c */
+extern char *pgpa_cstring_advice_tag(pgpa_advice_tag_type advice_tag);
+extern bool pgpa_identifier_matches_target(pgpa_identifier *rid,
+ pgpa_advice_target *target);
+extern pgpa_itm_type pgpa_identifiers_match_target(int nrids,
+ pgpa_identifier *rids,
+ pgpa_advice_target *target);
+extern bool pgpa_index_targets_equal(pgpa_index_target *i1,
+ pgpa_index_target *i2);
+extern pgpa_advice_tag_type pgpa_parse_advice_tag(const char *tag, bool *fail);
+extern void pgpa_format_advice_target(StringInfo str,
+ pgpa_advice_target *target);
+extern void pgpa_format_index_target(StringInfo str,
+ pgpa_index_target *itarget);
+
+#endif
diff --git a/contrib/pg_plan_advice/pgpa_collector.c b/contrib/pg_plan_advice/pgpa_collector.c
new file mode 100644
index 00000000000..a0b0d7e1594
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_collector.c
@@ -0,0 +1,639 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgpa_collector.c
+ * collect advice into backend-local or shared memory
+ *
+ * Copyright (c) 2016-2025, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_collector.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "pg_plan_advice.h"
+#include "pgpa_collector.h"
+
+#include "datatype/timestamp.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "nodes/pg_list.h"
+#include "utils/acl.h"
+#include "utils/builtins.h"
+#include "utils/timestamp.h"
+
+PG_FUNCTION_INFO_V1(pg_clear_collected_local_advice);
+PG_FUNCTION_INFO_V1(pg_clear_collected_shared_advice);
+PG_FUNCTION_INFO_V1(pg_get_collected_local_advice);
+PG_FUNCTION_INFO_V1(pg_get_collected_shared_advice);
+
+#define ADVICE_CHUNK_SIZE 1024
+#define ADVICE_CHUNK_ARRAY_SIZE 64
+
+#define PG_GET_ADVICE_COLUMNS 7
+
+/*
+ * Advice extracted from one query plan, together with the query string
+ * and various other identifying details.
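+ *
+ * The query string and the advice string are stored back to back in
+ * textual_data, each with a terminating NUL; advice_offset is the index of
+ * the first byte of the advice string.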
+ */
+typedef struct pgpa_collected_advice
+{
+ Oid userid; /* user OID */
+ Oid dbid; /* database OID */
+ uint64 queryid; /* query identifier */
+ TimestampTz timestamp; /* query timestamp */
+ int advice_offset; /* start of advice in textual data */
+ char textual_data[FLEXIBLE_ARRAY_MEMBER];
+} pgpa_collected_advice;
+
+/*
+ * A bunch of pointers to pgpa_collected_advice objects, stored in
+ * backend-local memory.
+ */
+typedef struct pgpa_local_advice_chunk
+{
+ pgpa_collected_advice *entries[ADVICE_CHUNK_SIZE];
+} pgpa_local_advice_chunk;
+
+/*
+ * Information about all of the pgpa_collected_advice objects that we're
+ * storing in local memory.
+ *
+ * We assign consecutive IDs, starting from 0, to each pgpa_collected_advice
+ * object that we store. The actual storage is an array of chunks, which
+ * helps keep memcpy() overhead low when we start discarding older data.
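+ *
+ * For example, with ADVICE_CHUNK_SIZE 1024 and base_id 0, the entry with
+ * ID 1500 is stored at chunks[1]->entries[476]; when whole chunks are
+ * discarded, base_id advances so that the same arithmetic still works.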
+ */
+typedef struct pgpa_local_advice
+{
+ uint64 next_id;
+ uint64 oldest_id;
+ uint64 base_id;
+ int chunk_array_allocated_size;
+ pgpa_local_advice_chunk **chunks;
+} pgpa_local_advice;
+
+/*
+ * Just like pgpa_local_advice_chunk, but stored in a dynamic shared area,
+ * so we must use dsa_pointer instead of native pointers.
+ */
+typedef struct pgpa_shared_advice_chunk
+{
+ dsa_pointer entries[ADVICE_CHUNK_SIZE];
+} pgpa_shared_advice_chunk;
+
+/*
+ * Just like pgpa_local_advice, but stored in a dynamic shared area, so
+ * we must use dsa_pointer instead of native pointers.
+ */
+typedef struct pgpa_shared_advice
+{
+ uint64 next_id;
+ uint64 oldest_id;
+ uint64 base_id;
+ int chunk_array_allocated_size;
+ dsa_pointer chunks;
+} pgpa_shared_advice;
+
+/* Pointers to local and shared collectors */
+static pgpa_local_advice *local_collector = NULL;
+static pgpa_shared_advice *shared_collector = NULL;
+
+/* Static functions */
+static pgpa_collected_advice *pgpa_make_collected_advice(Oid userid,
+ Oid dbid,
+ uint64 queryId,
+ TimestampTz timestamp,
+ const char *query_string,
+ const char *advice_string,
+ dsa_area *area,
+ dsa_pointer *result);
+static void pgpa_store_local_advice(pgpa_collected_advice *ca);
+static void pgpa_trim_local_advice(int limit);
+static void pgpa_store_shared_advice(dsa_pointer ca_pointer);
+static void pgpa_trim_shared_advice(dsa_area *area, int limit);
+
+/* Helper function to extract the query string from pgpa_collected_advice */
+static inline const char *
+query_string(pgpa_collected_advice *ca)
+{
+ return ca->textual_data;
+}
+
+/* Helper function to extract the advice string from pgpa_collected_advice */
+static inline const char *
+advice_string(pgpa_collected_advice *ca)
+{
+ return ca->textual_data + ca->advice_offset;
+}
+
+/*
+ * Store collected query advice into the local or shared advice collector,
+ * as appropriate.
+ */
+void
+pgpa_collect_advice(uint64 queryId, const char *query_string,
+ const char *advice_string)
+{
+ Oid userid = GetUserId();
+ Oid dbid = MyDatabaseId;
+ TimestampTz now = GetCurrentTimestamp();
+
+ if (pg_plan_advice_local_collector &&
+ pg_plan_advice_local_collection_limit > 0)
+ {
+ pgpa_collected_advice *ca;
+ MemoryContext oldcontext;
+
+ oldcontext = MemoryContextSwitchTo(pg_plan_advice_get_mcxt());
+ ca = pgpa_make_collected_advice(userid, dbid, queryId, now,
+ query_string, advice_string,
+ NULL, NULL);
+ pgpa_store_local_advice(ca);
+ MemoryContextSwitchTo(oldcontext);
+ }
+
+ if (pg_plan_advice_shared_collector &&
+ pg_plan_advice_shared_collection_limit > 0)
+ {
+ dsa_area *area = pg_plan_advice_dsa_area();
+ dsa_pointer ca_pointer = InvalidDsaPointer; /* placate compiler */
+
+ pgpa_make_collected_advice(userid, dbid, queryId, now,
+ query_string, advice_string, area,
+ &ca_pointer);
+ pgpa_store_shared_advice(ca_pointer);
+ }
+}
+
+/*
+ * Allocate and fill a new pgpa_collected_advice object.
+ *
+ * If area != NULL, it is used to allocate the new object, and the resulting
+ * dsa_pointer is returned via *result.
+ *
+ * If area == NULL, the new object is allocated in the current memory context,
+ * and result is not examined or modified.
+ */
+static pgpa_collected_advice *
+pgpa_make_collected_advice(Oid userid, Oid dbid, uint64 queryId,
+ TimestampTz timestamp,
+ const char *query_string,
+ const char *advice_string,
+ dsa_area *area, dsa_pointer *result)
+{
+ size_t query_string_length = strlen(query_string) + 1;
+ size_t advice_string_length = strlen(advice_string) + 1;
+ size_t total_length;
+ pgpa_collected_advice *ca;
+
+ total_length = offsetof(pgpa_collected_advice, textual_data)
+ + query_string_length + advice_string_length;
+
+ if (area == NULL)
+ ca = palloc(total_length);
+ else
+ {
+ *result = dsa_allocate(area, total_length);
+ ca = dsa_get_address(area, *result);
+ }
+
+ ca->userid = userid;
+ ca->dbid = dbid;
+ ca->queryid = queryId;
+ ca->timestamp = timestamp;
+ ca->advice_offset = query_string_length;
+
+ memcpy(ca->textual_data, query_string, query_string_length);
+ memcpy(&ca->textual_data[ca->advice_offset],
+ advice_string, advice_string_length);
+
+ return ca;
+}
+
+/*
+ * Add a pgpa_collected_advice object to our backend-local advice collection.
+ *
+ * Caller is responsible for switching to the appropriate memory context;
+ * the provided object should have been allocated in that same context.
+ */
+static void
+pgpa_store_local_advice(pgpa_collected_advice *ca)
+{
+ uint64 chunk_number;
+ uint64 chunk_offset;
+ pgpa_local_advice *la = local_collector;
+
+ /* If the local advice collector isn't initialized yet, do that now. */
+ if (la == NULL)
+ {
+ la = palloc0(sizeof(pgpa_local_advice));
+ la->chunk_array_allocated_size = ADVICE_CHUNK_ARRAY_SIZE;
+ la->chunks = palloc0_array(pgpa_local_advice_chunk *,
+ la->chunk_array_allocated_size);
+ local_collector = la;
+ }
+
+ /* Compute chunk and offset at which to store this advice. */
+ chunk_number = (la->next_id - la->base_id) / ADVICE_CHUNK_SIZE;
+ chunk_offset = (la->next_id - la->base_id) % ADVICE_CHUNK_SIZE;
+
+ /* Extend chunk array, if needed. */
+ if (chunk_number >= la->chunk_array_allocated_size)
+ {
+ int new_size;
+
+ new_size = la->chunk_array_allocated_size + ADVICE_CHUNK_ARRAY_SIZE;
+ la->chunks = repalloc0_array(la->chunks,
+ pgpa_local_advice_chunk *,
+ la->chunk_array_allocated_size,
+ new_size);
+ la->chunk_array_allocated_size = new_size;
+ }
+
+ /* Allocate new chunk, if needed. */
+ if (la->chunks[chunk_number] == NULL)
+ la->chunks[chunk_number] = palloc0_object(pgpa_local_advice_chunk);
+
+ /* Save pointer and bump next-id counter. */
+ Assert(la->chunks[chunk_number]->entries[chunk_offset] == NULL);
+ la->chunks[chunk_number]->entries[chunk_offset] = ca;
+ ++la->next_id;
+
+ /* If we've exceeded the storage limit, discard old data. */
+ pgpa_trim_local_advice(pg_plan_advice_local_collection_limit);
+}
+
+/*
+ * Add a pgpa_collected_advice object to the shared advice collection.
+ *
+ * 'ca_pointer' should have been allocated from the pg_plan_advice DSA area
+ * and should point to an object of type pgpa_collected_advice.
+ */
+static void
+pgpa_store_shared_advice(dsa_pointer ca_pointer)
+{
+ uint64 chunk_number;
+ uint64 chunk_offset;
+ pgpa_shared_state *state = pg_plan_advice_attach();
+ dsa_area *area = pg_plan_advice_dsa_area();
+ pgpa_shared_advice *sa = shared_collector;
+ dsa_pointer *chunk_array;
+ pgpa_shared_advice_chunk *chunk;
+
+ /* Lock the shared state. */
+ LWLockAcquire(&state->lock, LW_EXCLUSIVE);
+
+ /*
+ * If we're not attached to the shared advice collector yet, fix that now.
+ * If we're the first ones to attach, we may need to create the object.
+ */
+ if (sa == NULL)
+ {
+ if (state->shared_collector == InvalidDsaPointer)
+ state->shared_collector =
+ dsa_allocate0(area, sizeof(pgpa_shared_advice));
+ shared_collector = sa = dsa_get_address(area, state->shared_collector);
+ }
+
+ /*
+ * It's possible that some other backend may have succeeded in creating
+ * the main collector object but failed to allocate an initial chunk
+ * array, so we must be prepared to allocate the chunk array here whether
+ * or not we created the collector object.
+ */
+	if (sa->chunk_array_allocated_size == 0)
+ {
+ sa->chunks =
+ dsa_allocate0(area,
+ sizeof(dsa_pointer) * ADVICE_CHUNK_ARRAY_SIZE);
+ sa->chunk_array_allocated_size = ADVICE_CHUNK_ARRAY_SIZE;
+ }
+
+ /* Compute chunk and offset at which to store this advice. */
+ chunk_number = (sa->next_id - sa->base_id) / ADVICE_CHUNK_SIZE;
+ chunk_offset = (sa->next_id - sa->base_id) % ADVICE_CHUNK_SIZE;
+
+ /* Get the address of the chunk array and, if needed, extend it. */
+ if (chunk_number >= sa->chunk_array_allocated_size)
+ {
+ int new_size;
+ dsa_pointer new_chunks;
+
+ /*
+ * DSA can't enlarge an existing allocation, so we must make a new
+ * allocation and copy data over.
+ */
+ new_size = sa->chunk_array_allocated_size + ADVICE_CHUNK_ARRAY_SIZE;
+ new_chunks = dsa_allocate0(area, sizeof(dsa_pointer) * new_size);
+ chunk_array = dsa_get_address(area, new_chunks);
+ memcpy(chunk_array, dsa_get_address(area, sa->chunks),
+ sizeof(dsa_pointer) * sa->chunk_array_allocated_size);
+ dsa_free(area, sa->chunks);
+ sa->chunks = new_chunks;
+ sa->chunk_array_allocated_size = new_size;
+ }
+ else
+ chunk_array = dsa_get_address(area, sa->chunks);
+
+ /* Get the address of the desired chunk, allocating it if needed. */
+ if (chunk_array[chunk_number] == InvalidDsaPointer)
+ chunk_array[chunk_number] =
+ dsa_allocate0(area, sizeof(pgpa_shared_advice_chunk));
+ chunk = dsa_get_address(area, chunk_array[chunk_number]);
+
+ /* Save pointer and bump next-id counter. */
+ Assert(chunk->entries[chunk_offset] == InvalidDsaPointer);
+ chunk->entries[chunk_offset] = ca_pointer;
+ ++sa->next_id;
+
+ /* If we've exceeded the storage limit, discard old data. */
+ pgpa_trim_shared_advice(area, pg_plan_advice_shared_collection_limit);
+
+ /* Release lock on shared state. */
+ LWLockRelease(&state->lock);
+}
+
+/*
+ * Discard collected advice stored in backend-local memory in excess of the
+ * specified limit.
+ */
+static void
+pgpa_trim_local_advice(int limit)
+{
+ pgpa_local_advice *la = local_collector;
+ uint64 current_count;
+ uint64 trim_count;
+ uint64 total_chunk_count;
+ uint64 trim_chunk_count;
+ uint64 remaining_chunk_count;
+
+ /* If we haven't yet reached the limit, there's nothing to do. */
+ current_count = la->next_id - la->oldest_id;
+ if (current_count <= limit)
+ return;
+
+ /* Free enough entries to get us back down to the limit. */
+ trim_count = current_count - limit;
+ while (trim_count > 0)
+ {
+ uint64 chunk_number;
+ uint64 chunk_offset;
+
+ chunk_number = (la->oldest_id - la->base_id) / ADVICE_CHUNK_SIZE;
+ chunk_offset = (la->oldest_id - la->base_id) % ADVICE_CHUNK_SIZE;
+
+ Assert(la->chunks[chunk_number]->entries[chunk_offset] != NULL);
+ pfree(la->chunks[chunk_number]->entries[chunk_offset]);
+ la->chunks[chunk_number]->entries[chunk_offset] = NULL;
+ ++la->oldest_id;
+ --trim_count;
+ }
+
+ /* Free any chunks that are now entirely unused. */
+ trim_chunk_count = (la->oldest_id - la->base_id) / ADVICE_CHUNK_SIZE;
+ for (uint64 n = 0; n < trim_chunk_count; ++n)
+ pfree(la->chunks[n]);
+
+ /* Slide remaining chunk pointers back toward the base of the array. */
+ total_chunk_count = (la->next_id - la->base_id +
+ ADVICE_CHUNK_SIZE - 1) / ADVICE_CHUNK_SIZE;
+ remaining_chunk_count = total_chunk_count - trim_chunk_count;
+ if (remaining_chunk_count > 0)
+ memmove(&la->chunks[0], &la->chunks[trim_chunk_count],
+ sizeof(pgpa_local_advice_chunk *) * remaining_chunk_count);
+
+ /* Don't leave stale pointers around. */
+ memset(&la->chunks[remaining_chunk_count], 0,
+ sizeof(pgpa_local_advice_chunk *)
+ * (total_chunk_count - remaining_chunk_count));
+
+ /* Adjust base ID value accordingly. */
+ la->base_id += trim_chunk_count * ADVICE_CHUNK_SIZE;
+}
+
+/*
+ * Discard collected advice stored in shared memory in excess of the
+ * specified limit.
+ */
+static void
+pgpa_trim_shared_advice(dsa_area *area, int limit)
+{
+ pgpa_shared_advice *sa = shared_collector;
+ uint64 current_count;
+ uint64 trim_count;
+ uint64 total_chunk_count;
+ uint64 trim_chunk_count;
+ uint64 remaining_chunk_count;
+ dsa_pointer *chunk_array;
+
+ /* If we haven't yet reached the limit, there's nothing to do. */
+ current_count = sa->next_id - sa->oldest_id;
+ if (current_count <= limit)
+ return;
+
+ /* Get a pointer to the chunk array. */
+ chunk_array = dsa_get_address(area, sa->chunks);
+
+ /* Free enough entries to get us back down to the limit. */
+ trim_count = current_count - limit;
+ while (trim_count > 0)
+ {
+ uint64 chunk_number;
+ uint64 chunk_offset;
+ pgpa_shared_advice_chunk *chunk;
+
+ chunk_number = (sa->oldest_id - sa->base_id) / ADVICE_CHUNK_SIZE;
+ chunk_offset = (sa->oldest_id - sa->base_id) % ADVICE_CHUNK_SIZE;
+
+ chunk = dsa_get_address(area, chunk_array[chunk_number]);
+ Assert(chunk->entries[chunk_offset] != InvalidDsaPointer);
+ dsa_free(area, chunk->entries[chunk_offset]);
+ chunk->entries[chunk_offset] = InvalidDsaPointer;
+ ++sa->oldest_id;
+ --trim_count;
+ }
+
+ /* Free any chunks that are now entirely unused. */
+ trim_chunk_count = (sa->oldest_id - sa->base_id) / ADVICE_CHUNK_SIZE;
+ for (uint64 n = 0; n < trim_chunk_count; ++n)
+ dsa_free(area, chunk_array[n]);
+
+ /* Slide remaining chunk pointers back toward the base of the array. */
+ total_chunk_count = (sa->next_id - sa->base_id +
+ ADVICE_CHUNK_SIZE - 1) / ADVICE_CHUNK_SIZE;
+ remaining_chunk_count = total_chunk_count - trim_chunk_count;
+ if (remaining_chunk_count > 0)
+ memmove(&chunk_array[0], &chunk_array[trim_chunk_count],
+ sizeof(dsa_pointer) * remaining_chunk_count);
+
+ /* Don't leave stale pointers around. */
+ memset(&chunk_array[remaining_chunk_count], 0,
+		   sizeof(dsa_pointer)
+ * (total_chunk_count - remaining_chunk_count));
+
+ /* Adjust base ID value accordingly. */
+ sa->base_id += trim_chunk_count * ADVICE_CHUNK_SIZE;
+}
+
+/*
+ * SQL-callable function to discard advice collected in backend-local memory
+ */
+Datum
+pg_clear_collected_local_advice(PG_FUNCTION_ARGS)
+{
+ if (local_collector != NULL)
+ pgpa_trim_local_advice(0);
+
+ PG_RETURN_VOID();
+}
+
+/*
+ * SQL-callable function to discard advice collected in shared memory
+ */
+Datum
+pg_clear_collected_shared_advice(PG_FUNCTION_ARGS)
+{
+ pgpa_shared_state *state = pg_plan_advice_attach();
+ dsa_area *area = pg_plan_advice_dsa_area();
+
+ LWLockAcquire(&state->lock, LW_EXCLUSIVE);
+
+ /*
+ * If we're not attached to the shared advice collector yet, fix that now;
+ * but if the collector doesn't even exist, we can return without doing
+ * anything else.
+ */
+ if (shared_collector == NULL)
+ {
+ if (state->shared_collector == InvalidDsaPointer)
+ {
+ LWLockRelease(&state->lock);
+ return (Datum) 0;
+ }
+ shared_collector = dsa_get_address(area, state->shared_collector);
+ }
+
+ /* Do the real work */
+ pgpa_trim_shared_advice(area, 0);
+
+ LWLockRelease(&state->lock);
+
+ PG_RETURN_VOID();
+}
+
+/*
+ * SQL-callable SRF to return advice collected in backend-local memory
+ */
+Datum
+pg_get_collected_local_advice(PG_FUNCTION_ARGS)
+{
+ ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ pgpa_local_advice *la = local_collector;
+ Oid userid = GetUserId();
+
+ InitMaterializedSRF(fcinfo, 0);
+
+ if (la == NULL)
+ return (Datum) 0;
+
+ /* Loop over all entries. */
+ for (uint64 id = la->oldest_id; id < la->next_id; ++id)
+ {
+ uint64 chunk_number;
+ uint64 chunk_offset;
+ pgpa_collected_advice *ca;
+ Datum values[PG_GET_ADVICE_COLUMNS];
+ bool nulls[PG_GET_ADVICE_COLUMNS] = {0};
+
+ chunk_number = (id - la->base_id) / ADVICE_CHUNK_SIZE;
+ chunk_offset = (id - la->base_id) % ADVICE_CHUNK_SIZE;
+
+ ca = la->chunks[chunk_number]->entries[chunk_offset];
+
+ if (!member_can_set_role(userid, ca->userid))
+ continue;
+
+ values[0] = UInt64GetDatum(id);
+ values[1] = ObjectIdGetDatum(ca->userid);
+ values[2] = ObjectIdGetDatum(ca->dbid);
+ values[3] = UInt64GetDatum(ca->queryid);
+ values[4] = TimestampGetDatum(ca->timestamp);
+ values[5] = CStringGetTextDatum(query_string(ca));
+ values[6] = CStringGetTextDatum(advice_string(ca));
+
+ tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
+ values, nulls);
+ }
+
+ return (Datum) 0;
+}
+
+/*
+ * SQL-callable SRF to return advice collected in shared memory
+ */
+Datum
+pg_get_collected_shared_advice(PG_FUNCTION_ARGS)
+{
+ ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ pgpa_shared_state *state = pg_plan_advice_attach();
+ dsa_area *area = pg_plan_advice_dsa_area();
+ dsa_pointer *chunk_array;
+ pgpa_shared_advice *sa = shared_collector;
+
+ InitMaterializedSRF(fcinfo, 0);
+
+ /* Lock the shared state. */
+ LWLockAcquire(&state->lock, LW_SHARED);
+
+ /*
+ * If we're not attached to the shared advice collector yet, fix that now;
+ * but if the collector doesn't even exist, we can return without doing
+ * anything else.
+ */
+ if (sa == NULL)
+ {
+ if (state->shared_collector == InvalidDsaPointer)
+ {
+ LWLockRelease(&state->lock);
+ return (Datum) 0;
+ }
+ shared_collector = sa = dsa_get_address(area, state->shared_collector);
+ }
+
+ /* Get a pointer to the chunk array. */
+ chunk_array = dsa_get_address(area, sa->chunks);
+
+ /* Loop over all entries. */
+ for (uint64 id = sa->oldest_id; id < sa->next_id; ++id)
+ {
+ uint64 chunk_number;
+ uint64 chunk_offset;
+ pgpa_shared_advice_chunk *chunk;
+ pgpa_collected_advice *ca;
+ Datum values[PG_GET_ADVICE_COLUMNS];
+ bool nulls[PG_GET_ADVICE_COLUMNS] = {0};
+
+ chunk_number = (id - sa->base_id) / ADVICE_CHUNK_SIZE;
+ chunk_offset = (id - sa->base_id) % ADVICE_CHUNK_SIZE;
+
+ chunk = dsa_get_address(area, chunk_array[chunk_number]);
+ ca = dsa_get_address(area, chunk->entries[chunk_offset]);
+
+ values[0] = UInt64GetDatum(id);
+ values[1] = ObjectIdGetDatum(ca->userid);
+ values[2] = ObjectIdGetDatum(ca->dbid);
+ values[3] = UInt64GetDatum(ca->queryid);
+ values[4] = TimestampGetDatum(ca->timestamp);
+ values[5] = CStringGetTextDatum(query_string(ca));
+ values[6] = CStringGetTextDatum(advice_string(ca));
+
+ tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
+ values, nulls);
+ }
+
+ /* Release lock on shared state. */
+ LWLockRelease(&state->lock);
+
+ return (Datum) 0;
+}
diff --git a/contrib/pg_plan_advice/pgpa_collector.h b/contrib/pg_plan_advice/pgpa_collector.h
new file mode 100644
index 00000000000..b6e746a06d7
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_collector.h
@@ -0,0 +1,18 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgpa_collector.h
+ * collect advice into backend-local or shared memory
+ *
+ * Copyright (c) 2016-2025, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_collector.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PGPA_COLLECTOR_H
+#define PGPA_COLLECTOR_H
+
+extern void pgpa_collect_advice(uint64 queryId, const char *query_string,
+ const char *advice_string);
+
+#endif
diff --git a/contrib/pg_plan_advice/pgpa_identifier.c b/contrib/pg_plan_advice/pgpa_identifier.c
new file mode 100644
index 00000000000..51b4b0c60a6
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_identifier.c
@@ -0,0 +1,476 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgpa_identifier.c
+ * create appropriate identifiers for range table entries
+ *
+ * The goal of this module is to be able to produce identifiers for range
+ * table entries that are unique, understandable to human beings, and
+ * able to be reconstructed during future planning cycles. As an
+ * exception, we do not care about, or want to produce, identifiers for
+ * RTE_JOIN entries. This is because (1) we would end up with a ton of
+ * RTEs with unhelpful names like unnamed_join_17; (2) not all joins have
+ * RTEs; and (3) we intend to refer to joins by their constituent members
+ * rather than by reference to the join RTE.
+ *
+ * In general, we construct identifiers of the following form:
+ *
+ * alias_name#occurrence_number/child_table_name@subquery_name
+ *
+ * However, occurrence_number is omitted when it is the first occurrence
+ * within the same subquery, child_table_name is omitted for relations that
+ * are not child tables, and subquery_name is omitted for the topmost
+ * query level. Whenever an item is omitted, the preceding punctuation mark
+ * is also omitted. Identifier-style escaping is applied to alias_name and
+ * subquery_name. Whenever we include child_table_name, we always
+ * schema-qualify it, but users writing their own plan advice are not required
+ * to do so. Identifier-style escaping is applied to the schema and to the
+ * relation names separately.
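+ *
+ * For example (hypothetical names), the second occurrence of an alias "o"
+ * that scans partition public.orders_2024 within a subquery named "sub"
+ * would be identified as:
+ *
+ *     o#2/public.orders_2024@sub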
+ *
+ * The upshot of all of these rules is that in simple cases, the relation
+ * identifier is textually identical to the alias name, making life easier
+ * for users. However, even in complex cases, every relation identifier
+ * for a given query will be unique (or at least we hope so: if not, this
+ * code is buggy and the identifier format might need to be rethought).
+ *
+ * A key goal of this system is that we want to be able to reconstruct the
+ * same identifiers during a future planning cycle for the same query, so
+ * that if a certain behavior is specified for a certain identifier, we can
+ * properly identify the RTI for which that behavior is mandated. In order
+ * for this to work, subquery names must be unique and known before the
+ * subquery is planned, and the remainder of the identifier must not depend
+ * on any part of the query outside of the current subquery level. In
+ * particular, occurrence_number must be calculated relative to the range
+ * table for the relevant subquery, not the final flattened range table.
+ *
+ * Copyright (c) 2016-2025, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_identifier.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "pgpa_identifier.h"
+
+#include "parser/parsetree.h"
+#include "utils/builtins.h"
+#include "utils/lsyscache.h"
+
+static Index *pgpa_create_top_rti_map(Index rtable_length, List *rtable,
+ List *appinfos);
+static int pgpa_occurrence_number(List *rtable, Index *top_rti_map,
+ SubPlanRTInfo *rtinfo, Index rti);
+
+/*
+ * Create a range table identifier from scratch.
+ *
+ * This function leaves the caller to do all the heavy lifting, so it's
+ * generally better to use one of the functions below instead.
+ *
+ * See the file header comments for more details on the format of an
+ * identifier.
+ */
+const char *
+pgpa_identifier_string(const pgpa_identifier *rid)
+{
+ const char *result;
+
+ Assert(rid->alias_name != NULL);
+ result = quote_identifier(rid->alias_name);
+
+ Assert(rid->occurrence >= 0);
+ if (rid->occurrence > 1)
+ result = psprintf("%s#%d", result, rid->occurrence);
+
+ if (rid->partrel != NULL)
+ {
+ if (rid->partnsp == NULL)
+ result = psprintf("%s/%s", result,
+ quote_identifier(rid->partrel));
+ else
+ result = psprintf("%s/%s.%s", result,
+ quote_identifier(rid->partnsp),
+ quote_identifier(rid->partrel));
+ }
+
+ if (rid->plan_name != NULL)
+ result = psprintf("%s@%s", result, quote_identifier(rid->plan_name));
+
+ return result;
+}
+
+/*
+ * Compute a relation identifier for a particular RTI.
+ *
+ * The caller provides root and rti, and gets the necessary details back via
+ * the remaining parameters.
+ */
+void
+pgpa_compute_identifier_by_rti(PlannerInfo *root, Index rti,
+ pgpa_identifier *rid)
+{
+ Index top_rti = rti;
+ int occurrence = 1;
+ RangeTblEntry *rte;
+ RangeTblEntry *top_rte;
+ char *partnsp = NULL;
+ char *partrel = NULL;
+
+ /*
+ * If this is a child RTE, find the topmost parent that is still of type
+ * RTE_RELATION. We do this because we identify children of partitioned
+ * tables by the name of the child table, but subqueries can also have
+ * child rels and we don't care about those here.
+ */
+ for (;;)
+ {
+ AppendRelInfo *appinfo;
+ RangeTblEntry *parent_rte;
+
+ /* append_rel_array can be NULL if there are no children */
+ if (root->append_rel_array == NULL ||
+ (appinfo = root->append_rel_array[top_rti]) == NULL)
+ break;
+
+ parent_rte = planner_rt_fetch(appinfo->parent_relid, root);
+ if (parent_rte->rtekind != RTE_RELATION)
+ break;
+
+ top_rti = appinfo->parent_relid;
+ }
+
+ /* Get the range table entries for the RTI and top RTI. */
+ rte = planner_rt_fetch(rti, root);
+ top_rte = planner_rt_fetch(top_rti, root);
+ Assert(rte->rtekind != RTE_JOIN);
+ Assert(top_rte->rtekind != RTE_JOIN);
+
+ /* Work out the correct occurrence number. */
+ for (Index prior_rti = 1; prior_rti < top_rti; ++prior_rti)
+ {
+ RangeTblEntry *prior_rte;
+ AppendRelInfo *appinfo;
+
+ /*
+ * If this is a child rel of a parent that is a relation, skip it.
+ *
+ * Such range table entries are disambiguated by mentioning the schema
+ * and name of the table, not by counting them as separate occurrences
+ * of the same table.
+ *
+ * NB: append_rel_array can be NULL if there are no children
+ */
+ if (root->append_rel_array != NULL &&
+ (appinfo = root->append_rel_array[prior_rti]) != NULL)
+ {
+ RangeTblEntry *parent_rte;
+
+ parent_rte = planner_rt_fetch(appinfo->parent_relid, root);
+ if (parent_rte->rtekind == RTE_RELATION)
+ continue;
+ }
+
+ /* Skip NULL entries and joins. */
+ prior_rte = planner_rt_fetch(prior_rti, root);
+ if (prior_rte == NULL || prior_rte->rtekind == RTE_JOIN)
+ continue;
+
+ /* Skip if the alias name differs. */
+ if (strcmp(prior_rte->eref->aliasname, rte->eref->aliasname) != 0)
+ continue;
+
+ /* Looks like a true duplicate. */
+ ++occurrence;
+ }
+
+ /* If this is a child table, get the schema and relation names. */
+ if (rti != top_rti)
+ {
+ partnsp = get_namespace_name_or_temp(get_rel_namespace(rte->relid));
+ partrel = get_rel_name(rte->relid);
+ }
+
+ /* OK, we have all the answers we need. Return them to the caller. */
+ rid->alias_name = top_rte->eref->aliasname;
+ rid->occurrence = occurrence;
+ rid->partnsp = partnsp;
+ rid->partrel = partrel;
+ rid->plan_name = root->plan_name;
+}
+
+/*
+ * Compute a relation identifier for a set of RTIs, except for any RTE_JOIN
+ * RTIs that may be present.
+ *
+ * RTE_JOIN entries are excluded because they cannot be mentioned by plan
+ * advice.
+ *
+ * The caller is responsible for making sure that the rids array is large
+ * enough to store the results.
+ *
+ * The return value is the number of identifiers computed.
+ */
+int
+pgpa_compute_identifiers_by_relids(PlannerInfo *root, Bitmapset *relids,
+ pgpa_identifier *rids)
+{
+ int count = 0;
+ int rti = -1;
+
+ while ((rti = bms_next_member(relids, rti)) >= 0)
+ {
+ RangeTblEntry *rte = planner_rt_fetch(rti, root);
+
+ if (rte->rtekind == RTE_JOIN)
+ continue;
+ pgpa_compute_identifier_by_rti(root, rti, &rids[count++]);
+ }
+
+ Assert(count > 0);
+ return count;
+}
+
+/*
+ * Create an array of range table identifiers for all the non-NULL,
+ * non-RTE_JOIN entries in the PlannedStmt's range table.
+ */
+pgpa_identifier *
+pgpa_create_identifiers_for_planned_stmt(PlannedStmt *pstmt)
+{
+ Index rtable_length = list_length(pstmt->rtable);
+ pgpa_identifier *result = palloc0_array(pgpa_identifier, rtable_length);
+ Index *top_rti_map;
+ int rtinfoindex = 0;
+ SubPlanRTInfo *rtinfo = NULL;
+ SubPlanRTInfo *nextrtinfo = NULL;
+
+ /*
+	 * Account for relations added by inheritance expansion of partitioned
+ * tables.
+ */
+ top_rti_map = pgpa_create_top_rti_map(rtable_length, pstmt->rtable,
+ pstmt->appendRelations);
+
+ /*
+ * When we begin iterating, we're processing the portion of the range
+	 * table that originated from the top-level PlannerInfo, so rtinfo is
+	 * NULL. Later, rtinfo will be the SubPlanRTInfo for the subquery whose
+ * portion of the range table we are processing. nextrtinfo is always the
+ * SubPlanRTInfo that follows the current one, if any, so when we're
+ * processing the top-level query's portion of the range table, the next
+ * SubPlanRTInfo is the very first one.
+ */
+ if (pstmt->subrtinfos != NULL)
+ nextrtinfo = linitial(pstmt->subrtinfos);
+
+ /* Main loop over the range table. */
+ for (Index rti = 1; rti <= rtable_length; rti++)
+ {
+ const char *plan_name;
+ Index top_rti;
+ RangeTblEntry *rte;
+ RangeTblEntry *top_rte;
+ char *partnsp = NULL;
+ char *partrel = NULL;
+ int occurrence;
+ pgpa_identifier *rid;
+
+ /*
+ * Advance to the next SubPlanRTInfo, if it's time to do that.
+ *
+ * This loop probably shouldn't ever iterate more than once, because
+ * that would imply that a subquery was planned but added nothing to
+ * the range table; but let's be defensive and assume it can happen.
+ */
+ while (nextrtinfo != NULL && rti > nextrtinfo->rtoffset)
+ {
+ rtinfo = nextrtinfo;
+ if (++rtinfoindex >= list_length(pstmt->subrtinfos))
+ nextrtinfo = NULL;
+ else
+ nextrtinfo = list_nth(pstmt->subrtinfos, rtinfoindex);
+ }
+
+ /* Fetch the range table entry, if any. */
+ rte = rt_fetch(rti, pstmt->rtable);
+
+ /*
+ * We can't and don't need to identify null entries, and we don't want
+ * to identify join entries.
+ */
+ if (rte == NULL || rte->rtekind == RTE_JOIN)
+ continue;
+
+ /*
+ * If this is not a relation added by partitioned table expansion,
+ * then the top RTI/RTE are just the same as this RTI/RTE. Otherwise,
+ * we need the information for the top RTI/RTE, and must also fetch
+ * the partition schema and name.
+ */
+ top_rti = top_rti_map[rti - 1];
+ if (rti == top_rti)
+ top_rte = rte;
+ else
+ {
+ top_rte = rt_fetch(top_rti, pstmt->rtable);
+ partnsp =
+ get_namespace_name_or_temp(get_rel_namespace(rte->relid));
+ partrel = get_rel_name(rte->relid);
+ }
+
+ /* Compute the correct occurrence number. */
+ occurrence = pgpa_occurrence_number(pstmt->rtable, top_rti_map,
+ rtinfo, top_rti);
+
+ /* Get the name of the current plan (NULL for toplevel query). */
+ plan_name = rtinfo == NULL ? NULL : rtinfo->plan_name;
+
+ /* Save all the details we've derived. */
+ rid = &result[rti - 1];
+ rid->alias_name = top_rte->eref->aliasname;
+ rid->occurrence = occurrence;
+ rid->partnsp = partnsp;
+ rid->partrel = partrel;
+ rid->plan_name = plan_name;
+ }
+
+ return result;
+}
+
+/*
+ * Search for a pgpa_identifier in the array of identifiers computed for the
+ * range table. If exactly one match is found, return the matching RTI; else
+ * return 0.
+ */
+Index
+pgpa_compute_rti_from_identifier(int rtable_length,
+ pgpa_identifier *rt_identifiers,
+ pgpa_identifier *rid)
+{
+ Index result = 0;
+
+ for (Index rti = 1; rti <= rtable_length; ++rti)
+ {
+ pgpa_identifier *rti_rid = &rt_identifiers[rti - 1];
+
+ /* If there's no identifier for this RTI, skip it. */
+ if (rti_rid->alias_name == NULL)
+ continue;
+
+ /*
+ * If it matches, return this RTI. As usual, an omitted partition
+ * schema matches anything, but partition and plan names must either
+ * match exactly or be omitted on both sides.
+ */
+ if (strcmp(rid->alias_name, rti_rid->alias_name) == 0 &&
+ rid->occurrence == rti_rid->occurrence &&
+ (rid->partnsp == NULL || rti_rid->partnsp == NULL ||
+ strcmp(rid->partnsp, rti_rid->partnsp) == 0) &&
+ strings_equal_or_both_null(rid->partrel, rti_rid->partrel) &&
+ strings_equal_or_both_null(rid->plan_name, rti_rid->plan_name))
+ {
+ if (result != 0)
+ {
+ /* Multiple matches were found. */
+ return 0;
+ }
+ result = rti;
+ }
+ }
+
+ return result;
+}
+
+/*
+ * Build a mapping from each RTI to the RTI whose alias_name will be used to
+ * construct the range table identifier.
+ *
+ * For child relations, this is the topmost parent that is still of type
+ * RTE_RELATION. For other relations, it's just the original RTI.
+ *
+ * Since we're eventually going to need this information for every RTI in
+ * the range table, it's best to compute all the answers in a single pass over
+ * the AppendRelInfo list. Otherwise, we might end up searching through that
+ * list repeatedly for entries of interest.
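+ *
+ * For example, if a partitioned table at RTI 1 has a child at RTI 3 that is
+ * itself partitioned with a child at RTI 5, then both RTI 3 and RTI 5 map
+ * to RTI 1; this works because parents precede their children in the
+ * AppendRelInfo list.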
+ *
+ * Note that the returned array uses zero-based indexing, while RTIs use
+ * 1-based indexing, so subtract 1 from the RTI before looking it up in the
+ * array.
+ */
+static Index *
+pgpa_create_top_rti_map(Index rtable_length, List *rtable, List *appinfos)
+{
+ Index *top_rti_map = palloc0_array(Index, rtable_length);
+
+ /* Initially, make every RTI point to itself. */
+ for (Index rti = 1; rti <= rtable_length; ++rti)
+ top_rti_map[rti - 1] = rti;
+
+ /* Update the map for each AppendRelInfo object. */
+ foreach_node(AppendRelInfo, appinfo, appinfos)
+ {
+ Index parent_rti = appinfo->parent_relid;
+ RangeTblEntry *parent_rte = rt_fetch(parent_rti, rtable);
+
+ /* If the parent is not RTE_RELATION, ignore this entry. */
+ if (parent_rte->rtekind != RTE_RELATION)
+ continue;
+
+ /*
+ * Map the child to wherever we mapped the parent. Parents always
+ * precede their children in the AppendRelInfo list, so this should
+ * work out.
+ */
+ top_rti_map[appinfo->child_relid - 1] = top_rti_map[parent_rti - 1];
+ }
+
+ return top_rti_map;
+}
+
+/*
+ * Find the occurrence number of a certain relation within a certain subquery.
+ *
+ * The same alias name can occur multiple times within a subquery, but we want
+ * to disambiguate by giving different occurrences different integer indexes.
+ * However, child tables are disambiguated by including the table name rather
+ * than by incrementing the occurrence number; and joins are not named and so
+ * shouldn't increment the occurrence number either.
+ */
+static int
+pgpa_occurrence_number(List *rtable, Index *top_rti_map,
+ SubPlanRTInfo *rtinfo, Index rti)
+{
+ Index rtoffset = (rtinfo == NULL) ? 0 : rtinfo->rtoffset;
+ int occurrence = 1;
+ RangeTblEntry *rte = rt_fetch(rti, rtable);
+
+ for (Index prior_rti = rtoffset + 1; prior_rti < rti; ++prior_rti)
+ {
+ RangeTblEntry *prior_rte;
+
+ /*
+ * If this is a child rel of a parent that is a relation, skip it.
+ *
+ * Such range table entries are disambiguated by mentioning the schema
+ * and name of the table, not by counting them as separate occurrences
+ * of the same table.
+ */
+ if (top_rti_map[prior_rti - 1] != prior_rti)
+ continue;
+
+ /* Skip joins. */
+ prior_rte = rt_fetch(prior_rti, rtable);
+ if (prior_rte->rtekind == RTE_JOIN)
+ continue;
+
+ /* Skip if the alias name differs. */
+ if (strcmp(prior_rte->eref->aliasname, rte->eref->aliasname) != 0)
+ continue;
+
+ /* Looks like a true duplicate. */
+ ++occurrence;
+ }
+
+ return occurrence;
+}
diff --git a/contrib/pg_plan_advice/pgpa_identifier.h b/contrib/pg_plan_advice/pgpa_identifier.h
new file mode 100644
index 00000000000..b000d2b7081
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_identifier.h
@@ -0,0 +1,52 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgpa_identifier.h
+ * create appropriate identifiers for range table entries
+ *
+ * Copyright (c) 2016-2025, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_identifier.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef PGPA_IDENTIFIER_H
+#define PGPA_IDENTIFIER_H
+
+#include "nodes/pathnodes.h"
+#include "nodes/plannodes.h"
+
+typedef struct pgpa_identifier
+{
+	const char *alias_name;		/* alias name, from the RTE's eref */
+	int			occurrence;		/* 1-based occurrence within the subquery */
+	const char *partnsp;		/* partition schema, or NULL */
+	const char *partrel;		/* partition relation name, or NULL */
+	const char *plan_name;		/* subquery plan name; NULL at top level */
+} pgpa_identifier;
+
+/* Convenience function for comparing possibly-NULL strings. */
+static inline bool
+strings_equal_or_both_null(const char *a, const char *b)
+{
+ if (a == b)
+ return true;
+ else if (a == NULL || b == NULL)
+ return false;
+ else
+ return strcmp(a, b) == 0;
+}
+
+extern const char *pgpa_identifier_string(const pgpa_identifier *rid);
+extern void pgpa_compute_identifier_by_rti(PlannerInfo *root, Index rti,
+ pgpa_identifier *rid);
+extern int pgpa_compute_identifiers_by_relids(PlannerInfo *root,
+ Bitmapset *relids,
+ pgpa_identifier *rids);
+extern pgpa_identifier *pgpa_create_identifiers_for_planned_stmt(PlannedStmt *pstmt);
+
+extern Index pgpa_compute_rti_from_identifier(int rtable_length,
+ pgpa_identifier *rt_identifiers,
+ pgpa_identifier *rid);
+
+#endif
diff --git a/contrib/pg_plan_advice/pgpa_join.c b/contrib/pg_plan_advice/pgpa_join.c
new file mode 100644
index 00000000000..ec8e1a666ec
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_join.c
@@ -0,0 +1,629 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgpa_join.c
+ * analysis of joins in Plan trees
+ *
+ * Copyright (c) 2016-2025, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_join.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "pgpa_join.h"
+#include "pgpa_scan.h"
+#include "pgpa_walker.h"
+
+#include "nodes/pathnodes.h"
+#include "nodes/print.h"
+#include "parser/parsetree.h"
+
+/*
+ * Temporary object used when unrolling a join tree.
+ */
+struct pgpa_join_unroller
+{
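+	/*
+	 * The inner_* fields are parallel arrays with nused valid entries out
+	 * of nallocated, one per unrolled join; the outer_* fields describe the
+	 * subplan at which outer-deep unrolling stopped.
+	 */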
+ unsigned nallocated;
+ unsigned nused;
+ Plan *outer_subplan;
+ ElidedNode *outer_elided_node;
+ bool outer_beneath_any_gather;
+ pgpa_join_strategy *strategy;
+ Plan **inner_subplans;
+ ElidedNode **inner_elided_nodes;
+ pgpa_join_unroller **inner_unrollers;
+ bool *inner_beneath_any_gather;
+};
+
+static pgpa_join_strategy pgpa_decompose_join(pgpa_plan_walker_context *walker,
+ Plan *plan,
+ Plan **realouter,
+ Plan **realinner,
+ ElidedNode **elidedrealouter,
+ ElidedNode **elidedrealinner,
+ bool *found_any_outer_gather,
+ bool *found_any_inner_gather);
+static ElidedNode *pgpa_descend_node(PlannedStmt *pstmt, Plan **plan);
+static ElidedNode *pgpa_descend_any_gather(PlannedStmt *pstmt, Plan **plan,
+ bool *found_any_gather);
+static bool pgpa_descend_any_unique(PlannedStmt *pstmt, Plan **plan,
+ ElidedNode **elided_node);
+
+static bool is_result_node_with_child(Plan *plan);
+static bool is_sorting_plan(Plan *plan);
+
+/*
+ * Create an initially-empty object for unrolling joins.
+ *
+ * This function creates a helper object that can later be used to create a
+ * pgpa_unrolled_join, after first calling pgpa_unroll_join one or more times.
+ */
+pgpa_join_unroller *
+pgpa_create_join_unroller(void)
+{
+ pgpa_join_unroller *join_unroller;
+
+ join_unroller = palloc0_object(pgpa_join_unroller);
+ join_unroller->nallocated = 4;
+ join_unroller->strategy =
+ palloc_array(pgpa_join_strategy, join_unroller->nallocated);
+ join_unroller->inner_subplans =
+ palloc_array(Plan *, join_unroller->nallocated);
+ join_unroller->inner_elided_nodes =
+ palloc_array(ElidedNode *, join_unroller->nallocated);
+ join_unroller->inner_unrollers =
+ palloc_array(pgpa_join_unroller *, join_unroller->nallocated);
+ join_unroller->inner_beneath_any_gather =
+ palloc_array(bool, join_unroller->nallocated);
+
+ return join_unroller;
+}
+
+/*
+ * Unroll one level of an unrollable join tree.
+ *
+ * Our basic goal here is to unroll join trees as they occur in the Plan
+ * tree into a simpler and more regular structure that we can more easily
+ * use for further processing. Unrolling is outer-deep, so if the plan tree
+ * has Join1(Join2(A,B),Join3(C,D)), the same join unroller object should be
+ * used for Join1 and Join2, but a different one will be needed for Join3,
+ * since that involves a join within the *inner* side of another join.
+ *
+ * pgpa_plan_walker creates a "top level" join unroller object when it
+ * encounters a join in a portion of the plan tree in which no join unroller
+ * is already active. From there, this function is responsible for determining
+ * to what portion of the plan tree that join unroller applies, and for
+ * creating any subordinate join unroller objects that are needed as a result
+ * of non-outer-deep join trees. We do this by returning the join unroller
+ * objects that should be used for further traversal of the outer and inner
+ * subtrees of the current plan node via *outer_join_unroller and
+ * *inner_join_unroller, respectively.
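+ *
+ * In the example above, the unroller shared by Join1 and Join2 ends up with
+ * A as its outer subplan and with inner subplans Join3 and B, in that order
+ * of addition; pgpa_build_unrolled_join later reverses the inner array so
+ * that the inner input of the deepest join (B) comes first.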
+ */
+void
+pgpa_unroll_join(pgpa_plan_walker_context *walker, Plan *plan,
+ bool beneath_any_gather,
+ pgpa_join_unroller *join_unroller,
+ pgpa_join_unroller **outer_join_unroller,
+ pgpa_join_unroller **inner_join_unroller)
+{
+ pgpa_join_strategy strategy;
+ Plan *realinner,
+ *realouter;
+ ElidedNode *elidedinner,
+ *elidedouter;
+ int n;
+ bool found_any_outer_gather = false;
+ bool found_any_inner_gather = false;
+
+ Assert(join_unroller != NULL);
+
+ /*
+ * We need to pass the join_unroller object down through certain types of
+ * plan nodes -- anything that's considered part of the join strategy, and
+ * any other nodes that can occur in a join tree despite not being scans
+ * or joins.
+ *
+ * This includes:
+ *
+ * (1) Materialize, Memoize, and Hash nodes, which are part of the join
+ * strategy,
+ *
+ * (2) Gather and Gather Merge nodes, which can occur at any point in the
+ * join tree where the planner decided to initiate parallelism,
+ *
+ * (3) Sort and IncrementalSort nodes, which can occur beneath MergeJoin
+ * or GatherMerge,
+ *
+ * (4) Agg and Unique nodes, which can occur when we decide to make the
+ * nullable side of a semijoin unique and then join the result, and
+ *
+	 * (5) Result nodes with children, which can be added either to project or
+	 * to enforce a one-time filter (but Result nodes without children are
+ * degenerate scans or joins).
+ */
+ if (IsA(plan, Material) || IsA(plan, Memoize) || IsA(plan, Hash)
+ || IsA(plan, Gather) || IsA(plan, GatherMerge)
+ || is_sorting_plan(plan) || IsA(plan, Agg) || IsA(plan, Unique)
+ || is_result_node_with_child(plan))
+ {
+ *outer_join_unroller = join_unroller;
+ return;
+ }
+
+ /*
+ * Since we've already handled nodes that require pass-through treatment,
+ * this should be an unrollable join.
+ */
+ strategy = pgpa_decompose_join(walker, plan,
+ &realouter, &realinner,
+ &elidedouter, &elidedinner,
+ &found_any_outer_gather,
+ &found_any_inner_gather);
+
+ /* If our workspace is full, expand it. */
+ if (join_unroller->nused >= join_unroller->nallocated)
+ {
+ join_unroller->nallocated *= 2;
+ join_unroller->strategy =
+ repalloc_array(join_unroller->strategy,
+ pgpa_join_strategy,
+ join_unroller->nallocated);
+ join_unroller->inner_subplans =
+ repalloc_array(join_unroller->inner_subplans,
+ Plan *,
+ join_unroller->nallocated);
+ join_unroller->inner_elided_nodes =
+ repalloc_array(join_unroller->inner_elided_nodes,
+ ElidedNode *,
+ join_unroller->nallocated);
+ join_unroller->inner_beneath_any_gather =
+ repalloc_array(join_unroller->inner_beneath_any_gather,
+ bool,
+ join_unroller->nallocated);
+ join_unroller->inner_unrollers =
+ repalloc_array(join_unroller->inner_unrollers,
+ pgpa_join_unroller *,
+ join_unroller->nallocated);
+ }
+
+ /*
+ * Since we're flattening outer-deep join trees, it follows that if the
+ * outer side is still an unrollable join, it should be unrolled into this
+ * same object. Otherwise, we've reached the limit of what we can unroll
+ * into this object and must remember the outer side as the final outer
+ * subplan.
+ */
+ if (elidedouter == NULL && pgpa_is_join(realouter))
+ *outer_join_unroller = join_unroller;
+ else
+ {
+ join_unroller->outer_subplan = realouter;
+ join_unroller->outer_elided_node = elidedouter;
+ join_unroller->outer_beneath_any_gather =
+ beneath_any_gather || found_any_outer_gather;
+ }
+
+ /*
+ * Store the inner subplan. If it's an unrollable join, it needs to be
+ * flattened in turn, but into a new unroller object, not this one.
+ */
+ n = join_unroller->nused++;
+ join_unroller->strategy[n] = strategy;
+ join_unroller->inner_subplans[n] = realinner;
+ join_unroller->inner_elided_nodes[n] = elidedinner;
+ join_unroller->inner_beneath_any_gather[n] =
+ beneath_any_gather || found_any_inner_gather;
+ if (elidedinner == NULL && pgpa_is_join(realinner))
+ *inner_join_unroller = pgpa_create_join_unroller();
+ else
+ *inner_join_unroller = NULL;
+ join_unroller->inner_unrollers[n] = *inner_join_unroller;
+}
+
+/*
+ * Use the data we've accumulated in a pgpa_join_unroller object to construct
+ * a pgpa_unrolled_join.
+ */
+pgpa_unrolled_join *
+pgpa_build_unrolled_join(pgpa_plan_walker_context *walker,
+ pgpa_join_unroller *join_unroller)
+{
+ pgpa_unrolled_join *ujoin;
+ int i;
+
+ /*
+ * We shouldn't have gone even so far as to create a join unroller unless
+ * we found at least one unrollable join.
+ */
+ Assert(join_unroller->nused > 0);
+
+ /* Allocate result structures. */
+ ujoin = palloc0_object(pgpa_unrolled_join);
+ ujoin->ninner = join_unroller->nused;
+ ujoin->strategy = palloc0_array(pgpa_join_strategy, join_unroller->nused);
+ ujoin->inner = palloc0_array(pgpa_join_member, join_unroller->nused);
+
+ /* Handle the outermost join. */
+ ujoin->outer.plan = join_unroller->outer_subplan;
+ ujoin->outer.elided_node = join_unroller->outer_elided_node;
+ ujoin->outer.scan =
+ pgpa_build_scan(walker, ujoin->outer.plan,
+ ujoin->outer.elided_node,
+ join_unroller->outer_beneath_any_gather,
+ true);
+
+ /*
+ * We want the joins from the deepest part of the plan tree to appear
+ * first in the result object, but the join unroller adds them in exactly
+ * the reverse of that order, so we need to flip the order of the arrays
+ * when constructing the final result.
+ */
+ for (i = 0; i < join_unroller->nused; ++i)
+ {
+ int k = join_unroller->nused - i - 1;
+
+ /* Copy strategy, Plan, and ElidedNode. */
+ ujoin->strategy[i] = join_unroller->strategy[k];
+ ujoin->inner[i].plan = join_unroller->inner_subplans[k];
+ ujoin->inner[i].elided_node = join_unroller->inner_elided_nodes[k];
+
+ /*
+ * Fill in remaining details, using either the nested join unroller,
+ * or by deriving them from the plan and elided nodes.
+ */
+ if (join_unroller->inner_unrollers[k] != NULL)
+ ujoin->inner[i].unrolled_join =
+ pgpa_build_unrolled_join(walker,
+ join_unroller->inner_unrollers[k]);
+ else
+ ujoin->inner[i].scan =
+ pgpa_build_scan(walker, ujoin->inner[i].plan,
+ ujoin->inner[i].elided_node,
+ join_unroller->inner_beneath_any_gather[k],
+ true);
+ }
+
+ return ujoin;
+}
+
+/*
+ * Free memory allocated for pgpa_join_unroller.
+ */
+void
+pgpa_destroy_join_unroller(pgpa_join_unroller *join_unroller)
+{
+ pfree(join_unroller->strategy);
+ pfree(join_unroller->inner_subplans);
+ pfree(join_unroller->inner_elided_nodes);
+	pfree(join_unroller->inner_unrollers);
+	pfree(join_unroller->inner_beneath_any_gather);
+ pfree(join_unroller);
+}
+
+/*
+ * Identify the join strategy used by a join and the "real" inner and outer
+ * plans.
+ *
+ * For example, a Hash Join always has a Hash node on the inner side, but
+ * for all intents and purposes the real inner input is the Hash node's child,
+ * not the Hash node itself.
+ *
+ * Likewise, a Merge Join may have a Sort node on the inner or outer side; if
+ * it does, the real input to the join is the Sort node's child, not the
+ * Sort node itself.
+ *
+ * In addition, with a Merge Join or a Nested Loop, the join planning code
+ * may add additional nodes such as Materialize or Memoize. We regard these
+ * as an aspect of the join strategy. As in the previous cases, the true input
+ * to the join is the underlying node.
+ *
+ * However, if any involved child node previously had a now-elided node stacked
+ * on top, then we can't "look through" that node -- indeed, what's going to be
+ * relevant for our purposes is the ElidedNode on top of that plan node, rather
+ * than the plan node itself.
+ *
+ * If there are multiple elided nodes, we want the one that would have been
+ * uppermost in the plan tree prior to setrefs processing; we expect to find
+ * that one last in the list of elided nodes.
+ *
+ * On return, *realouter and *realinner will have been set to the real outer
+ * and real inner plans that we identified, and *elidedrealouter and
+ * *elidedrealinner to the last of any corresponding elided nodes.
+ * Additionally, *found_any_outer_gather and *found_any_inner_gather will
+ * be set to true if we looked through a Gather or Gather Merge node on
+ * that side of the join, and false otherwise.
+ */
+static pgpa_join_strategy
+pgpa_decompose_join(pgpa_plan_walker_context *walker, Plan *plan,
+ Plan **realouter, Plan **realinner,
+ ElidedNode **elidedrealouter, ElidedNode **elidedrealinner,
+ bool *found_any_outer_gather, bool *found_any_inner_gather)
+{
+ PlannedStmt *pstmt = walker->pstmt;
+ JoinType jointype = ((Join *) plan)->jointype;
+ Plan *outerplan = plan->lefttree;
+ Plan *innerplan = plan->righttree;
+ ElidedNode *elidedouter;
+ ElidedNode *elidedinner;
+ pgpa_join_strategy strategy;
+ bool uniqueouter;
+ bool uniqueinner;
+
+ elidedouter = pgpa_last_elided_node(pstmt, outerplan);
+ elidedinner = pgpa_last_elided_node(pstmt, innerplan);
+ *found_any_outer_gather = false;
+ *found_any_inner_gather = false;
+
+ switch (nodeTag(plan))
+ {
+ case T_MergeJoin:
+
+ /*
+ * The planner may have chosen to place a Material node on the
+ * inner side of the MergeJoin; if this is present, we record it
+ * as part of the join strategy.
+ */
+ if (elidedinner == NULL && IsA(innerplan, Material))
+ {
+ elidedinner = pgpa_descend_node(pstmt, &innerplan);
+ strategy = JSTRAT_MERGE_JOIN_MATERIALIZE;
+ }
+ else
+ strategy = JSTRAT_MERGE_JOIN_PLAIN;
+
+ /*
+ * For a MergeJoin, either the outer or the inner subplan, or
+ * both, may have needed to be sorted; we must disregard any Sort
+ * or IncrementalSort node to find the real inner or outer
+ * subplan.
+ */
+ if (elidedouter == NULL && is_sorting_plan(outerplan))
+ elidedouter = pgpa_descend_node(pstmt, &outerplan);
+ if (elidedinner == NULL && is_sorting_plan(innerplan))
+ elidedinner = pgpa_descend_node(pstmt, &innerplan);
+ break;
+
+ case T_NestLoop:
+
+ /*
+ * The planner may have chosen to place a Material or Memoize node
+ * on the inner side of the NestLoop; if this is present, we
+ * record it as part of the join strategy.
+ */
+ if (elidedinner == NULL && IsA(innerplan, Material))
+ {
+ elidedinner = pgpa_descend_node(pstmt, &innerplan);
+ strategy = JSTRAT_NESTED_LOOP_MATERIALIZE;
+ }
+ else if (elidedinner == NULL && IsA(innerplan, Memoize))
+ {
+ elidedinner = pgpa_descend_node(pstmt, &innerplan);
+ strategy = JSTRAT_NESTED_LOOP_MEMOIZE;
+ }
+ else
+ strategy = JSTRAT_NESTED_LOOP_PLAIN;
+ break;
+
+ case T_HashJoin:
+
+ /*
+ * The inner subplan of a HashJoin is always a Hash node; the real
+ * inner subplan is the Hash node's child.
+ */
+ Assert(IsA(innerplan, Hash));
+ Assert(elidedinner == NULL);
+ elidedinner = pgpa_descend_node(pstmt, &innerplan);
+ strategy = JSTRAT_HASH_JOIN;
+ break;
+
+ default:
+ elog(ERROR, "unrecognized node type: %d", (int) nodeTag(plan));
+ }
+
+ /*
+ * The planner may have decided to implement a semijoin by first making
+ * the nullable side of the plan unique, and then performing a normal join
+ * against the result. Therefore, we might need to descend through a
+ * unique node on either side of the plan.
+ */
+ uniqueouter = pgpa_descend_any_unique(pstmt, &outerplan, &elidedouter);
+ uniqueinner = pgpa_descend_any_unique(pstmt, &innerplan, &elidedinner);
+
+ /*
+ * The planner may have decided to parallelize part of the join tree, so
+ * we could find a Gather or Gather Merge node here. Note that any such
+ * node will appear below nodes we considered as part of the join
+ * strategy, but we could still find another uniqueness-enforcing node
+ * below the Gather or Gather Merge.
+ */
+ if (elidedouter == NULL)
+ {
+ elidedouter = pgpa_descend_any_gather(pstmt, &outerplan,
+ found_any_outer_gather);
+ if (*found_any_outer_gather &&
+ pgpa_descend_any_unique(pstmt, &outerplan, &elidedouter))
+ uniqueouter = true;
+ }
+ if (elidedinner == NULL)
+ {
+ elidedinner = pgpa_descend_any_gather(pstmt, &innerplan,
+ found_any_inner_gather);
+ if (*found_any_inner_gather &&
+ pgpa_descend_any_unique(pstmt, &innerplan, &elidedinner))
+ uniqueinner = true;
+ }
+
+ /*
+ * It's possible that a Result node has been inserted either to project a
+ * target list or to implement a one-time filter. If so, we can descend
+ * through it. Note that a Result node without a child would be a
+ * degenerate scan or join, and not something we could descend through.
+ *
+ * XXX. I suspect it's possible for this to happen above the Gather or
+ * Gather Merge node, too, but apparently we have no test case for that
+ * scenario.
+ */
+ if (elidedouter == NULL && is_result_node_with_child(outerplan))
+ elidedouter = pgpa_descend_node(pstmt, &outerplan);
+ if (elidedinner == NULL && is_result_node_with_child(innerplan))
+ elidedinner = pgpa_descend_node(pstmt, &innerplan);
+
+ /*
+ * If this is a semijoin that was converted to an inner join by making one
+ * side or the other unique, make a note that the inner or outer subplan,
+ * as appropriate, should be treated as a query plan feature when the main
+ * tree traversal reaches it.
+ *
+ * Conversely, if the planner could have made one side of the join unique
+ * and thereby converted it to an inner join, and chose not to do so, that
+ * is also worth noting.
+ *
+ * NB: This code could appear slightly higher up in this function, but
+ * none of the nodes through which we just descended should have
+ * associated RTIs.
+ *
+ * NB: This seems like a somewhat hacky way of passing information up to
+ * the main tree walk, but I don't currently have a better idea.
+ */
+ if (uniqueouter)
+ pgpa_add_future_feature(walker, PGPAQF_SEMIJOIN_UNIQUE, outerplan);
+ else if (jointype == JOIN_RIGHT_SEMI)
+ pgpa_add_future_feature(walker, PGPAQF_SEMIJOIN_NON_UNIQUE, outerplan);
+ if (uniqueinner)
+ pgpa_add_future_feature(walker, PGPAQF_SEMIJOIN_UNIQUE, innerplan);
+ else if (jointype == JOIN_SEMI)
+ pgpa_add_future_feature(walker, PGPAQF_SEMIJOIN_NON_UNIQUE, innerplan);
+
+ /* Set output parameters. */
+ *realouter = outerplan;
+ *realinner = innerplan;
+ *elidedrealouter = elidedouter;
+ *elidedrealinner = elidedinner;
+ return strategy;
+}
+
+/*
+ * Descend through a Plan node in a join tree that the caller has determined
+ * to be irrelevant.
+ *
+ * Updates *plan, and returns the last of any elided nodes pertaining to the
+ * new plan node.
+ */
+static ElidedNode *
+pgpa_descend_node(PlannedStmt *pstmt, Plan **plan)
+{
+ *plan = (*plan)->lefttree;
+ return pgpa_last_elided_node(pstmt, *plan);
+}
+
+/*
+ * Descend through a Gather or Gather Merge node, if present, and any Sort
+ * or IncrementalSort node occurring under a Gather Merge.
+ *
+ * Caller should have verified that there is no ElidedNode pertaining to
+ * the initial value of *plan.
+ *
+ * Updates *plan, and returns the last of any elided nodes pertaining to the
+ * new plan node. Sets *found_any_gather = true if either Gather or
+ * Gather Merge was found, and otherwise leaves it unchanged.
+ */
+static ElidedNode *
+pgpa_descend_any_gather(PlannedStmt *pstmt, Plan **plan,
+ bool *found_any_gather)
+{
+ if (IsA(*plan, Gather))
+ {
+ *found_any_gather = true;
+ return pgpa_descend_node(pstmt, plan);
+ }
+
+ if (IsA(*plan, GatherMerge))
+ {
+ ElidedNode *elided = pgpa_descend_node(pstmt, plan);
+
+ if (elided == NULL && is_sorting_plan(*plan))
+ elided = pgpa_descend_node(pstmt, plan);
+
+ *found_any_gather = true;
+ return elided;
+ }
+
+ return NULL;
+}
+
+/*
+ * If *plan is an Agg or Unique node, we want to descend through it, unless
+ * it has a corresponding elided node. If its immediate child is a Sort or
+ * IncrementalSort, we also want to descend through that, unless it has a
+ * corresponding elided node.
+ *
+ * On entry, *elided_node must be the last of any elided nodes corresponding
+ * to *plan; on exit, this will still be true, but *plan may have been updated.
+ *
+ * The reason we don't want to descend through elided nodes is that a single
+ * join tree can't cross through any sort of elided node: subqueries are
+ * planned separately, and planning inside an Append or MergeAppend is
+ * separate from planning outside of it.
+ *
+ * The return value is true if we descend through a node that we believe is
+ * making one side of a semijoin unique, and otherwise false.
+ */
+static bool
+pgpa_descend_any_unique(PlannedStmt *pstmt, Plan **plan,
+ ElidedNode **elided_node)
+{
+ bool descend = false;
+ bool sjunique = false;
+
+ if (*elided_node != NULL)
+ return sjunique;
+
+ if (IsA(*plan, Unique))
+ {
+ descend = true;
+ sjunique = true;
+ }
+ else if (IsA(*plan, Agg))
+ {
+ /*
+ * If this is a simple Agg node, then assume it's here to implement
+ * semijoin uniqueness. Otherwise, assume it's completing an eager
+ * aggregation or partitionwise aggregation operation that began at a
+ * higher level of the plan tree.
+ *
+ * XXX. I suspect this logic does not cover all cases: couldn't SJ
+ * uniqueness be implemented in two steps with an intermediate Gather?
+ */
+ descend = true;
+ sjunique = (((Agg *) *plan)->aggsplit == AGGSPLIT_SIMPLE);
+ }
+
+ if (descend)
+ {
+ *elided_node = pgpa_descend_node(pstmt, plan);
+
+ if (*elided_node == NULL && is_sorting_plan(*plan))
+ *elided_node = pgpa_descend_node(pstmt, plan);
+ }
+
+ return sjunique;
+}
+
+/*
+ * Is this a Result node that has a child?
+ */
+static bool
+is_result_node_with_child(Plan *plan)
+{
+ return IsA(plan, Result) && plan->lefttree != NULL;
+}
+
+/*
+ * Is this a Plan node whose purpose is to put the data in a certain order?
+ */
+static bool
+is_sorting_plan(Plan *plan)
+{
+ return IsA(plan, Sort) || IsA(plan, IncrementalSort);
+}
diff --git a/contrib/pg_plan_advice/pgpa_join.h b/contrib/pg_plan_advice/pgpa_join.h
new file mode 100644
index 00000000000..4dc72986a70
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_join.h
@@ -0,0 +1,105 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgpa_join.h
+ * analysis of joins in Plan trees
+ *
+ * Copyright (c) 2016-2025, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_join.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PGPA_JOIN_H
+#define PGPA_JOIN_H
+
+#include "nodes/plannodes.h"
+
+typedef struct pgpa_plan_walker_context pgpa_plan_walker_context;
+typedef struct pgpa_join_unroller pgpa_join_unroller;
+typedef struct pgpa_unrolled_join pgpa_unrolled_join;
+
+/*
+ * Although there are three main join strategies, we try to classify things
+ * more precisely here: merge joins have the option of using materialization
+ * on the inner side, and nested loops can use either materialization or
+ * memoization.
+ */
+typedef enum
+{
+ JSTRAT_MERGE_JOIN_PLAIN = 0,
+ JSTRAT_MERGE_JOIN_MATERIALIZE,
+ JSTRAT_NESTED_LOOP_PLAIN,
+ JSTRAT_NESTED_LOOP_MATERIALIZE,
+ JSTRAT_NESTED_LOOP_MEMOIZE,
+ JSTRAT_HASH_JOIN
+ /* update NUM_PGPA_JOIN_STRATEGY if you add anything here */
+} pgpa_join_strategy;
+
+#define NUM_PGPA_JOIN_STRATEGY ((int) JSTRAT_HASH_JOIN + 1)
+
+/*
+ * In an outer-deep join tree, every member of an unrolled join will be a scan,
+ * but join trees with other shapes can contain unrolled joins.
+ *
+ * The plan node we store here will be the inner or outer child of the join
+ * node, as appropriate, except that we look through subnodes that we regard as
+ * part of the join method itself. For instance, for a Nested Loop that
+ * materializes the inner input, we'll store the child of the Materialize node,
+ * not the Materialize node itself.
+ *
+ * If setrefs processing elided one or more nodes from the plan tree, then
+ * we'll store details about the topmost of those in elided_node; otherwise,
+ * it will be NULL.
+ *
+ * Exactly one of scan and unrolled_join will be non-NULL.
+ */
+typedef struct
+{
+ Plan *plan;
+ ElidedNode *elided_node;
+ struct pgpa_scan *scan;
+ pgpa_unrolled_join *unrolled_join;
+} pgpa_join_member;
+
+/*
+ * We convert outer-deep join trees to a flat structure; that is, ((A JOIN B)
+ * JOIN C) JOIN D gets converted to outer = A, inner = <B, C, D>. When joins
+ * aren't outer-deep, substructure is required, e.g. (A JOIN B) JOIN (C JOIN D)
+ * is represented as outer = A, inner = <B, X>, where X is a pgpa_unrolled_join
+ * covering C-D.
+ */
+struct pgpa_unrolled_join
+{
+ /* Outermost member; must not itself be an unrolled join. */
+ pgpa_join_member outer;
+
+ /* Number of inner members. Length of the strategy and inner arrays. */
+ unsigned ninner;
+
+ /* Array of strategies, one per non-outermost member. */
+ pgpa_join_strategy *strategy;
+
+ /* Array of members, excluding the outermost. Deepest first. */
+ pgpa_join_member *inner;
+};
+
+/*
+ * Does this plan node inherit from Join?
+ */
+static inline bool
+pgpa_is_join(Plan *plan)
+{
+ return IsA(plan, NestLoop) || IsA(plan, MergeJoin) || IsA(plan, HashJoin);
+}
+
+extern pgpa_join_unroller *pgpa_create_join_unroller(void);
+extern void pgpa_unroll_join(pgpa_plan_walker_context *walker,
+ Plan *plan, bool beneath_any_gather,
+ pgpa_join_unroller *join_unroller,
+ pgpa_join_unroller **outer_join_unroller,
+ pgpa_join_unroller **inner_join_unroller);
+extern pgpa_unrolled_join *pgpa_build_unrolled_join(pgpa_plan_walker_context *walker,
+ pgpa_join_unroller *join_unroller);
+extern void pgpa_destroy_join_unroller(pgpa_join_unroller *join_unroller);
+
+#endif
diff --git a/contrib/pg_plan_advice/pgpa_output.c b/contrib/pg_plan_advice/pgpa_output.c
new file mode 100644
index 00000000000..67647acdf5a
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_output.c
@@ -0,0 +1,571 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgpa_output.c
+ * produce textual output from the results of a plan tree walk
+ *
+ * Copyright (c) 2016-2025, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_output.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "pgpa_output.h"
+#include "pgpa_scan.h"
+
+#include "nodes/parsenodes.h"
+#include "parser/parsetree.h"
+#include "utils/builtins.h"
+#include "utils/lsyscache.h"
+
+/*
+ * Context object for textual advice generation.
+ *
+ * rid_strings holds the string forms of the caller-provided range table
+ * identifiers, indexed by RTI - 1. See the comments at the top of
+ * pgpa_identifier.c for more details.
+ *
+ * buf is the caller-provided output buffer.
+ *
+ * wrap_column is the wrap column, so that we don't create output that is
+ * too wide. See pgpa_maybe_linebreak() and comments in pgpa_output_advice.
+ */
+typedef struct pgpa_output_context
+{
+ const char **rid_strings;
+ StringInfo buf;
+ int wrap_column;
+} pgpa_output_context;
+
+static void pgpa_output_unrolled_join(pgpa_output_context *context,
+ pgpa_unrolled_join *join);
+static void pgpa_output_join_member(pgpa_output_context *context,
+ pgpa_join_member *member);
+static void pgpa_output_scan_strategy(pgpa_output_context *context,
+ pgpa_scan_strategy strategy,
+ List *scans);
+static void pgpa_output_relation_name(pgpa_output_context *context, Oid relid);
+static void pgpa_output_query_feature(pgpa_output_context *context,
+ pgpa_qf_type type,
+ List *query_features);
+static void pgpa_output_simple_strategy(pgpa_output_context *context,
+ char *strategy,
+ List *relid_sets);
+static void pgpa_output_no_gather(pgpa_output_context *context,
+ Bitmapset *relids);
+static void pgpa_output_relations(pgpa_output_context *context, StringInfo buf,
+ Bitmapset *relids);
+
+static char *pgpa_cstring_join_strategy(pgpa_join_strategy strategy);
+static char *pgpa_cstring_scan_strategy(pgpa_scan_strategy strategy);
+static char *pgpa_cstring_query_feature_type(pgpa_qf_type type);
+
+static void pgpa_maybe_linebreak(StringInfo buf, int wrap_column);
+
+/*
+ * Append query advice to the provided buffer.
+ *
+ * Before calling this function, 'walker' must be used to iterate over the
+ * main plan tree and all subplans from the PlannedStmt.
+ *
+ * 'rt_identifiers' is a table of unique identifiers, one for each RTI.
+ * See pgpa_create_identifiers_for_planned_stmt().
+ *
+ * Results will be appended to 'buf'.
+ */
+void
+pgpa_output_advice(StringInfo buf, pgpa_plan_walker_context *walker,
+ pgpa_identifier *rt_identifiers)
+{
+ Index rtable_length = list_length(walker->pstmt->rtable);
+ ListCell *lc;
+ pgpa_output_context context;
+
+ /* Basic initialization. */
+ memset(&context, 0, sizeof(pgpa_output_context));
+ context.buf = buf;
+
+ /*
+ * Convert identifiers to string form. Note that the loop variable here is
+ * not an RTI, because RTIs are 1-based. Some RTIs will have no
+ * identifier, either because the rtekind is RTE_JOIN or because that
+ * portion of the query didn't make it into the final plan.
+ */
+ context.rid_strings = palloc0_array(const char *, rtable_length);
+ for (int i = 0; i < rtable_length; ++i)
+ if (rt_identifiers[i].alias_name != NULL)
+ context.rid_strings[i] = pgpa_identifier_string(&rt_identifiers[i]);
+
+ /*
+ * If the user chooses to use EXPLAIN (PLAN_ADVICE) in an 80-column window
+ * from a psql client with default settings, psql will add one space to
+ * the left of the output and EXPLAIN will add two more to the left of the
+ * advice. Thus, lines of more than 77 characters will wrap. We set the
+ * wrap limit to 76 here so that the output won't reach all the way to the
+ * very last column of the terminal.
+ *
+ * Of course, this is a fairly arbitrary set of assumptions, and one could
+ * well make an argument for a different wrap limit, or for a configurable
+ * one.
+ */
+ context.wrap_column = 76;
+
+ /*
+ * Each piece of JOIN_ORDER() advice fully describes the join order for a
+ * single unrolled join. Merging is not permitted, because that would
+ * change the meaning, e.g. SEQ_SCAN(a b c d) means simply that sequential
+ * scans should be used for all of those relations, and is thus equivalent
+ * to SEQ_SCAN(a b) SEQ_SCAN(c d), but JOIN_ORDER(a b c d) means that "a"
+ * is the driving table which is then joined to "b" then "c" then "d",
+ * which is totally different from JOIN_ORDER(a b) and JOIN_ORDER(c d).
+ */
+ foreach(lc, walker->toplevel_unrolled_joins)
+ {
+ pgpa_unrolled_join *ujoin = lfirst(lc);
+
+ if (buf->len > 0)
+ appendStringInfoChar(buf, '\n');
+ appendStringInfo(context.buf, "JOIN_ORDER(");
+ pgpa_output_unrolled_join(&context, ujoin);
+ appendStringInfoChar(context.buf, ')');
+ pgpa_maybe_linebreak(context.buf, context.wrap_column);
+ }
+
+ /* Emit join strategy advice. */
+ for (int s = 0; s < NUM_PGPA_JOIN_STRATEGY; ++s)
+ {
+ char *strategy = pgpa_cstring_join_strategy(s);
+
+ pgpa_output_simple_strategy(&context,
+ strategy,
+ walker->join_strategies[s]);
+ }
+
+ /*
+ * Emit scan strategy advice (but not for ordinary scans, which are
+ * definitionally uninteresting).
+ */
+ for (int c = 0; c < NUM_PGPA_SCAN_STRATEGY; ++c)
+ if (c != PGPA_SCAN_ORDINARY)
+ pgpa_output_scan_strategy(&context, c, walker->scans[c]);
+
+ /* Emit query feature advice. */
+ for (int t = 0; t < NUM_PGPA_QF_TYPES; ++t)
+ pgpa_output_query_feature(&context, t, walker->query_features[t]);
+
+ /* Emit NO_GATHER advice. */
+ pgpa_output_no_gather(&context, walker->no_gather_scans);
+}
+
+/*
+ * Output the members of an unrolled join, first the outermost member, and
+ * then the inner members one by one, as part of JOIN_ORDER() advice.
+ */
+static void
+pgpa_output_unrolled_join(pgpa_output_context *context,
+ pgpa_unrolled_join *join)
+{
+ pgpa_output_join_member(context, &join->outer);
+
+ for (int k = 0; k < join->ninner; ++k)
+ {
+ pgpa_join_member *member = &join->inner[k];
+
+ pgpa_maybe_linebreak(context->buf, context->wrap_column);
+ appendStringInfoChar(context->buf, ' ');
+ pgpa_output_join_member(context, member);
+ }
+}
+
+/*
+ * Output a single member of an unrolled join as part of JOIN_ORDER() advice.
+ */
+static void
+pgpa_output_join_member(pgpa_output_context *context,
+ pgpa_join_member *member)
+{
+ if (member->unrolled_join != NULL)
+ {
+ appendStringInfoChar(context->buf, '(');
+ pgpa_output_unrolled_join(context, member->unrolled_join);
+ appendStringInfoChar(context->buf, ')');
+ }
+ else
+ {
+ pgpa_scan *scan = member->scan;
+
+ Assert(scan != NULL);
+ if (bms_membership(scan->relids) == BMS_SINGLETON)
+ pgpa_output_relations(context, context->buf, scan->relids);
+ else
+ {
+ appendStringInfoChar(context->buf, '{');
+ pgpa_output_relations(context, context->buf, scan->relids);
+ appendStringInfoChar(context->buf, '}');
+ }
+ }
+}
+
+/*
+ * Output advice for a List of pgpa_scan objects.
+ *
+ * All the scans must use the strategy specified by the "strategy" argument.
+ */
+static void
+pgpa_output_scan_strategy(pgpa_output_context *context,
+ pgpa_scan_strategy strategy,
+ List *scans)
+{
+ bool first = true;
+
+ if (scans == NIL)
+ return;
+
+ if (context->buf->len > 0)
+ appendStringInfoChar(context->buf, '\n');
+ appendStringInfo(context->buf, "%s(",
+ pgpa_cstring_scan_strategy(strategy));
+
+ foreach_ptr(pgpa_scan, scan, scans)
+ {
+ Plan *plan = scan->plan;
+
+ if (first)
+ first = false;
+ else
+ {
+ pgpa_maybe_linebreak(context->buf, context->wrap_column);
+ appendStringInfoChar(context->buf, ' ');
+ }
+
+ /* Output the relation identifiers. */
+ if (bms_membership(scan->relids) == BMS_SINGLETON)
+ pgpa_output_relations(context, context->buf, scan->relids);
+ else
+ {
+ appendStringInfoChar(context->buf, '(');
+ pgpa_output_relations(context, context->buf, scan->relids);
+ appendStringInfoChar(context->buf, ')');
+ }
+
+ /* For index or index-only scans, output index information. */
+ if (strategy == PGPA_SCAN_INDEX)
+ {
+ Assert(IsA(plan, IndexScan));
+ pgpa_maybe_linebreak(context->buf, context->wrap_column);
+ appendStringInfoChar(context->buf, ' ');
+ pgpa_output_relation_name(context, ((IndexScan *) plan)->indexid);
+ }
+ else if (strategy == PGPA_SCAN_INDEX_ONLY)
+ {
+ Assert(IsA(plan, IndexOnlyScan));
+ pgpa_maybe_linebreak(context->buf, context->wrap_column);
+ appendStringInfoChar(context->buf, ' ');
+ pgpa_output_relation_name(context,
+ ((IndexOnlyScan *) plan)->indexid);
+ }
+ }
+
+ appendStringInfoChar(context->buf, ')');
+ pgpa_maybe_linebreak(context->buf, context->wrap_column);
+}
+
+/*
+ * Output a schema-qualified relation name.
+ */
+static void
+pgpa_output_relation_name(pgpa_output_context *context, Oid relid)
+{
+ Oid nspoid = get_rel_namespace(relid);
+ char *relnamespace = get_namespace_name_or_temp(nspoid);
+ char *relname = get_rel_name(relid);
+
+ appendStringInfoString(context->buf, quote_identifier(relnamespace));
+ appendStringInfoChar(context->buf, '.');
+ appendStringInfoString(context->buf, quote_identifier(relname));
+}
+
+/*
+ * Output advice for a List of pgpa_query_feature objects.
+ *
+ * All features must be of the type specified by the "type" argument.
+ */
+static void
+pgpa_output_query_feature(pgpa_output_context *context, pgpa_qf_type type,
+ List *query_features)
+{
+ bool first = true;
+
+ if (query_features == NIL)
+ return;
+
+ if (context->buf->len > 0)
+ appendStringInfoChar(context->buf, '\n');
+ appendStringInfo(context->buf, "%s(",
+ pgpa_cstring_query_feature_type(type));
+
+ foreach_ptr(pgpa_query_feature, qf, query_features)
+ {
+ if (first)
+ first = false;
+ else
+ {
+ pgpa_maybe_linebreak(context->buf, context->wrap_column);
+ appendStringInfoChar(context->buf, ' ');
+ }
+
+ if (bms_membership(qf->relids) == BMS_SINGLETON)
+ pgpa_output_relations(context, context->buf, qf->relids);
+ else
+ {
+ appendStringInfoChar(context->buf, '(');
+ pgpa_output_relations(context, context->buf, qf->relids);
+ appendStringInfoChar(context->buf, ')');
+ }
+ }
+
+ appendStringInfoChar(context->buf, ')');
+ pgpa_maybe_linebreak(context->buf, context->wrap_column);
+}
+
+/*
+ * Output "simple" advice for a List of Bitmapset objects each of which
+ * contains one or more RTIs.
+ *
+ * By simple, we just mean that the advice emitted follows the most
+ * straightforward pattern: the strategy name, followed by a list of items
+ * separated by spaces and surrounded by parentheses. Individual items in
+ * the list are a single relation identifier for a Bitmapset that contains
+ * just one member, or a sub-list again separated by spaces and surrounded
+ * by parentheses for a Bitmapset with multiple members. Bitmapsets with
+ * no members probably shouldn't occur here, but if they do they'll be
+ * rendered as an empty sub-list.
+ */
+static void
+pgpa_output_simple_strategy(pgpa_output_context *context, char *strategy,
+ List *relid_sets)
+{
+ bool first = true;
+
+ if (relid_sets == NIL)
+ return;
+
+ if (context->buf->len > 0)
+ appendStringInfoChar(context->buf, '\n');
+ appendStringInfo(context->buf, "%s(", strategy);
+
+ foreach_node(Bitmapset, relids, relid_sets)
+ {
+ if (first)
+ first = false;
+ else
+ {
+ pgpa_maybe_linebreak(context->buf, context->wrap_column);
+ appendStringInfoChar(context->buf, ' ');
+ }
+
+ if (bms_membership(relids) == BMS_SINGLETON)
+ pgpa_output_relations(context, context->buf, relids);
+ else
+ {
+ appendStringInfoChar(context->buf, '(');
+ pgpa_output_relations(context, context->buf, relids);
+ appendStringInfoChar(context->buf, ')');
+ }
+ }
+
+ appendStringInfoChar(context->buf, ')');
+ pgpa_maybe_linebreak(context->buf, context->wrap_column);
+}
+
+/*
+ * Output NO_GATHER advice for all relations not appearing beneath any
+ * Gather or Gather Merge node.
+ */
+static void
+pgpa_output_no_gather(pgpa_output_context *context, Bitmapset *relids)
+{
+ if (relids == NULL)
+ return;
+ if (context->buf->len > 0)
+ appendStringInfoChar(context->buf, '\n');
+ appendStringInfoString(context->buf, "NO_GATHER(");
+ pgpa_output_relations(context, context->buf, relids);
+ appendStringInfoChar(context->buf, ')');
+}
+
+/*
+ * Output the identifiers for each RTI in the provided set.
+ *
+ * Identifiers are separated by spaces, and a line break is possible after
+ * each one.
+ */
+static void
+pgpa_output_relations(pgpa_output_context *context, StringInfo buf,
+ Bitmapset *relids)
+{
+ int rti = -1;
+ bool first = true;
+
+ while ((rti = bms_next_member(relids, rti)) >= 0)
+ {
+ const char *rid_string = context->rid_strings[rti - 1];
+
+ if (rid_string == NULL)
+ elog(ERROR, "no identifier for RTI %d", rti);
+
+ if (first)
+ {
+ first = false;
+ appendStringInfoString(buf, rid_string);
+ }
+ else
+ {
+ pgpa_maybe_linebreak(buf, context->wrap_column);
+ appendStringInfo(buf, " %s", rid_string);
+ }
+ }
+}
+
+/*
+ * Get a C string that corresponds to the specified join strategy.
+ */
+static char *
+pgpa_cstring_join_strategy(pgpa_join_strategy strategy)
+{
+ switch (strategy)
+ {
+ case JSTRAT_MERGE_JOIN_PLAIN:
+ return "MERGE_JOIN_PLAIN";
+ case JSTRAT_MERGE_JOIN_MATERIALIZE:
+ return "MERGE_JOIN_MATERIALIZE";
+ case JSTRAT_NESTED_LOOP_PLAIN:
+ return "NESTED_LOOP_PLAIN";
+ case JSTRAT_NESTED_LOOP_MATERIALIZE:
+ return "NESTED_LOOP_MATERIALIZE";
+ case JSTRAT_NESTED_LOOP_MEMOIZE:
+ return "NESTED_LOOP_MEMOIZE";
+ case JSTRAT_HASH_JOIN:
+ return "HASH_JOIN";
+ }
+
+ pg_unreachable();
+ return NULL;
+}
+
+/*
+ * Get a C string that corresponds to the specified scan strategy.
+ */
+static char *
+pgpa_cstring_scan_strategy(pgpa_scan_strategy strategy)
+{
+ switch (strategy)
+ {
+ case PGPA_SCAN_ORDINARY:
+ return "ORDINARY_SCAN";
+ case PGPA_SCAN_SEQ:
+ return "SEQ_SCAN";
+ case PGPA_SCAN_BITMAP_HEAP:
+ return "BITMAP_HEAP_SCAN";
+ case PGPA_SCAN_FOREIGN:
+ return "FOREIGN_JOIN";
+ case PGPA_SCAN_INDEX:
+ return "INDEX_SCAN";
+ case PGPA_SCAN_INDEX_ONLY:
+ return "INDEX_ONLY_SCAN";
+ case PGPA_SCAN_PARTITIONWISE:
+ return "PARTITIONWISE";
+ case PGPA_SCAN_TID:
+ return "TID_SCAN";
+ }
+
+ pg_unreachable();
+ return NULL;
+}
+
+/*
+ * Get a C string that corresponds to the specified query feature type.
+ */
+static char *
+pgpa_cstring_query_feature_type(pgpa_qf_type type)
+{
+ switch (type)
+ {
+ case PGPAQF_GATHER:
+ return "GATHER";
+ case PGPAQF_GATHER_MERGE:
+ return "GATHER_MERGE";
+ case PGPAQF_SEMIJOIN_NON_UNIQUE:
+ return "SEMIJOIN_NON_UNIQUE";
+ case PGPAQF_SEMIJOIN_UNIQUE:
+ return "SEMIJOIN_UNIQUE";
+ }
+
+ pg_unreachable();
+ return NULL;
+}
+
+/*
+ * Insert a line break into the StringInfoData, if needed.
+ *
+ * If wrap_column is zero or negative, this does nothing. Otherwise, we
+ * consider inserting a newline. We only insert a newline if the length of
+ * the last line in the buffer exceeds wrap_column, and not if we'd be
+ * inserting a newline at or before the beginning of the current line.
+ *
+ * The position at which the newline is inserted is simply wherever the
+ * buffer ended the last time this function was called. In other words,
+ * the caller is expected to call this function every time we reach a good
+ * place for a line break.
+ */
+static void
+pgpa_maybe_linebreak(StringInfo buf, int wrap_column)
+{
+ char *trailing_nl;
+ int line_start;
+ int save_cursor;
+
+ /* If line wrapping is disabled, exit quickly. */
+ if (wrap_column <= 0)
+ return;
+
+ /*
+ * Set line_start to the byte offset within buf->data of the first
+ * character of the current line, where the current line means the last
+ * one in the buffer. Note that line_start could be the offset of the
+ * trailing '\0' if the last character in the buffer is a line break.
+ */
+ trailing_nl = strrchr(buf->data, '\n');
+ if (trailing_nl == NULL)
+ line_start = 0;
+ else
+ line_start = (trailing_nl - buf->data) + 1;
+
+ /*
+ * Remember that the current end of the buffer is a potential location to
+ * insert a line break on a future call to this function.
+ */
+ save_cursor = buf->cursor;
+ buf->cursor = buf->len;
+
+ /* If we haven't passed the wrap column, we don't need a newline. */
+ if (buf->len - line_start <= wrap_column)
+ return;
+
+ /*
+ * It only makes sense to insert a newline at a position later than the
+ * beginning of the current line.
+ */
+ if (save_cursor <= line_start)
+ return;
+
+ /* Insert a newline at the previous cursor location. */
+ enlargeStringInfo(buf, 1);
+ memmove(&buf->data[save_cursor] + 1, &buf->data[save_cursor],
+ buf->len - save_cursor);
+ ++buf->cursor;
+ buf->data[++buf->len] = '\0';
+ buf->data[save_cursor] = '\n';
+}
diff --git a/contrib/pg_plan_advice/pgpa_output.h b/contrib/pg_plan_advice/pgpa_output.h
new file mode 100644
index 00000000000..47496d76f52
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_output.h
@@ -0,0 +1,22 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgpa_output.h
+ * produce textual output from the results of a plan tree walk
+ *
+ * Copyright (c) 2016-2025, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_output.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PGPA_OUTPUT_H
+#define PGPA_OUTPUT_H
+
+#include "pgpa_identifier.h"
+#include "pgpa_walker.h"
+
+extern void pgpa_output_advice(StringInfo buf,
+ pgpa_plan_walker_context *walker,
+ pgpa_identifier *rt_identifiers);
+
+#endif
diff --git a/contrib/pg_plan_advice/pgpa_parser.y b/contrib/pg_plan_advice/pgpa_parser.y
new file mode 100644
index 00000000000..4c3a3ed6db9
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_parser.y
@@ -0,0 +1,301 @@
+%{
+/*
+ * Parser for plan advice
+ *
+ * Copyright (c) 2000-2025, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_parser.y
+ */
+
+#include "postgres.h"
+
+#include <ctype.h>
+#include <limits.h>
+
+#include "fmgr.h"
+#include "nodes/miscnodes.h"
+#include "utils/builtins.h"
+#include "utils/float.h"
+
+#include "pgpa_ast.h"
+#include "pgpa_parser.h"
+
+/*
+ * Bison doesn't allocate anything that needs to live across parser calls,
+ * so we can easily have it use palloc instead of malloc. This prevents
+ * memory leaks if we error out during parsing.
+ */
+#define YYMALLOC palloc
+#define YYFREE pfree
+%}
+
+/* BISON Declarations */
+%parse-param {List **result}
+%parse-param {char **parse_error_msg_p}
+%parse-param {yyscan_t yyscanner}
+%lex-param {List **result}
+%lex-param {char **parse_error_msg_p}
+%lex-param {yyscan_t yyscanner}
+%pure-parser
+%expect 0
+%name-prefix="pgpa_yy"
+
+%union
+{
+ char *str;
+ int integer;
+ List *list;
+ pgpa_advice_item *item;
+ pgpa_advice_target *target;
+ pgpa_index_target *itarget;
+}
+%token <str> TOK_IDENT TOK_TAG_JOIN_ORDER TOK_TAG_INDEX
+%token <str> TOK_TAG_SIMPLE TOK_TAG_GENERIC
+%token <integer> TOK_INTEGER
+
+%type <integer> opt_ri_occurrence
+%type <item> advice_item
+%type <list> advice_item_list generic_target_list
+%type <list> index_target_list join_order_target_list
+%type <list> opt_partition simple_target_list
+%type <str> identifier opt_plan_name
+%type <target> generic_sublist join_order_sublist
+%type <target> relation_identifier
+%type <itarget> index_name
+
+%start parse_toplevel
+
+/* Grammar follows */
+%%
+
+parse_toplevel: advice_item_list
+ {
+ (void) yynerrs; /* suppress compiler warning */
+ *result = $1;
+ }
+ ;
+
+advice_item_list: advice_item_list advice_item
+ { $$ = lappend($1, $2); }
+ |
+ { $$ = NIL; }
+ ;
+
+advice_item: TOK_TAG_JOIN_ORDER '(' join_order_target_list ')'
+ {
+ $$ = palloc0_object(pgpa_advice_item);
+ $$->tag = PGPA_TAG_JOIN_ORDER;
+ $$->targets = $3;
+ if ($3 == NIL)
+ pgpa_yyerror(result, parse_error_msg_p, yyscanner,
+ "JOIN_ORDER must have at least one target");
+ }
+ | TOK_TAG_INDEX '(' index_target_list ')'
+ {
+ $$ = palloc0_object(pgpa_advice_item);
+ if (strcmp($1, "index_only_scan") == 0)
+ $$->tag = PGPA_TAG_INDEX_ONLY_SCAN;
+ else if (strcmp($1, "index_scan") == 0)
+ $$->tag = PGPA_TAG_INDEX_SCAN;
+ else
+ elog(ERROR, "tag parsing failed: %s", $1);
+ $$->targets = $3;
+ }
+ | TOK_TAG_SIMPLE '(' simple_target_list ')'
+ {
+ $$ = palloc0_object(pgpa_advice_item);
+ if (strcmp($1, "bitmap_heap_scan") == 0)
+ $$->tag = PGPA_TAG_BITMAP_HEAP_SCAN;
+ else if (strcmp($1, "no_gather") == 0)
+ $$->tag = PGPA_TAG_NO_GATHER;
+ else if (strcmp($1, "seq_scan") == 0)
+ $$->tag = PGPA_TAG_SEQ_SCAN;
+ else if (strcmp($1, "tid_scan") == 0)
+ $$->tag = PGPA_TAG_TID_SCAN;
+ else
+ elog(ERROR, "tag parsing failed: %s", $1);
+ $$->targets = $3;
+ }
+ | TOK_TAG_GENERIC '(' generic_target_list ')'
+ {
+ bool fail;
+
+ $$ = palloc0_object(pgpa_advice_item);
+ $$->tag = pgpa_parse_advice_tag($1, &fail);
+ if (fail)
+ {
+ pgpa_yyerror(result, parse_error_msg_p, yyscanner,
+ "unrecognized advice tag");
+ }
+
+ if ($$->tag == PGPA_TAG_FOREIGN_JOIN)
+ {
+ foreach_ptr(pgpa_advice_target, target, $3)
+ {
+ if (target->ttype == PGPA_TARGET_IDENTIFIER ||
+ list_length(target->children) == 1)
+ pgpa_yyerror(result, parse_error_msg_p, yyscanner,
+ "FOREIGN_JOIN targets must contain more than one relation identifier");
+ }
+ }
+
+ $$->targets = $3;
+ }
+ ;
+
+relation_identifier: identifier opt_ri_occurrence opt_partition opt_plan_name
+ {
+ $$ = palloc0_object(pgpa_advice_target);
+ $$->ttype = PGPA_TARGET_IDENTIFIER;
+ $$->rid.alias_name = $1;
+ $$->rid.occurrence = $2;
+ if (list_length($3) == 2)
+ {
+ $$->rid.partnsp = linitial($3);
+ $$->rid.partrel = lsecond($3);
+ }
+ else if ($3 != NIL)
+ $$->rid.partrel = linitial($3);
+ $$->rid.plan_name = $4;
+ }
+ ;
+
+index_name: identifier
+ {
+ $$ = palloc0_object(pgpa_index_target);
+ $$->indname = $1;
+ }
+ | identifier '.' identifier
+ {
+ $$ = palloc0_object(pgpa_index_target);
+ $$->indnamespace = $1;
+ $$->indname = $3;
+ }
+ ;
+
+opt_ri_occurrence:
+ '#' TOK_INTEGER
+ {
+ if ($2 <= 0)
+ pgpa_yyerror(result, parse_error_msg_p, yyscanner,
+ "only positive occurrence numbers are permitted");
+ $$ = $2;
+ }
+ |
+ {
+ /* The default occurrence number is 1. */
+ $$ = 1;
+ }
+ ;
+
+identifier: TOK_IDENT
+ | TOK_TAG_JOIN_ORDER
+ | TOK_TAG_INDEX
+ | TOK_TAG_SIMPLE
+ | TOK_TAG_GENERIC
+ ;
+
+/*
+ * When generating advice, we always schema-qualify the partition name, but
+ * when parsing advice, we accept a specification that lacks one.
+ */
+opt_partition:
+ '/' TOK_IDENT '.' TOK_IDENT
+ { $$ = list_make2($2, $4); }
+ | '/' TOK_IDENT
+ { $$ = list_make1($2); }
+ |
+ { $$ = NIL; }
+ ;
+
+opt_plan_name:
+ '@' TOK_IDENT
+ { $$ = $2; }
+ |
+ { $$ = NULL; }
+ ;
+
+generic_target_list: generic_target_list relation_identifier
+ { $$ = lappend($1, $2); }
+ | generic_target_list generic_sublist
+ { $$ = lappend($1, $2); }
+ |
+ { $$ = NIL; }
+ ;
+
+generic_sublist: '(' simple_target_list ')'
+ {
+ $$ = palloc0_object(pgpa_advice_target);
+ $$->ttype = PGPA_TARGET_ORDERED_LIST;
+ $$->children = $2;
+ }
+ ;
+
+index_target_list:
+ index_target_list relation_identifier index_name
+ {
+ $2->itarget = $3;
+ $$ = lappend($1, $2);
+ }
+ |
+ { $$ = NIL; }
+ ;
+
+join_order_target_list: join_order_target_list relation_identifier
+ { $$ = lappend($1, $2); }
+ | join_order_target_list join_order_sublist
+ { $$ = lappend($1, $2); }
+ |
+ { $$ = NIL; }
+ ;
+
+join_order_sublist:
+ '(' join_order_target_list ')'
+ {
+ $$ = palloc0_object(pgpa_advice_target);
+ $$->ttype = PGPA_TARGET_ORDERED_LIST;
+ $$->children = $2;
+ }
+ | '{' simple_target_list '}'
+ {
+ $$ = palloc0_object(pgpa_advice_target);
+ $$->ttype = PGPA_TARGET_UNORDERED_LIST;
+ $$->children = $2;
+ }
+ ;
+
+simple_target_list: simple_target_list relation_identifier
+ { $$ = lappend($1, $2); }
+ |
+ { $$ = NIL; }
+ ;
+
+%%
+
+/*
+ * Parse an advice_string and return the resulting list of pgpa_advice_item
+ * objects. If a parse error occurs, instead return NULL.
+ *
+ * If the return value is NULL, *error_p will be set to the error message;
+ * otherwise, *error_p will be set to NULL.
+ */
+List *
+pgpa_parse(const char *advice_string, char **error_p)
+{
+ yyscan_t scanner;
+ List *result;
+ char *error = NULL;
+
+ pgpa_scanner_init(advice_string, &scanner);
+ pgpa_yyparse(&result, &error, scanner);
+ pgpa_scanner_finish(scanner);
+
+ if (error != NULL)
+ {
+ *error_p = error;
+ return NULL;
+ }
+
+ *error_p = NULL;
+ return result;
+}
diff --git a/contrib/pg_plan_advice/pgpa_planner.c b/contrib/pg_plan_advice/pgpa_planner.c
new file mode 100644
index 00000000000..1a14ff9fd4b
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_planner.c
@@ -0,0 +1,2140 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgpa_planner.c
+ * planner hooks
+ *
+ * Copyright (c) 2016-2025, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_planner.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "pg_plan_advice.h"
+#include "pgpa_collector.h"
+#include "pgpa_identifier.h"
+#include "pgpa_output.h"
+#include "pgpa_planner.h"
+#include "pgpa_trove.h"
+#include "pgpa_walker.h"
+
+#include "commands/defrem.h"
+#include "common/hashfn_unstable.h"
+#include "nodes/makefuncs.h"
+#include "optimizer/extendplan.h"
+#include "optimizer/pathnode.h"
+#include "optimizer/paths.h"
+#include "optimizer/plancat.h"
+#include "optimizer/planner.h"
+#include "parser/parsetree.h"
+#include "utils/lsyscache.h"
+
+#ifdef USE_ASSERT_CHECKING
+
+/*
+ * When assertions are enabled, we try generating relation identifiers during
+ * planning, saving them in a hash table, and then cross-checking them against
+ * the ones generated after planning is complete.
+ */
+typedef struct pgpa_ri_checker_key
+{
+ char *plan_name;
+ Index rti;
+} pgpa_ri_checker_key;
+
+typedef struct pgpa_ri_checker
+{
+ pgpa_ri_checker_key key;
+ uint32 status;
+ const char *rid_string;
+} pgpa_ri_checker;
+
+static uint32 pgpa_ri_checker_hash_key(pgpa_ri_checker_key key);
+
+static inline bool
+pgpa_ri_checker_compare_key(pgpa_ri_checker_key a, pgpa_ri_checker_key b)
+{
+ if (a.rti != b.rti)
+ return false;
+ if (a.plan_name == NULL)
+ return (b.plan_name == NULL);
+ if (b.plan_name == NULL)
+ return false;
+ return strcmp(a.plan_name, b.plan_name) == 0;
+}
+
+#define SH_PREFIX pgpa_ri_check
+#define SH_ELEMENT_TYPE pgpa_ri_checker
+#define SH_KEY_TYPE pgpa_ri_checker_key
+#define SH_KEY key
+#define SH_HASH_KEY(tb, key) pgpa_ri_checker_hash_key(key)
+#define SH_EQUAL(tb, a, b) pgpa_ri_checker_compare_key(a, b)
+#define SH_SCOPE static inline
+#define SH_DECLARE
+#define SH_DEFINE
+#include "lib/simplehash.h"
+
+#endif
+
+typedef enum pgpa_jo_outcome
+{
+ PGPA_JO_PERMITTED, /* permit this join order */
+ PGPA_JO_DENIED, /* deny this join order */
+ PGPA_JO_INDIFFERENT /* do neither */
+} pgpa_jo_outcome;
+
+typedef struct pgpa_planner_state
+{
+ ExplainState *explain_state;
+ bool generate_advice_feedback;
+ bool generate_advice_string;
+ pgpa_trove *trove;
+ MemoryContext trove_cxt;
+ List *sj_unique_rels;
+
+#ifdef USE_ASSERT_CHECKING
+ pgpa_ri_check_hash *ri_check_hash;
+#endif
+} pgpa_planner_state;
+
+typedef struct pgpa_join_state
+{
+ /* Most-recently-considered outer rel. */
+ RelOptInfo *outerrel;
+
+ /* Most-recently-considered inner rel. */
+ RelOptInfo *innerrel;
+
+ /*
+ * Array of relation identifiers for all members of this joinrel, with
+ * outerrel identifiers before innerrel identifiers.
+ */
+ pgpa_identifier *rids;
+
+ /* Number of outer rel identifiers. */
+ int outer_count;
+
+ /* Number of inner rel identifiers. */
+ int inner_count;
+
+ /*
+ * Trove lookup results.
+ *
+ * join_entries and rel_entries are arrays of entries, and join_indexes
+ * and rel_indexes are the integer offsets within those arrays of entries
+ * potentially relevant to us. The "join" fields correspond to a lookup
+ * using PGPA_TROVE_LOOKUP_JOIN and the "rel" fields to a lookup using
+ * PGPA_TROVE_LOOKUP_REL.
+ */
+ pgpa_trove_entry *join_entries;
+ Bitmapset *join_indexes;
+ pgpa_trove_entry *rel_entries;
+ Bitmapset *rel_indexes;
+} pgpa_join_state;
+
+/* Saved hook values */
+static get_relation_info_hook_type prev_get_relation_info = NULL;
+static join_path_setup_hook_type prev_join_path_setup = NULL;
+static joinrel_setup_hook_type prev_joinrel_setup = NULL;
+static planner_setup_hook_type prev_planner_setup = NULL;
+static planner_shutdown_hook_type prev_planner_shutdown = NULL;
+
+/* Other global variables */
+static int planner_extension_id = -1;
+
+/* Function prototypes. */
+static void pgpa_planner_setup(PlannerGlobal *glob, Query *parse,
+ const char *query_string,
+ int cursorOptions,
+ double *tuple_fraction,
+ ExplainState *es);
+static void pgpa_planner_shutdown(PlannerGlobal *glob, Query *parse,
+ const char *query_string, PlannedStmt *pstmt);
+static void pgpa_get_relation_info(PlannerInfo *root,
+ Oid relationObjectId,
+ bool inhparent,
+ RelOptInfo *rel);
+static void pgpa_joinrel_setup(PlannerInfo *root,
+ RelOptInfo *joinrel,
+ RelOptInfo *outerrel,
+ RelOptInfo *innerrel,
+ SpecialJoinInfo *sjinfo,
+ List *restrictlist);
+static void pgpa_join_path_setup(PlannerInfo *root,
+ RelOptInfo *joinrel,
+ RelOptInfo *outerrel,
+ RelOptInfo *innerrel,
+ JoinType jointype,
+ JoinPathExtraData *extra);
+static pgpa_join_state *pgpa_get_join_state(PlannerInfo *root,
+ RelOptInfo *joinrel,
+ RelOptInfo *outerrel,
+ RelOptInfo *innerrel);
+static void pgpa_planner_apply_joinrel_advice(uint64 *pgs_mask_p,
+ char *plan_name,
+ pgpa_join_state *pjs);
+static void pgpa_planner_apply_join_path_advice(JoinType jointype,
+ uint64 *pgs_mask_p,
+ char *plan_name,
+ pgpa_join_state *pjs);
+static void pgpa_planner_apply_scan_advice(RelOptInfo *rel,
+ pgpa_trove_entry *scan_entries,
+ Bitmapset *scan_indexes,
+ pgpa_trove_entry *rel_entries,
+ Bitmapset *rel_indexes);
+static uint64 pgpa_join_strategy_mask_from_advice_tag(pgpa_advice_tag_type tag);
+static pgpa_jo_outcome pgpa_join_order_permits_join(int outer_count,
+ int inner_count,
+ pgpa_identifier *rids,
+ pgpa_trove_entry *entry);
+static bool pgpa_join_method_permits_join(int outer_count, int inner_count,
+ pgpa_identifier *rids,
+ pgpa_trove_entry *entry,
+ bool *restrict_method);
+static bool pgpa_opaque_join_permits_join(int outer_count, int inner_count,
+ pgpa_identifier *rids,
+ pgpa_trove_entry *entry,
+ bool *restrict_method);
+static bool pgpa_semijoin_permits_join(int outer_count, int inner_count,
+ pgpa_identifier *rids,
+ pgpa_trove_entry *entry,
+ bool outer_side_nullable,
+ bool *restrict_method);
+
+static List *pgpa_planner_append_feedback(List *list, pgpa_trove *trove,
+ pgpa_trove_lookup_type type,
+ pgpa_identifier *rt_identifiers,
+ pgpa_plan_walker_context *walker);
+static void pgpa_planner_feedback_warning(List *feedback);
+
+static inline void pgpa_ri_checker_save(pgpa_planner_state *pps,
+ PlannerInfo *root,
+ RelOptInfo *rel);
+static void pgpa_ri_checker_validate(pgpa_planner_state *pps,
+ PlannedStmt *pstmt);
+
+static char *pgpa_bms_to_cstring(Bitmapset *bms);
+static const char *pgpa_jointype_to_cstring(JoinType jointype);
+
+/*
+ * Install planner-related hooks.
+ */
+void
+pgpa_planner_install_hooks(void)
+{
+ planner_extension_id = GetPlannerExtensionId("pg_plan_advice");
+ prev_planner_setup = planner_setup_hook;
+ planner_setup_hook = pgpa_planner_setup;
+ prev_planner_shutdown = planner_shutdown_hook;
+ planner_shutdown_hook = pgpa_planner_shutdown;
+ prev_get_relation_info = get_relation_info_hook;
+ get_relation_info_hook = pgpa_get_relation_info;
+ prev_joinrel_setup = joinrel_setup_hook;
+ joinrel_setup_hook = pgpa_joinrel_setup;
+ prev_join_path_setup = join_path_setup_hook;
+ join_path_setup_hook = pgpa_join_path_setup;
+}
+
+/*
+ * Carry out whatever setup work we need to do before planning.
+ */
+static void
+pgpa_planner_setup(PlannerGlobal *glob, Query *parse, const char *query_string,
+ int cursorOptions, double *tuple_fraction,
+ ExplainState *es)
+{
+ pgpa_trove *trove = NULL;
+ pgpa_planner_state *pps;
+ char *supplied_advice;
+ bool generate_advice_feedback = false;
+ bool generate_advice_string = false;
+ bool needs_pps = false;
+
+ /*
+ * Decide whether we need to generate an advice string. We must do this if
+ * the user has told us to do it categorically, or if at least one
+ * collector is enabled, or if the user has requested it using the EXPLAIN
+ * (PLAN_ADVICE) option.
+ */
+ generate_advice_string = (pg_plan_advice_always_store_advice_details ||
+ pg_plan_advice_local_collector ||
+ pg_plan_advice_shared_collector ||
+ pg_plan_advice_should_explain(es));
+ if (generate_advice_string)
+ needs_pps = true;
+
+ /*
+ * If any advice was provided, build a trove of advice for use during
+ * planning.
+ */
+ supplied_advice = pg_plan_advice_get_supplied_query_advice(glob, parse,
+ query_string,
+ cursorOptions,
+ es);
+ if (supplied_advice != NULL && supplied_advice[0] != '\0')
+ {
+ List *advice_items;
+ char *error;
+
+ /*
+ * If the supplied advice string comes from pg_plan_advice.advice,
+ * parsing shouldn't fail here, because we must have previously parsed
+ * successfully in pg_plan_advice_advice_check_hook. However, it might
+ * also have come from a hook registered via pg_plan_advice_add_advisor,
+ * and we can't be sure whether that's valid. (Plus, having an error
+ * check here seems like a good idea anyway, just for safety.)
+ */
+ advice_items = pgpa_parse(supplied_advice, &error);
+ if (error)
+ ereport(WARNING,
+ errmsg("could not parse supplied advice: %s", error));
+
+ /*
+ * It's possible that the advice string was non-empty but contained no
+ * actual advice, e.g. it was all whitespace.
+ */
+ if (advice_items != NIL)
+ {
+ trove = pgpa_build_trove(advice_items);
+ needs_pps = true;
+
+ /*
+ * If we know that we're running under EXPLAIN, or if the user has
+ * told us to always do the work, generate advice feedback.
+ */
+ if (es != NULL || pg_plan_advice_feedback_warnings ||
+ pg_plan_advice_always_store_advice_details)
+ generate_advice_feedback = true;
+ }
+ }
+
+#ifdef USE_ASSERT_CHECKING
+
+ /*
+ * If asserts are enabled, always build a private state object for
+ * cross-checks.
+ */
+ needs_pps = true;
+#endif
+
+ /*
+ * We only create and initialize a private state object if it's needed for
+ * some purpose. That could be (1) recording that we will need to generate
+ * an advice string, (2) storing a trove of supplied advice, or (3)
+ * facilitating debugging cross-checks when asserts are enabled.
+ */
+ if (needs_pps)
+ {
+ pps = palloc0_object(pgpa_planner_state);
+ pps->explain_state = es;
+ pps->generate_advice_feedback = generate_advice_feedback;
+ pps->generate_advice_string = generate_advice_string;
+ pps->trove = trove;
+#ifdef USE_ASSERT_CHECKING
+ pps->ri_check_hash =
+ pgpa_ri_check_create(CurrentMemoryContext, 1024, NULL);
+#endif
+ SetPlannerGlobalExtensionState(glob, planner_extension_id, pps);
+ }
+}
+
+/*
+ * Carry out whatever work we want to do after planning is complete.
+ */
+static void
+pgpa_planner_shutdown(PlannerGlobal *glob, Query *parse,
+ const char *query_string, PlannedStmt *pstmt)
+{
+ pgpa_planner_state *pps;
+ pgpa_trove *trove = NULL;
+ pgpa_plan_walker_context walker = {0}; /* placate compiler */
+ bool generate_advice_feedback = false;
+ bool generate_advice_string = false;
+ List *pgpa_items = NIL;
+ pgpa_identifier *rt_identifiers = NULL;
+
+ /* Fetch our private state, set up by pgpa_planner_setup(). */
+ pps = GetPlannerGlobalExtensionState(glob, planner_extension_id);
+ if (pps != NULL)
+ {
+ trove = pps->trove;
+ generate_advice_feedback = pps->generate_advice_feedback;
+ generate_advice_string = pps->generate_advice_string;
+ }
+
+ /*
+ * If we're trying to generate an advice string or if we're trying to
+ * provide advice feedback, then we will need to create range table
+ * identifiers.
+ */
+ if (generate_advice_string || generate_advice_feedback)
+ {
+ pgpa_plan_walker(&walker, pstmt, pps->sj_unique_rels);
+ rt_identifiers = pgpa_create_identifiers_for_planned_stmt(pstmt);
+ }
+
+ /* Generate the advice string, if we need to do so. */
+ if (generate_advice_string)
+ {
+ char *advice_string;
+ StringInfoData buf;
+
+ /* Generate a textual advice string. */
+ initStringInfo(&buf);
+ pgpa_output_advice(&buf, &walker, rt_identifiers);
+ advice_string = buf.data;
+
+ /* If the advice string is empty, don't bother collecting it. */
+ if (advice_string[0] != '\0')
+ pgpa_collect_advice(pstmt->queryId, query_string, advice_string);
+
+ /* Save the advice string in the final plan. */
+ pgpa_items = lappend(pgpa_items,
+ makeDefElem("advice_string",
+ (Node *) makeString(advice_string),
+ -1));
+ }
+
+ /*
+ * If we're trying to provide advice feedback, then we will need to
+ * analyze how successful the advice was.
+ */
+ if (generate_advice_feedback)
+ {
+ List *feedback = NIL;
+
+ /*
+ * Inject a Node-tree representation of all the trove-entry flags into
+ * the PlannedStmt.
+ */
+ feedback = pgpa_planner_append_feedback(feedback,
+ trove,
+ PGPA_TROVE_LOOKUP_SCAN,
+ rt_identifiers, &walker);
+ feedback = pgpa_planner_append_feedback(feedback,
+ trove,
+ PGPA_TROVE_LOOKUP_JOIN,
+ rt_identifiers, &walker);
+ feedback = pgpa_planner_append_feedback(feedback,
+ trove,
+ PGPA_TROVE_LOOKUP_REL,
+ rt_identifiers, &walker);
+
+ pgpa_items = lappend(pgpa_items, makeDefElem("feedback",
+ (Node *) feedback, -1));
+
+ /* If we were asked to generate feedback warnings, do so. */
+ if (pg_plan_advice_feedback_warnings)
+ pgpa_planner_feedback_warning(feedback);
+ }
+
+ /* Push whatever data we're saving into the PlannedStmt. */
+ if (pgpa_items != NIL)
+ pstmt->extension_state =
+ lappend(pstmt->extension_state,
+ makeDefElem("pg_plan_advice", (Node *) pgpa_items, -1));
+
+ /*
+ * If assertions are enabled, cross-check the generated range table
+ * identifiers.
+ */
+ if (pps != NULL)
+ pgpa_ri_checker_validate(pps, pstmt);
+}
+
+/*
+ * Hook function for get_relation_info().
+ *
+ * We can apply scan advice at this point, and we also use this as an
+ * opportunity to do range-table identifier cross-checking in assert-enabled
+ * builds.
+ */
+static void
+pgpa_get_relation_info(PlannerInfo *root, Oid relationObjectId,
+ bool inhparent, RelOptInfo *rel)
+{
+ pgpa_planner_state *pps;
+
+ /* Fetch our private state, set up by pgpa_planner_setup(). */
+ pps = GetPlannerGlobalExtensionState(root->glob, planner_extension_id);
+
+ /* Save details needed for range table identifier cross-checking. */
+ if (pps != NULL)
+ pgpa_ri_checker_save(pps, root, rel);
+
+ /* If query advice was provided, search for relevant entries. */
+ if (pps != NULL && pps->trove != NULL)
+ {
+ pgpa_identifier rid;
+ pgpa_trove_result tresult_scan;
+ pgpa_trove_result tresult_rel;
+
+ /* Search for scan advice and general rel advice. */
+ pgpa_compute_identifier_by_rti(root, rel->relid, &rid);
+ pgpa_trove_lookup(pps->trove, PGPA_TROVE_LOOKUP_SCAN, 1, &rid,
+ &tresult_scan);
+ pgpa_trove_lookup(pps->trove, PGPA_TROVE_LOOKUP_REL, 1, &rid,
+ &tresult_rel);
+
+ /* If relevant entries were found, apply them. */
+ if (tresult_scan.indexes != NULL || tresult_rel.indexes != NULL)
+ {
+ uint64 original_mask = rel->pgs_mask;
+
+ pgpa_planner_apply_scan_advice(rel,
+ tresult_scan.entries,
+ tresult_scan.indexes,
+ tresult_rel.entries,
+ tresult_rel.indexes);
+
+ /* Emit debugging message, if enabled. */
+ if (pg_plan_advice_trace_mask && original_mask != rel->pgs_mask)
+ ereport(WARNING,
+ (errmsg("strategy mask for RTI %u changed from 0x%" PRIx64 " to 0x%" PRIx64,
+ rel->relid, original_mask, rel->pgs_mask)));
+ }
+ }
+
+ /* Pass call to previous hook. */
+ if (prev_get_relation_info)
+ (*prev_get_relation_info) (root, relationObjectId, inhparent, rel);
+}
+
+/*
+ * Enforce any provided advice that is relevant to any method of implementing
+ * this join.
+ *
+ * Although we're passed the outerrel and innerrel here, those are just
+ * whatever values happened to prompt the creation of this joinrel; they
+ * shouldn't really influence our choice of what advice to apply.
+ */
+static void
+pgpa_joinrel_setup(PlannerInfo *root, RelOptInfo *joinrel,
+ RelOptInfo *outerrel, RelOptInfo *innerrel,
+ SpecialJoinInfo *sjinfo, List *restrictlist)
+{
+ pgpa_join_state *pjs;
+
+ Assert(bms_membership(joinrel->relids) == BMS_MULTIPLE);
+
+ /* Get our private state information for this join. */
+ pjs = pgpa_get_join_state(root, joinrel, outerrel, innerrel);
+
+ /* If there is relevant advice, call a helper function to apply it. */
+ if (pjs != NULL)
+ {
+ uint64 original_mask = joinrel->pgs_mask;
+
+ pgpa_planner_apply_joinrel_advice(&joinrel->pgs_mask,
+ root->plan_name,
+ pjs);
+
+ /* Emit debugging message, if enabled. */
+ if (pg_plan_advice_trace_mask && original_mask != joinrel->pgs_mask)
+ ereport(WARNING,
+ (errmsg("strategy mask for join on RTIs %s changed from 0x%" PRIx64 " to 0x%" PRIx64,
+ pgpa_bms_to_cstring(joinrel->relids),
+ original_mask,
+ joinrel->pgs_mask)));
+ }
+
+ /* Pass call to previous hook. */
+ if (prev_joinrel_setup)
+ (*prev_joinrel_setup) (root, joinrel, outerrel, innerrel,
+ sjinfo, restrictlist);
+}
+
+/*
+ * Enforce any provided advice that is relevant to this particular method of
+ * implementing this particular join.
+ */
+static void
+pgpa_join_path_setup(PlannerInfo *root, RelOptInfo *joinrel,
+ RelOptInfo *outerrel, RelOptInfo *innerrel,
+ JoinType jointype, JoinPathExtraData *extra)
+{
+ pgpa_join_state *pjs;
+
+ Assert(bms_membership(joinrel->relids) == BMS_MULTIPLE);
+
+ /*
+ * If we're considering implementing a semijoin by making one side unique,
+ * make a note of it in the pgpa_planner_state. See comments for
+ * pgpa_sj_unique_rel for why we do this.
+ */
+ if (jointype == JOIN_UNIQUE_OUTER || jointype == JOIN_UNIQUE_INNER)
+ {
+ pgpa_planner_state *pps;
+ RelOptInfo *uniquerel;
+
+ uniquerel = jointype == JOIN_UNIQUE_OUTER ? outerrel : innerrel;
+ pps = GetPlannerGlobalExtensionState(root->glob, planner_extension_id);
+ if (pps != NULL &&
+ (pps->generate_advice_string || pps->generate_advice_feedback))
+ {
+ bool found = false;
+
+ /* Avoid adding duplicates. */
+ foreach_ptr(pgpa_sj_unique_rel, ur, pps->sj_unique_rels)
+ {
+ /*
+ * We should always use the same pointer for the same plan
+ * name, so we need not use strcmp() here.
+ */
+ if (root->plan_name == ur->plan_name &&
+ bms_equal(uniquerel->relids, ur->relids))
+ {
+ found = true;
+ break;
+ }
+ }
+
+ /* If not a duplicate, append to the list. */
+ if (!found)
+ {
+ pgpa_sj_unique_rel *ur = palloc_object(pgpa_sj_unique_rel);
+
+ ur->plan_name = root->plan_name;
+ ur->relids = uniquerel->relids;
+ pps->sj_unique_rels = lappend(pps->sj_unique_rels, ur);
+ }
+ }
+ }
+
+ /* Get our private state information for this join. */
+ pjs = pgpa_get_join_state(root, joinrel, outerrel, innerrel);
+
+ /* If there is relevant advice, call a helper function to apply it. */
+ if (pjs != NULL)
+ {
+ uint64 original_mask = extra->pgs_mask;
+
+ pgpa_planner_apply_join_path_advice(jointype,
+ &extra->pgs_mask,
+ root->plan_name,
+ pjs);
+
+ /* Emit debugging message, if enabled. */
+ if (pg_plan_advice_trace_mask && original_mask != extra->pgs_mask)
+ ereport(WARNING,
+ (errmsg("strategy mask for %s join on %s with outer %s and inner %s changed from 0x%" PRIx64 " to 0x%" PRIx64,
+ pgpa_jointype_to_cstring(jointype),
+ pgpa_bms_to_cstring(joinrel->relids),
+ pgpa_bms_to_cstring(outerrel->relids),
+ pgpa_bms_to_cstring(innerrel->relids),
+ original_mask,
+ extra->pgs_mask)));
+ }
+
+ /* Pass call to previous hook. */
+ if (prev_join_path_setup)
+ (*prev_join_path_setup) (root, joinrel, outerrel, innerrel,
+ jointype, extra);
+}
+
+/*
+ * Search for advice pertaining to a proposed join.
+ */
+static pgpa_join_state *
+pgpa_get_join_state(PlannerInfo *root, RelOptInfo *joinrel,
+ RelOptInfo *outerrel, RelOptInfo *innerrel)
+{
+ pgpa_planner_state *pps;
+ pgpa_join_state *pjs;
+ bool new_pjs = false;
+
+ /* Fetch our private state, set up by pgpa_planner_setup(). */
+ pps = GetPlannerGlobalExtensionState(root->glob, planner_extension_id);
+ if (pps == NULL || pps->trove == NULL)
+ {
+ /* No advice applies to this query, hence none to this joinrel. */
+ return NULL;
+ }
+
+ /*
+ * See whether we've previously associated a pgpa_join_state with this
+ * joinrel. If we have not, we need to try to construct one. If we have,
+ * then there are two cases: (a) if innerrel and outerrel are unchanged,
+ * we can simply use it, and (b) if they have changed, we need to rejigger
+ * the array of identifiers but can still skip the trove lookup.
+ */
+ pjs = GetRelOptInfoExtensionState(joinrel, planner_extension_id);
+ if (pjs != NULL)
+ {
+ if (pjs->join_indexes == NULL && pjs->rel_indexes == NULL)
+ {
+ /*
+ * If there's no potentially relevant advice, then the presence of
+ * this pgpa_join_state acts like a negative cache entry: it tells
+ * us not to bother searching the trove for advice, because we
+ * will not find any.
+ */
+ return NULL;
+ }
+
+ if (pjs->outerrel == outerrel && pjs->innerrel == innerrel)
+ {
+ /* No updates required, so just return. */
+ /* XXX. Does this need to do something different under GEQO? */
+ return pjs;
+ }
+ }
+
+ /*
+ * If there's no pgpa_join_state yet, we need to allocate one. Trove keys
+ * will not get built for RTE_JOIN RTEs, so the array may end up being
+ * larger than needed. It's not worth trying to compute a perfectly
+ * accurate count here.
+ */
+ if (pjs == NULL)
+ {
+ int pessimistic_count = bms_num_members(joinrel->relids);
+
+ pjs = palloc0_object(pgpa_join_state);
+ pjs->rids = palloc_array(pgpa_identifier, pessimistic_count);
+ new_pjs = true;
+ }
+
+ /*
+ * Either we just allocated a new pgpa_join_state, or the existing one
+ * needs reconfiguring for a new innerrel and outerrel. The required array
+ * size can't change, so we can overwrite the existing one.
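+	 *
+	 * The identifiers for the outer rel are stored first in the rids
+	 * array, followed by those for the inner rel; code elsewhere accesses
+	 * the inner ones as rids + outer_count.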
+ */
+ pjs->outerrel = outerrel;
+ pjs->innerrel = innerrel;
+ pjs->outer_count =
+ pgpa_compute_identifiers_by_relids(root, outerrel->relids, pjs->rids);
+ pjs->inner_count =
+ pgpa_compute_identifiers_by_relids(root, innerrel->relids,
+ pjs->rids + pjs->outer_count);
+
+ /*
+ * If we allocated a new pgpa_join_state, search our trove of advice for
+ * relevant entries. The trove lookup will return the same results for
+ * every outerrel/innerrel combination, so we don't need to repeat that
+ * work every time.
+ */
+ if (new_pjs)
+ {
+ pgpa_trove_result tresult;
+
+ /* Find join entries. */
+ pgpa_trove_lookup(pps->trove, PGPA_TROVE_LOOKUP_JOIN,
+ pjs->outer_count + pjs->inner_count,
+ pjs->rids, &tresult);
+ pjs->join_entries = tresult.entries;
+ pjs->join_indexes = tresult.indexes;
+
+ /* Find rel entries. */
+ pgpa_trove_lookup(pps->trove, PGPA_TROVE_LOOKUP_REL,
+ pjs->outer_count + pjs->inner_count,
+ pjs->rids, &tresult);
+ pjs->rel_entries = tresult.entries;
+ pjs->rel_indexes = tresult.indexes;
+
+ /* Now that the new pgpa_join_state is fully valid, save a pointer. */
+ SetRelOptInfoExtensionState(joinrel, planner_extension_id, pjs);
+
+ /*
+ * If there was no relevant advice found, just return NULL. This
+ * pgpa_join_state will stick around as a sort of negative cache
+ * entry, so that future calls for this same joinrel quickly return
+ * NULL.
+ */
+ if (pjs->join_indexes == NULL && pjs->rel_indexes == NULL)
+ return NULL;
+ }
+
+ return pjs;
+}
+
+/*
+ * Enforce overall restrictions on a join relation that apply uniformly
+ * regardless of the choice of inner and outer rel.
+ */
+static void
+pgpa_planner_apply_joinrel_advice(uint64 *pgs_mask_p, char *plan_name,
+ pgpa_join_state *pjs)
+{
+ int i = -1;
+ int flags;
+ bool gather_conflict = false;
+ uint64 gather_mask = 0;
+ Bitmapset *gather_partial_match = NULL;
+ Bitmapset *gather_full_match = NULL;
+ bool partitionwise_conflict = false;
+ int partitionwise_outcome = 0;
+ Bitmapset *partitionwise_partial_match = NULL;
+ Bitmapset *partitionwise_full_match = NULL;
+
+ /* Iterate over all possibly-relevant advice. */
+ while ((i = bms_next_member(pjs->rel_indexes, i)) >= 0)
+ {
+ pgpa_trove_entry *entry = &pjs->rel_entries[i];
+ pgpa_itm_type itm;
+ bool full_match = false;
+ uint64 my_gather_mask = 0;
+ int my_partitionwise_outcome = 0; /* >0 yes, <0 no */
+
+ /*
+ * For GATHER and GATHER_MERGE, if the specified relations exactly
+ * match this joinrel, do whatever the advice says; otherwise, don't
+ * allow Gather or Gather Merge at this level. For NO_GATHER, there
+ * must be a single target relation which must be included in this
+ * joinrel, so just don't allow Gather or Gather Merge here, full
+ * stop.
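+			 *
+			 * For example, GATHER advice targeting exactly (t1 t2) gives
+			 * the (t1 t2) joinrel my_gather_mask = PGS_GATHER, while a
+			 * (t1 t2 t3) joinrel that merely contains those targets gets
+			 * my_gather_mask = PGS_CONSIDER_NONPARTIAL instead.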
+ */
+ if (entry->tag == PGPA_TAG_NO_GATHER)
+ {
+ my_gather_mask = PGS_CONSIDER_NONPARTIAL;
+ full_match = true;
+ }
+ else
+ {
+ int total_count;
+
+ total_count = pjs->outer_count + pjs->inner_count;
+ itm = pgpa_identifiers_match_target(total_count, pjs->rids,
+ entry->target);
+ Assert(itm != PGPA_ITM_DISJOINT);
+
+ if (itm == PGPA_ITM_EQUAL)
+ {
+ full_match = true;
+ if (entry->tag == PGPA_TAG_PARTITIONWISE)
+ my_partitionwise_outcome = 1;
+ else if (entry->tag == PGPA_TAG_GATHER)
+ my_gather_mask = PGS_GATHER;
+ else if (entry->tag == PGPA_TAG_GATHER_MERGE)
+ my_gather_mask = PGS_GATHER_MERGE;
+ else
+ elog(ERROR, "unexpected advice tag: %d",
+ (int) entry->tag);
+ }
+ else
+ {
+ if (entry->tag == PGPA_TAG_PARTITIONWISE)
+ {
+ my_partitionwise_outcome = -1;
+ my_gather_mask = PGS_CONSIDER_NONPARTIAL;
+ }
+ else if (entry->tag == PGPA_TAG_GATHER ||
+ entry->tag == PGPA_TAG_GATHER_MERGE)
+ {
+ my_partitionwise_outcome = -1;
+ my_gather_mask = PGS_CONSIDER_NONPARTIAL;
+ }
+ else
+ elog(ERROR, "unexpected advice tag: %d",
+ (int) entry->tag);
+ }
+ }
+
+ /*
+ * If we set my_gather_mask up above, then we (1) make a note if the
+ * advice conflicted, (2) remember the mask value, and (3) remember
+ * whether this was a full or partial match.
+ */
+ if (my_gather_mask != 0)
+ {
+ if (gather_mask != 0 && gather_mask != my_gather_mask)
+ gather_conflict = true;
+ gather_mask = my_gather_mask;
+ if (full_match)
+ gather_full_match = bms_add_member(gather_full_match, i);
+ else
+ gather_partial_match = bms_add_member(gather_partial_match, i);
+ }
+
+ /*
+ * Likewise, if we set my_partitionwise_outcome up above, then we (1)
+ * make a note if the advice conflicted, (2) remember what the desired
+ * outcome was, and (3) remember whether this was a full or partial
+ * match.
+ */
+ if (my_partitionwise_outcome != 0)
+ {
+ if (partitionwise_outcome != 0 &&
+ partitionwise_outcome != my_partitionwise_outcome)
+ partitionwise_conflict = true;
+ partitionwise_outcome = my_partitionwise_outcome;
+ if (full_match)
+ partitionwise_full_match =
+ bms_add_member(partitionwise_full_match, i);
+ else
+ partitionwise_partial_match =
+ bms_add_member(partitionwise_partial_match, i);
+ }
+ }
+
+ /*
+ * Mark every Gather-related piece of advice as partially matched, and if
+ * the set of targets exactly matched this relation, fully matched. If
+ * there was a conflict, mark them all as conflicting.
+ */
+ flags = PGPA_TE_MATCH_PARTIAL;
+ if (gather_conflict)
+ flags |= PGPA_TE_CONFLICTING;
+ pgpa_trove_set_flags(pjs->rel_entries, gather_partial_match, flags);
+ flags |= PGPA_TE_MATCH_FULL;
+ pgpa_trove_set_flags(pjs->rel_entries, gather_full_match, flags);
+
+ /* Likewise for partitionwise advice. */
+ flags = PGPA_TE_MATCH_PARTIAL;
+ if (partitionwise_conflict)
+ flags |= PGPA_TE_CONFLICTING;
+ pgpa_trove_set_flags(pjs->rel_entries, partitionwise_partial_match, flags);
+ flags |= PGPA_TE_MATCH_FULL;
+ pgpa_trove_set_flags(pjs->rel_entries, partitionwise_full_match, flags);
+
+ /*
+ * Enforce restrictions on the Gather/Gather Merge. Only clear bits here,
+	 * so that we still respect the enable_* GUCs. Do nothing if the advice
+	 * conflicts.
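+	 *
+	 * For example, if the advice selected gather_mask = PGS_GATHER, we
+	 * clear PGS_GATHER_MERGE and PGS_CONSIDER_NONPARTIAL here, leaving a
+	 * Gather-based plan as the only non-disabled way to finish this rel.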
+ */
+ if (gather_mask != 0 && !gather_conflict)
+ {
+ uint64 all_gather_mask;
+
+ all_gather_mask =
+ PGS_GATHER | PGS_GATHER_MERGE | PGS_CONSIDER_NONPARTIAL;
+ *pgs_mask_p &= ~(all_gather_mask & ~gather_mask);
+ }
+
+ /*
+ * As above, but for partitionwise advice.
+ *
+ * To induce a partitionwise join, we disable all the ordinary means of
+ * performing a join, so that an Append or MergeAppend path will hopefully
+ * be chosen.
+ *
+ * To prevent one, we just disable Append and MergeAppend. Note that we
+ * must not unset PGS_CONSIDER_PARTITIONWISE even when we don't want a
+ * partitionwise join here, because we might want one at a higher level
+	 * that is constructed using paths from this level.
+ */
+ if (partitionwise_outcome != 0 && !partitionwise_conflict)
+ {
+ if (partitionwise_outcome > 0)
+ *pgs_mask_p = (*pgs_mask_p & ~PGS_JOIN_ANY);
+ else
+ *pgs_mask_p &= ~(PGS_APPEND | PGS_MERGE_APPEND);
+ }
+}
+
+/*
+ * Enforce restrictions on the join order or join method.
+ */
+static void
+pgpa_planner_apply_join_path_advice(JoinType jointype, uint64 *pgs_mask_p,
+ char *plan_name,
+ pgpa_join_state *pjs)
+{
+ int i = -1;
+ Bitmapset *jo_permit_indexes = NULL;
+ Bitmapset *jo_deny_indexes = NULL;
+ Bitmapset *jo_deny_rel_indexes = NULL;
+ Bitmapset *jm_indexes = NULL;
+ bool jm_conflict = false;
+ uint32 join_mask = 0;
+ Bitmapset *sj_permit_indexes = NULL;
+ Bitmapset *sj_deny_indexes = NULL;
+
+ /*
+ * Reconsider PARTITIONWISE(...) advice.
+ *
+ * We already thought about this for the joinrel as a whole, but in some
+ * cases, partitionwise advice can also constrain the join order. For
+ * instance, if the advice says PARTITIONWISE((t1 t2)), we shouldn't build
+	 * join paths for any joinrel that includes t1 or t2 unless it also
+	 * includes the other. In general, the partitionwise operation must have
+ * already been completed within one side of the current join or the
+ * other, else the join order is impermissible.
+ *
+	 * NB: It might seem tempting to try to deal with PARTITIONWISE advice
+ * entirely in this function, but that doesn't work. Here, we can only
+ * affect the pgs_mask within a particular JoinPathExtraData, that is, for
+ * a particular choice of innerrel and outerrel. Partitionwise paths are
+ * not built that way, so we must set pgs_mask for the RelOptInfo, which
+ * is best done in pgpa_planner_apply_joinrel_advice.
+ */
+ while ((i = bms_next_member(pjs->rel_indexes, i)) >= 0)
+ {
+ pgpa_trove_entry *entry = &pjs->rel_entries[i];
+ pgpa_itm_type inner_itm;
+ pgpa_itm_type outer_itm;
+
+ if (entry->tag != PGPA_TAG_PARTITIONWISE)
+ continue;
+
+ outer_itm = pgpa_identifiers_match_target(pjs->outer_count,
+ pjs->rids, entry->target);
+ if (outer_itm == PGPA_ITM_EQUAL ||
+ outer_itm == PGPA_ITM_TARGETS_ARE_SUBSET)
+ continue;
+
+ inner_itm = pgpa_identifiers_match_target(pjs->inner_count,
+ pjs->rids + pjs->outer_count,
+ entry->target);
+ if (inner_itm == PGPA_ITM_EQUAL ||
+ inner_itm == PGPA_ITM_TARGETS_ARE_SUBSET)
+ continue;
+
+ jo_deny_rel_indexes = bms_add_member(jo_deny_rel_indexes, i);
+ }
+
+ /* Iterate over advice that pertains to the join order and method. */
+ i = -1;
+ while ((i = bms_next_member(pjs->join_indexes, i)) >= 0)
+ {
+ pgpa_trove_entry *entry = &pjs->join_entries[i];
+ uint32 my_join_mask;
+
+ /* Handle join order advice. */
+ if (entry->tag == PGPA_TAG_JOIN_ORDER)
+ {
+ pgpa_jo_outcome jo_outcome;
+
+ jo_outcome = pgpa_join_order_permits_join(pjs->outer_count,
+ pjs->inner_count,
+ pjs->rids,
+ entry);
+ if (jo_outcome == PGPA_JO_PERMITTED)
+ jo_permit_indexes = bms_add_member(jo_permit_indexes, i);
+ else if (jo_outcome == PGPA_JO_DENIED)
+ jo_deny_indexes = bms_add_member(jo_deny_indexes, i);
+ continue;
+ }
+
+ /* Handle join method advice. */
+ my_join_mask = pgpa_join_strategy_mask_from_advice_tag(entry->tag);
+ if (my_join_mask != 0)
+ {
+ bool permit;
+ bool restrict_method;
+
+ if (entry->tag == PGPA_TAG_FOREIGN_JOIN)
+ permit = pgpa_opaque_join_permits_join(pjs->outer_count,
+ pjs->inner_count,
+ pjs->rids,
+ entry,
+ &restrict_method);
+ else
+ permit = pgpa_join_method_permits_join(pjs->outer_count,
+ pjs->inner_count,
+ pjs->rids,
+ entry,
+ &restrict_method);
+ if (!permit)
+ jo_deny_indexes = bms_add_member(jo_deny_indexes, i);
+ else if (restrict_method)
+ {
+ jm_indexes = bms_add_member(jm_indexes, i);
+ if (join_mask != 0 && join_mask != my_join_mask)
+ jm_conflict = true;
+ join_mask = my_join_mask;
+ }
+ continue;
+ }
+
+ /* Handle semijoin uniqueness advice. */
+ if (entry->tag == PGPA_TAG_SEMIJOIN_UNIQUE ||
+ entry->tag == PGPA_TAG_SEMIJOIN_NON_UNIQUE)
+ {
+ bool outer_side_nullable;
+ bool restrict_method;
+
+ /* Planner has nullable side of the semijoin on the outer side? */
+ outer_side_nullable = (jointype == JOIN_UNIQUE_OUTER ||
+ jointype == JOIN_RIGHT_SEMI);
+
+ if (!pgpa_semijoin_permits_join(pjs->outer_count,
+ pjs->inner_count,
+ pjs->rids,
+ entry,
+ outer_side_nullable,
+ &restrict_method))
+ jo_deny_indexes = bms_add_member(jo_deny_indexes, i);
+ else if (restrict_method)
+ {
+ bool advice_unique;
+ bool jt_unique;
+ bool jt_non_unique;
+
+ /* Advice wants to unique-ify and use a regular join? */
+ advice_unique = (entry->tag == PGPA_TAG_SEMIJOIN_UNIQUE);
+
+ /* Planner is trying to unique-ify and use a regular join? */
+ jt_unique = (jointype == JOIN_UNIQUE_INNER ||
+ jointype == JOIN_UNIQUE_OUTER);
+
+ /* Planner is trying a semi-join, without unique-ifying? */
+ jt_non_unique = (jointype == JOIN_SEMI ||
+ jointype == JOIN_RIGHT_SEMI);
+
+ if (!jt_unique && !jt_non_unique)
+ {
+ /*
+ * This doesn't seem to be a semijoin to which SJ_UNIQUE
+ * or SJ_NON_UNIQUE can be applied.
+ */
+ entry->flags |= PGPA_TE_INAPPLICABLE;
+ }
+ else if (advice_unique != jt_unique)
+ sj_deny_indexes = bms_add_member(sj_deny_indexes, i);
+ else
+ sj_permit_indexes = bms_add_member(sj_permit_indexes, i);
+ }
+ continue;
+ }
+ }
+
+ /*
+ * If the advice indicates both that this join order is permissible and
+ * also that it isn't, then mark advice related to the join order as
+ * conflicting.
+ */
+ if (jo_permit_indexes != NULL &&
+ (jo_deny_indexes != NULL || jo_deny_rel_indexes != NULL))
+ {
+ pgpa_trove_set_flags(pjs->join_entries, jo_permit_indexes,
+ PGPA_TE_CONFLICTING);
+ pgpa_trove_set_flags(pjs->join_entries, jo_deny_indexes,
+ PGPA_TE_CONFLICTING);
+ pgpa_trove_set_flags(pjs->rel_entries, jo_deny_rel_indexes,
+ PGPA_TE_CONFLICTING);
+ }
+
+ /*
+ * If more than one join method specification is relevant here and they
+ * differ, mark them all as conflicting.
+ */
+ if (jm_conflict)
+ pgpa_trove_set_flags(pjs->join_entries, jm_indexes,
+ PGPA_TE_CONFLICTING);
+
+ /* If semijoin advice says both yes and no, mark it all as conflicting. */
+ if (sj_permit_indexes != NULL && sj_deny_indexes != NULL)
+ {
+ pgpa_trove_set_flags(pjs->join_entries, sj_permit_indexes,
+ PGPA_TE_CONFLICTING);
+ pgpa_trove_set_flags(pjs->join_entries, sj_deny_indexes,
+ PGPA_TE_CONFLICTING);
+ }
+
+ /*
+ * Enforce restrictions on the join order and join method, and any
+ * semijoin-related restrictions. Only clear bits here, so that we still
+ * respect the enable_* GUCs. Do nothing in cases where the advice on a
+ * single topic conflicts.
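+	 *
+	 * For example, if a HASH_JOIN specification fully matched this join,
+	 * join_mask is PGS_HASHJOIN, so we clear every other PGS_JOIN_ANY bit
+	 * and only hash join paths survive for this inner/outer combination.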
+ */
+ if ((jo_deny_indexes != NULL || jo_deny_rel_indexes != NULL) &&
+ jo_permit_indexes == NULL)
+ *pgs_mask_p &= ~PGS_JOIN_ANY;
+ if (join_mask != 0 && !jm_conflict)
+ *pgs_mask_p &= ~(PGS_JOIN_ANY & ~join_mask);
+ if (sj_deny_indexes != NULL && sj_permit_indexes == NULL)
+ *pgs_mask_p &= ~PGS_JOIN_ANY;
+}
+
+/*
+ * Translate an advice tag into a path generation strategy mask.
+ *
+ * This function can be called with tag types that don't represent join
+ * strategies. In such cases, we just return 0, which can't be confused with
+ * a valid mask.
+ */
+static uint64
+pgpa_join_strategy_mask_from_advice_tag(pgpa_advice_tag_type tag)
+{
+ switch (tag)
+ {
+ case PGPA_TAG_FOREIGN_JOIN:
+ return PGS_FOREIGNJOIN;
+ case PGPA_TAG_MERGE_JOIN_PLAIN:
+ return PGS_MERGEJOIN_PLAIN;
+ case PGPA_TAG_MERGE_JOIN_MATERIALIZE:
+ return PGS_MERGEJOIN_MATERIALIZE;
+ case PGPA_TAG_NESTED_LOOP_PLAIN:
+ return PGS_NESTLOOP_PLAIN;
+ case PGPA_TAG_NESTED_LOOP_MATERIALIZE:
+ return PGS_NESTLOOP_MATERIALIZE;
+ case PGPA_TAG_NESTED_LOOP_MEMOIZE:
+ return PGS_NESTLOOP_MEMOIZE;
+ case PGPA_TAG_HASH_JOIN:
+ return PGS_HASHJOIN;
+ default:
+ return 0;
+ }
+}
+
+/*
+ * Does a certain item of join order advice permit a certain join?
+ *
+ * Returns PGPA_JO_DENIED if the advice is incompatible with the proposed
+ * join order.
+ *
+ * Returns PGPA_JO_PERMITTED if the advice specifies exactly the proposed
+ * join order. This implies that a partitionwise join should not be
+ * performed at this level; rather, one of the traditional join methods
+ * should be used.
+ *
+ * Returns PGPA_JO_INDIFFERENT if the advice does not care what happens.
+ * We use this for unordered JOIN_ORDER sublists, which are compatible with
+ * partitionwise join but do not mandate it.
+ */
+static pgpa_jo_outcome
+pgpa_join_order_permits_join(int outer_count, int inner_count,
+ pgpa_identifier *rids,
+ pgpa_trove_entry *entry)
+{
+ bool loop = true;
+ bool sublist = false;
+ int length;
+ int outer_length;
+ pgpa_advice_target *target = entry->target;
+ pgpa_advice_target *prefix_target;
+
+ /* We definitely have at least a partial match for this trove entry. */
+ entry->flags |= PGPA_TE_MATCH_PARTIAL;
+
+ /*
+ * Find the innermost sublist that contains all keys; if no sublist does,
+ * then continue processing with the toplevel list.
+ *
+ * For example, if the advice says JOIN_ORDER(t1 t2 (t3 t4 t5)), then we
+ * should evaluate joins that only involve t3, t4, and/or t5 against the
+ * (t3 t4 t5) sublist, and others against the full list.
+ *
+	 * Note that (1) the outermost list is always ordered and (2) whenever we
+ * zoom into an unordered sublist, we instantly return
+ * PGPA_JO_INDIFFERENT.
+ */
+ while (loop)
+ {
+ Assert(target->ttype == PGPA_TARGET_ORDERED_LIST);
+
+ loop = false;
+ foreach_ptr(pgpa_advice_target, child_target, target->children)
+ {
+ pgpa_itm_type itm;
+
+ if (child_target->ttype == PGPA_TARGET_IDENTIFIER)
+ continue;
+
+ itm = pgpa_identifiers_match_target(outer_count + inner_count,
+ rids, child_target);
+ if (itm == PGPA_ITM_EQUAL || itm == PGPA_ITM_KEYS_ARE_SUBSET)
+ {
+ if (child_target->ttype == PGPA_TARGET_ORDERED_LIST)
+ {
+ target = child_target;
+ sublist = true;
+ loop = true;
+ break;
+ }
+ else
+ {
+ Assert(child_target->ttype == PGPA_TARGET_UNORDERED_LIST);
+ return PGPA_JO_INDIFFERENT;
+ }
+ }
+ }
+ }
+
+ /*
+ * Try to find a prefix of the selected join order list that is exactly
+ * equal to the outer side of the proposed join.
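+	 *
+	 * For example, given JOIN_ORDER(t1 t2 t3), an outer side of exactly
+	 * {t1} or {t1 t2} matches a prefix, whereas an outer side of {t2 t3}
+	 * matches no prefix and the join order is denied below.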
+ */
+ length = list_length(target->children);
+ prefix_target = palloc0_object(pgpa_advice_target);
+ prefix_target->ttype = PGPA_TARGET_ORDERED_LIST;
+ for (outer_length = 1; outer_length <= length; ++outer_length)
+ {
+ pgpa_itm_type itm;
+
+ /* Avoid leaking memory in every loop iteration. */
+ if (prefix_target->children != NULL)
+ list_free(prefix_target->children);
+ prefix_target->children = list_copy_head(target->children,
+ outer_length);
+
+ /* Search, hoping to find an exact match. */
+ itm = pgpa_identifiers_match_target(outer_count, rids, prefix_target);
+ if (itm == PGPA_ITM_EQUAL)
+ break;
+
+ /*
+ * If the prefix of the join order list that we're considering
+ * includes some but not all of the outer rels, we can make the prefix
+	 * longer to find an exact match. But if the advice mentions things
+	 * that are not part of our outer rel before it has mentioned
+	 * everything that is, then no longer prefix can match either, and
+	 * this join doesn't match the join order list.
+ */
+ if (itm != PGPA_ITM_TARGETS_ARE_SUBSET)
+ return PGPA_JO_DENIED;
+ }
+
+ /*
+	 * If the previous loop stopped before the prefix_target included the
+ * entire join order list, then the next member of the join order list
+ * must exactly match the inner side of the join.
+ *
+ * Example: Given JOIN_ORDER(t1 t2 (t3 t4 t5)), if the outer side of the
+ * current join includes only t1, then the inner side must be exactly t2;
+ * if the outer side includes both t1 and t2, then the inner side must
+ * include exactly t3, t4, and t5.
+ */
+ if (outer_length < length)
+ {
+ pgpa_advice_target *inner_target;
+ pgpa_itm_type itm;
+
+ inner_target = list_nth(target->children, outer_length);
+
+ itm = pgpa_identifiers_match_target(inner_count, rids + outer_count,
+ inner_target);
+
+ /*
+ * Before returning, consider whether we need to mark this entry as
+ * fully matched. If we're considering the full list rather than a
+ * sublist, and if we found every item but one on the outer side of
+ * the join and the last item on the inner side of the join, then the
+ * answer is yes.
+ */
+ if (!sublist && outer_length + 1 == length && itm == PGPA_ITM_EQUAL)
+ entry->flags |= PGPA_TE_MATCH_FULL;
+
+ return (itm == PGPA_ITM_EQUAL) ? PGPA_JO_PERMITTED : PGPA_JO_DENIED;
+ }
+
+ /*
+ * If we get here, then the outer side of the join includes the entirety
+ * of the join order list. In this case, we behave differently depending
+ * on whether we're looking at the top-level join order list or sublist.
+ * At the top-level, we treat the specified list as mandating that the
+ * actual join order has the given list as a prefix, but a sublist
+ * requires an exact match.
+ *
+	 * Example: Given JOIN_ORDER(t1 t2 (t3 t4 t5)), we must start by joining
+	 * all five of those relations, in that sequence, but once that is
+	 * done, it's OK to join any other rels that are part of the join problem.
+	 * This allows a user to specify the driving table and perhaps the first
+	 * few things to which it should be joined while leaving the rest of the
+	 * join order up to the optimizer. But it seems like it would be surprising,
+ * given that specification, if the user could add t6 to the (t3 t4 t5)
+ * sub-join, so we don't allow that. If we did want to allow it, the logic
+ * earlier in this function would require substantial adjustment: we could
+ * allow the t3-t4-t5-t6 join to be built here, but the next step of
+ * joining t1-t2 to the result would still be rejected.
+ */
+ if (!sublist)
+ entry->flags |= PGPA_TE_MATCH_FULL;
+ return sublist ? PGPA_JO_DENIED : PGPA_JO_PERMITTED;
+}
+
+/*
+ * Does a certain item of join method advice permit a certain join?
+ *
+ * Advice such as HASH_JOIN((x y)) means that there should be a hash join with
+ * exactly x and y on the inner side. Obviously, this means that if we are
+ * considering a join with exactly x and y on the inner side, we should enforce
+ * the use of a hash join. However, it also means that we must reject some
+ * incompatible join orders entirely. For example, a join with exactly x
+ * and y on the outer side shouldn't be allowed, because such paths might win
+ * over the advice-driven path on cost.
+ *
+ * To accommodate these requirements, this function returns true if the join
+ * should be allowed and false if it should not. Furthermore, *restrict_method
+ * is set to true if the join method should be enforced and false if not.
+ */
+static bool
+pgpa_join_method_permits_join(int outer_count, int inner_count,
+ pgpa_identifier *rids,
+ pgpa_trove_entry *entry,
+ bool *restrict_method)
+{
+ pgpa_advice_target *target = entry->target;
+ pgpa_itm_type inner_itm;
+ pgpa_itm_type outer_itm;
+ pgpa_itm_type join_itm;
+
+ /* We definitely have at least a partial match for this trove entry. */
+ entry->flags |= PGPA_TE_MATCH_PARTIAL;
+
+ *restrict_method = false;
+
+ /*
+ * If our inner rel mentions exactly the same relations as the advice
+ * target, allow the join and enforce the join method restriction.
+ *
+ * If our inner rel mentions a superset of the target relations, allow the
+ * join. The join we care about has already taken place, and this advice
+ * imposes no further restrictions.
+ */
+ inner_itm = pgpa_identifiers_match_target(inner_count,
+ rids + outer_count,
+ target);
+ if (inner_itm == PGPA_ITM_EQUAL)
+ {
+ entry->flags |= PGPA_TE_MATCH_FULL;
+ *restrict_method = true;
+ return true;
+ }
+ else if (inner_itm == PGPA_ITM_TARGETS_ARE_SUBSET)
+ return true;
+
+ /*
+	 * If our outer rel mentions a superset of the relations in the advice
+	 * target, no restrictions apply. The join we care about has already taken
+ * place, and this advice imposes no further restrictions.
+ *
+ * On the other hand, if our outer rel mentions exactly the relations
+ * mentioned in the advice target, the planner is trying to reverse the
+ * sides of the join as compared with our desired outcome. Reject that.
+ */
+ outer_itm = pgpa_identifiers_match_target(outer_count,
+ rids, target);
+ if (outer_itm == PGPA_ITM_TARGETS_ARE_SUBSET)
+ return true;
+ else if (outer_itm == PGPA_ITM_EQUAL)
+ return false;
+
+ /*
+ * If the advice target mentions only a single relation, the test below
+ * cannot ever pass, so save some work by exiting now.
+ */
+ if (target->ttype == PGPA_TARGET_IDENTIFIER)
+ return false;
+
+ /*
+ * If everything in the joinrel appears in the advice target, we're below
+ * the level of the join we want to control.
+ *
+ * For example, HASH_JOIN((x y)) doesn't restrict how x and y can be
+ * joined.
+ *
+ * This lookup shouldn't return PGPA_ITM_DISJOINT, because any such advice
+ * should not have been returned from the trove in the first place.
+ */
+ join_itm = pgpa_identifiers_match_target(outer_count + inner_count,
+ rids, target);
+ Assert(join_itm != PGPA_ITM_DISJOINT);
+ if (join_itm == PGPA_ITM_KEYS_ARE_SUBSET ||
+ join_itm == PGPA_ITM_EQUAL)
+ return true;
+
+ /*
+ * We've already permitted all allowable cases, so reject this.
+ *
+ * If we reach this point, then the advice overlaps with this join but
+ * isn't entirely contained within either side, and there's also at least
+ * one relation present in the join that isn't mentioned by the advice.
+ *
+ * For instance, in the HASH_JOIN((x y)) example, we would reach here if x
+ * were on one side of the join, y on the other, and at least one of the
+ * two sides also included some other relation, say t. In that case,
+ * accepting this join would allow the (x y t) joinrel to contain
+ * non-disabled paths that do not put (x y) on the inner side of a hash
+ * join; we could instead end up with something like (x JOIN t) JOIN y.
+ */
+ return false;
+}
+
+/*
+ * Does advice concerning an opaque join permit a certain join?
+ *
+ * By an opaque join, we mean one where the exact mechanism by which the
+ * join is performed is not visible to PostgreSQL. Currently this is the
+ * case only for foreign joins: FOREIGN_JOIN((x y z)) means that x, y, and
+ * z are joined on the remote side, but we know nothing about the join order
+ * or join methods used over there.
+ *
+ * The logic here needs to differ from pgpa_join_method_permits_join because,
+ * for other join types, the advice target is the set of inner rels; here, it
+ * includes both inner and outer rels.
+ */
+static bool
+pgpa_opaque_join_permits_join(int outer_count, int inner_count,
+ pgpa_identifier *rids,
+ pgpa_trove_entry *entry,
+ bool *restrict_method)
+{
+ pgpa_advice_target *target = entry->target;
+ pgpa_itm_type join_itm;
+
+ /* We definitely have at least a partial match for this trove entry. */
+ entry->flags |= PGPA_TE_MATCH_PARTIAL;
+
+ *restrict_method = false;
+
+ join_itm = pgpa_identifiers_match_target(outer_count + inner_count,
+ rids, target);
+ if (join_itm == PGPA_ITM_EQUAL)
+ {
+ /*
+ * We have an exact match, and should therefore allow the join and
+ * enforce the use of the relevant opaque join method.
+ */
+ entry->flags |= PGPA_TE_MATCH_FULL;
+ *restrict_method = true;
+ return true;
+ }
+
+ if (join_itm == PGPA_ITM_KEYS_ARE_SUBSET ||
+ join_itm == PGPA_ITM_TARGETS_ARE_SUBSET)
+ {
+ /*
+ * If join_itm == PGPA_ITM_TARGETS_ARE_SUBSET, then the join we care
+ * about has already taken place and no further restrictions apply.
+ *
+ * If join_itm == PGPA_ITM_KEYS_ARE_SUBSET, we're still building up to
+ * the join we care about and have not introduced any extraneous
+ * relations not named in the advice. Note that ForeignScan paths for
+ * joins are built up from ForeignScan paths from underlying joins and
+ * scans, so we must not disable this join when considering a subset
+ * of the relations we ultimately want.
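+		 *
+		 * For example, given FOREIGN_JOIN((x y z)), a proposed (x y) join
+		 * is a building block of the pushed-down join we want, so it must
+		 * stay enabled here.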
+ */
+ return true;
+ }
+
+ /*
+ * The advice overlaps the join, but at least one relation is present in
+ * the join that isn't mentioned by the advice. We want to disable such
+ * paths so that we actually push down the join as intended.
+ */
+ return false;
+}
+
+/*
+ * Does advice concerning a semijoin permit a certain join?
+ *
+ * Unlike join method advice, which lists the rels on the inner side of the
+ * join, semijoin uniqueness advice lists the rels on the nullable side of the
+ * join. Those can be the same, if the join type is JOIN_UNIQUE_INNER or
+ * JOIN_SEMI, or they can be different, in case of JOIN_UNIQUE_OUTER or
+ * JOIN_RIGHT_SEMI.
+ *
+ * We don't know here whether the caller specified SEMIJOIN_UNIQUE or
+ * SEMIJOIN_NON_UNIQUE. The caller should check the join type against the
+ * advice type if and only if we set *restrict_method to true.
+ */
+static bool
+pgpa_semijoin_permits_join(int outer_count, int inner_count,
+ pgpa_identifier *rids,
+ pgpa_trove_entry *entry,
+ bool outer_is_nullable,
+ bool *restrict_method)
+{
+ pgpa_advice_target *target = entry->target;
+ pgpa_itm_type join_itm;
+ pgpa_itm_type inner_itm;
+ pgpa_itm_type outer_itm;
+
+ *restrict_method = false;
+
+ /* We definitely have at least a partial match for this trove entry. */
+ entry->flags |= PGPA_TE_MATCH_PARTIAL;
+
+ /*
+ * If outer rel is the nullable side and contains exactly the same
+ * relations as the advice target, then the join order is allowable, but
+ * the caller must check whether the advice tag (either SEMIJOIN_UNIQUE or
+ * SEMIJOIN_NON_UNIQUE) matches the join type.
+ *
+ * If the outer rel is a superset of the target relations, the join we
+	 * care about has already taken place, so we should impose no further
+	 * restrictions.
+ */
+ outer_itm = pgpa_identifiers_match_target(outer_count,
+ rids, target);
+ if (outer_itm == PGPA_ITM_EQUAL)
+ {
+ entry->flags |= PGPA_TE_MATCH_FULL;
+ if (outer_is_nullable)
+ {
+ *restrict_method = true;
+ return true;
+ }
+ }
+ else if (outer_itm == PGPA_ITM_TARGETS_ARE_SUBSET)
+ return true;
+
+ /* As above, but for the inner rel. */
+ inner_itm = pgpa_identifiers_match_target(inner_count,
+ rids + outer_count,
+ target);
+ if (inner_itm == PGPA_ITM_EQUAL)
+ {
+ entry->flags |= PGPA_TE_MATCH_FULL;
+ if (!outer_is_nullable)
+ {
+ *restrict_method = true;
+ return true;
+ }
+ }
+ else if (inner_itm == PGPA_ITM_TARGETS_ARE_SUBSET)
+ return true;
+
+ /*
+ * If everything in the joinrel appears in the advice target, we're below
+ * the level of the join we want to control.
+ */
+ join_itm = pgpa_identifiers_match_target(outer_count + inner_count,
+ rids, target);
+ Assert(join_itm != PGPA_ITM_DISJOINT);
+ if (join_itm == PGPA_ITM_KEYS_ARE_SUBSET ||
+ join_itm == PGPA_ITM_EQUAL)
+ return true;
+
+ /*
+ * We've tested for all allowable possibilities, and so must reject this
+ * join order. This can happen in two ways.
+ *
+	 * First, we might be considering a semijoin that overlaps incompletely
+ * with one or both sides of the join. For example, if the user has
+ * specified SEMIJOIN_UNIQUE((t1 t2)) or SEMIJOIN_NON_UNIQUE((t1 t2)), we
+ * should reject a proposed t2-t3 join, since that could not result in a
+ * final plan compatible with the advice.
+ *
+ * Second, we might be considering a semijoin where the advice target
+ * perfectly matches one side of the join, but it's the wrong one. For
+ * example, in the example above, we might see a 3-way join between t1,
+ * t2, and t3, with (t1 t2) on the non-nullable side. That, too, would be
+ * incompatible with the advice.
+ */
+ return false;
+}
+
+/*
+ * Apply scan advice to a RelOptInfo.
+ */
+static void
+pgpa_planner_apply_scan_advice(RelOptInfo *rel,
+ pgpa_trove_entry *scan_entries,
+ Bitmapset *scan_indexes,
+ pgpa_trove_entry *rel_entries,
+ Bitmapset *rel_indexes)
+{
+ bool gather_conflict = false;
+ Bitmapset *gather_partial_match = NULL;
+ Bitmapset *gather_full_match = NULL;
+ int i = -1;
+ pgpa_trove_entry *scan_entry = NULL;
+ int flags;
+ bool scan_type_conflict = false;
+ Bitmapset *scan_type_indexes = NULL;
+ Bitmapset *scan_type_rel_indexes = NULL;
+ uint64 gather_mask = 0;
+ uint64 scan_type = 0;
+
+ /* Scrutinize available scan advice. */
+ while ((i = bms_next_member(scan_indexes, i)) >= 0)
+ {
+ pgpa_trove_entry *my_entry = &scan_entries[i];
+ uint64 my_scan_type = 0;
+
+ /* Translate our advice tags to a scan strategy advice value. */
+ if (my_entry->tag == PGPA_TAG_BITMAP_HEAP_SCAN)
+ {
+ /*
+			 * Clearing PGS_CONSIDER_INDEXONLY can suppress Bitmap Heap Scans,
+ * so don't clear it when such a scan is requested. This happens
+ * because build_index_scan() thinks that the possibility of an
+ * index-only scan is a sufficient reason to consider using an
+ * otherwise-useless index, and get_index_paths() thinks that the
+ * same paths that are useful for index or index-only scans should
+ * also be considered for bitmap scans. Perhaps that logic should
+ * be tightened up, but until then we need to include
+ * PGS_CONSIDER_INDEXONLY in my_scan_type here.
+ */
+ my_scan_type = PGS_BITMAPSCAN | PGS_CONSIDER_INDEXONLY;
+ }
+ else if (my_entry->tag == PGPA_TAG_INDEX_ONLY_SCAN)
+ my_scan_type = PGS_INDEXONLYSCAN | PGS_CONSIDER_INDEXONLY;
+ else if (my_entry->tag == PGPA_TAG_INDEX_SCAN)
+ my_scan_type = PGS_INDEXSCAN;
+ else if (my_entry->tag == PGPA_TAG_SEQ_SCAN)
+ my_scan_type = PGS_SEQSCAN;
+ else if (my_entry->tag == PGPA_TAG_TID_SCAN)
+ my_scan_type = PGS_TIDSCAN;
+
+ /*
+ * If this is understandable scan advice, hang on to the entry, the
+		 * inferred scan type, and the index at which we found it.
+ *
+ * Also make a note if we see conflicting scan type advice. Note that
+ * we regard two index specifications as conflicting unless they match
+ * exactly. In theory, perhaps we could regard INDEX_SCAN(a c) and
+ * INDEX_SCAN(a b.c) as non-conflicting if it happens that the only
+ * index named c is in schema b, but it doesn't seem worth the code.
+ */
+ if (my_scan_type != 0)
+ {
+ if (scan_type != 0 && scan_type != my_scan_type)
+ scan_type_conflict = true;
+ if (!scan_type_conflict && scan_entry != NULL &&
+ my_entry->target->itarget != NULL &&
+ scan_entry->target->itarget != NULL &&
+ !pgpa_index_targets_equal(scan_entry->target->itarget,
+ my_entry->target->itarget))
+ scan_type_conflict = true;
+ scan_entry = my_entry;
+ scan_type = my_scan_type;
+ scan_type_indexes = bms_add_member(scan_type_indexes, i);
+ }
+ }
+
+ /* Scrutinize available gather-related and partitionwise advice. */
+ i = -1;
+ while ((i = bms_next_member(rel_indexes, i)) >= 0)
+ {
+ pgpa_trove_entry *my_entry = &rel_entries[i];
+ uint64 my_gather_mask = 0;
+ bool just_one_rel;
+
+ just_one_rel = my_entry->target->ttype == PGPA_TARGET_IDENTIFIER
+ || list_length(my_entry->target->children) == 1;
+
+ /*
+ * PARTITIONWISE behaves like a scan type, except that if there's more
+ * than one relation targeted, it has no effect at this level.
+ */
+ if (my_entry->tag == PGPA_TAG_PARTITIONWISE)
+ {
+ if (just_one_rel)
+ {
+ const uint64 my_scan_type = PGS_APPEND | PGS_MERGE_APPEND;
+
+ if (scan_type != 0 && scan_type != my_scan_type)
+ scan_type_conflict = true;
+ scan_entry = my_entry;
+ scan_type = my_scan_type;
+ scan_type_rel_indexes =
+ bms_add_member(scan_type_rel_indexes, i);
+ }
+ continue;
+ }
+
+ /*
+ * GATHER and GATHER_MERGE applied to a single rel mean that we should
+		 * use the corresponding strategy here, while applying either to more
+ * than one rel means we should not use those strategies here, but
+ * rather at the level of the joinrel that corresponds to what was
+ * specified. NO_GATHER can only be applied to single rels.
+ *
+ * Note that setting PGS_CONSIDER_NONPARTIAL in my_gather_mask is
+ * equivalent to allowing the non-use of either form of Gather here.
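+		 *
+		 * For example, GATHER advice naming only t1 in effect forces a
+		 * Gather above the scan of t1, while GATHER advice naming (t1 t2)
+		 * forbids Gather and Gather Merge here and leaves enforcement to
+		 * the corresponding joinrel.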
+ */
+ if (my_entry->tag == PGPA_TAG_GATHER ||
+ my_entry->tag == PGPA_TAG_GATHER_MERGE)
+ {
+ if (!just_one_rel)
+ my_gather_mask = PGS_CONSIDER_NONPARTIAL;
+ else if (my_entry->tag == PGPA_TAG_GATHER)
+ my_gather_mask = PGS_GATHER;
+ else
+ my_gather_mask = PGS_GATHER_MERGE;
+ }
+ else if (my_entry->tag == PGPA_TAG_NO_GATHER)
+ {
+ Assert(just_one_rel);
+ my_gather_mask = PGS_CONSIDER_NONPARTIAL;
+ }
+
+ /*
+ * If we set my_gather_mask up above, then we (1) make a note if the
+ * advice conflicted, (2) remember the mask value, and (3) remember
+ * whether this was a full or partial match.
+ */
+ if (my_gather_mask != 0)
+ {
+ if (gather_mask != 0 && gather_mask != my_gather_mask)
+ gather_conflict = true;
+ gather_mask = my_gather_mask;
+ if (just_one_rel)
+ gather_full_match = bms_add_member(gather_full_match, i);
+ else
+ gather_partial_match = bms_add_member(gather_partial_match, i);
+ }
+ }
+
+ /* Enforce choice of index. */
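+
+	/*
+	 * For example, INDEX_SCAN(a c) means that the scan of "a" must use the
+	 * index named "c". We enforce that by marking every other index on the
+	 * relation as disabled, so that no path using it can win on cost.
+	 */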
+ if (scan_entry != NULL && !scan_type_conflict &&
+ (scan_entry->tag == PGPA_TAG_INDEX_SCAN ||
+ scan_entry->tag == PGPA_TAG_INDEX_ONLY_SCAN))
+ {
+ pgpa_index_target *itarget = scan_entry->target->itarget;
+ IndexOptInfo *matched_index = NULL;
+
+ foreach_node(IndexOptInfo, index, rel->indexlist)
+ {
+ char *relname = get_rel_name(index->indexoid);
+ Oid nspoid = get_rel_namespace(index->indexoid);
+ char *relnamespace = get_namespace_name_or_temp(nspoid);
+
+ if (strcmp(itarget->indname, relname) == 0 &&
+ (itarget->indnamespace == NULL ||
+ strcmp(itarget->indnamespace, relnamespace) == 0))
+ {
+ matched_index = index;
+ break;
+ }
+ }
+
+ if (matched_index == NULL)
+ {
+ /* Don't force the scan type if the index doesn't exist. */
+ scan_type = 0;
+
+ /* Mark advice as inapplicable. */
+ pgpa_trove_set_flags(scan_entries, scan_type_indexes,
+ PGPA_TE_INAPPLICABLE);
+ }
+ else
+ {
+ /* Disable every other index. */
+ foreach_node(IndexOptInfo, index, rel->indexlist)
+ {
+ if (index != matched_index)
+ index->disabled = true;
+ }
+ }
+ }
+
+ /*
+ * Mark all the scan method entries as fully matched; and if they specify
+ * different things, mark them all as conflicting.
+ */
+ flags = PGPA_TE_MATCH_PARTIAL | PGPA_TE_MATCH_FULL;
+ if (scan_type_conflict)
+ flags |= PGPA_TE_CONFLICTING;
+ pgpa_trove_set_flags(scan_entries, scan_type_indexes, flags);
+ pgpa_trove_set_flags(rel_entries, scan_type_rel_indexes, flags);
+
+ /*
+ * Mark every Gather-related piece of advice as partially matched. Mark
+ * the ones that included this relation as a target by itself as fully
+ * matched. If there was a conflict, mark them all as conflicting.
+ */
+ flags = PGPA_TE_MATCH_PARTIAL;
+ if (gather_conflict)
+ flags |= PGPA_TE_CONFLICTING;
+ pgpa_trove_set_flags(rel_entries, gather_partial_match, flags);
+ flags |= PGPA_TE_MATCH_FULL;
+ pgpa_trove_set_flags(rel_entries, gather_full_match, flags);
+
+ /*
+ * Enforce restrictions on the scan type and use of Gather/Gather Merge.
+ * Only clear bits here, so that we still respect the enable_* GUCs. Do
+ * nothing in cases where the advice on a single topic conflicts.
+ */
+ if (scan_type != 0 && !scan_type_conflict)
+ {
+ uint64 all_scan_mask;
+
+ all_scan_mask = PGS_SCAN_ANY | PGS_APPEND | PGS_MERGE_APPEND |
+ PGS_CONSIDER_INDEXONLY;
+ rel->pgs_mask &= ~(all_scan_mask & ~scan_type);
+ }
+ if (gather_mask != 0 && !gather_conflict)
+ {
+ uint64 all_gather_mask;
+
+ all_gather_mask =
+ PGS_GATHER | PGS_GATHER_MERGE | PGS_CONSIDER_NONPARTIAL;
+ rel->pgs_mask &= ~(all_gather_mask & ~gather_mask);
+ }
+}
+
+/*
+ * Add feedback entries for one trove slice to the provided list and
+ * return the resulting list.
+ *
+ * Feedback entries are generated from the trove entry's flags. It's assumed
+ * that the caller has already set all relevant flags with the exception of
+ * PGPA_TE_FAILED. We set that flag here if appropriate.
+ */
+static List *
+pgpa_planner_append_feedback(List *list, pgpa_trove *trove,
+ pgpa_trove_lookup_type type,
+ pgpa_identifier *rt_identifiers,
+ pgpa_plan_walker_context *walker)
+{
+ pgpa_trove_entry *entries;
+ int nentries;
+
+ pgpa_trove_lookup_all(trove, type, &entries, &nentries);
+ for (int i = 0; i < nentries; ++i)
+ {
+ pgpa_trove_entry *entry = &entries[i];
+ DefElem *item;
+
+ /*
+ * If this entry was fully matched, check whether generating advice
+ * from this plan would produce such an entry. If not, label the entry
+ * as failed.
+ */
+ if ((entry->flags & PGPA_TE_MATCH_FULL) != 0 &&
+ !pgpa_walker_would_advise(walker, rt_identifiers,
+ entry->tag, entry->target))
+ entry->flags |= PGPA_TE_FAILED;
+
+ item = makeDefElem(pgpa_cstring_trove_entry(entry),
+ (Node *) makeInteger(entry->flags), -1);
+ list = lappend(list, item);
+ }
+
+ return list;
+}
+
+/*
+ * Emit a WARNING to tell the user about a problem with the supplied plan advice.
+ */
+static void
+pgpa_planner_feedback_warning(List *feedback)
+{
+ StringInfoData detailbuf;
+ StringInfoData flagbuf;
+
+ /* Quick exit if there's no feedback. */
+ if (feedback == NIL)
+ return;
+
+ /* Initialize buffers. */
+ initStringInfo(&detailbuf);
+ initStringInfo(&flagbuf);
+
+ /* Main loop. */
+ foreach_node(DefElem, item, feedback)
+ {
+ int flags = defGetInt32(item);
+
+ /*
+ * Don't emit anything if it was fully matched with no problems found.
+ *
+ * NB: Feedback should never be marked fully matched without also
+ * being marked partially matched.
+ */
+ if (flags == (PGPA_TE_MATCH_PARTIAL | PGPA_TE_MATCH_FULL))
+ continue;
+
+ /*
+ * Terminate each detail line except the last with a newline. This is
+ * also a convenient place to reset flagbuf.
+ */
+ if (detailbuf.len > 0)
+ {
+ appendStringInfoChar(&detailbuf, '\n');
+ resetStringInfo(&flagbuf);
+ }
+
+ /* Generate output. */
+ pgpa_trove_append_flags(&flagbuf, flags);
+ appendStringInfo(&detailbuf, _("advice %s feedback is \"%s\""),
+ item->defname, flagbuf.data);
+ }
+
+ /* Emit the warning, if any problems were found. */
+ if (detailbuf.len > 0)
+ ereport(WARNING,
+ errmsg("supplied plan advice was not enforced"),
+ errdetail("%s", detailbuf.data));
+}
+
+#ifdef USE_ASSERT_CHECKING
+
+/*
+ * Fast hash function for a key consisting of an RTI and plan name.
+ */
+static uint32
+pgpa_ri_checker_hash_key(pgpa_ri_checker_key key)
+{
+ fasthash_state hs;
+ int sp_len;
+
+ fasthash_init(&hs, 0);
+
+ hs.accum = key.rti;
+ fasthash_combine(&hs);
+
+ /* plan_name can be NULL */
+ if (key.plan_name == NULL)
+ sp_len = 0;
+ else
+ sp_len = fasthash_accum_cstring(&hs, key.plan_name);
+
+ /* hashfn_unstable.h recommends using string length as tweak */
+ return fasthash_final32(&hs, sp_len);
+}
+
+#endif
+
+/*
+ * Save the range table identifier for one relation for future cross-checking.
+ */
+static void
+pgpa_ri_checker_save(pgpa_planner_state *pps, PlannerInfo *root,
+ RelOptInfo *rel)
+{
+#ifdef USE_ASSERT_CHECKING
+ pgpa_ri_checker_key key;
+ pgpa_ri_checker *check;
+ pgpa_identifier rid;
+ const char *rid_string;
+ bool found;
+
+ key.rti = bms_singleton_member(rel->relids);
+ key.plan_name = root->plan_name;
+ pgpa_compute_identifier_by_rti(root, key.rti, &rid);
+ rid_string = pgpa_identifier_string(&rid);
+ check = pgpa_ri_check_insert(pps->ri_check_hash, key, &found);
+ Assert(!found || strcmp(check->rid_string, rid_string) == 0);
+ check->rid_string = rid_string;
+#endif
+}
+
+/*
+ * Validate that the range table identifiers we were able to generate during
+ * planning match the ones we generated from the final plan.
+ */
+static void
+pgpa_ri_checker_validate(pgpa_planner_state *pps, PlannedStmt *pstmt)
+{
+#ifdef USE_ASSERT_CHECKING
+ pgpa_identifier *rt_identifiers;
+ pgpa_ri_check_iterator it;
+ pgpa_ri_checker *check;
+
+ /* Create identifiers from the planned statement. */
+ rt_identifiers = pgpa_create_identifiers_for_planned_stmt(pstmt);
+
+ /* Iterate over identifiers created during planning, so we can compare. */
+ pgpa_ri_check_start_iterate(pps->ri_check_hash, &it);
+ while ((check = pgpa_ri_check_iterate(pps->ri_check_hash, &it)) != NULL)
+ {
+ int rtoffset = 0;
+ const char *rid_string;
+ Index flat_rti;
+
+ /*
+ * If there's no plan name associated with this entry, then the
+ * rtoffset is 0. Otherwise, we can search the SubPlanRTInfo list to
+ * find the rtoffset.
+ */
+ if (check->key.plan_name != NULL)
+ {
+ foreach_node(SubPlanRTInfo, rtinfo, pstmt->subrtinfos)
+ {
+ /*
+ * If rtinfo->dummy is set, then the subquery's range table
+ * will only have been partially copied to the final range
+ * table. Specifically, only RTE_RELATION entries and
+ * RTE_SUBQUERY entries that were once RTE_RELATION entries
+ * will be copied, as per add_rtes_to_flat_rtable. Therefore,
+ * there's no fixed rtoffset that we can apply to the RTIs
+ * used during planning to locate the corresponding relations
+ * in the final rtable.
+ *
+ * With more complex logic, we could work around that problem
+ * by remembering the whole contents of the subquery's rtable
+ * during planning, determining which of those would have been
+ * copied to the final rtable, and matching them up. But it
+ * doesn't seem like a worthwhile endeavor for right now,
+ * because RTIs from such subqueries won't appear in the plan
+ * tree itself, just in the range table. Hence, we can neither
+ * generate nor accept advice for them.
+ */
+ if (strcmp(check->key.plan_name, rtinfo->plan_name) == 0
+ && !rtinfo->dummy)
+ {
+ rtoffset = rtinfo->rtoffset;
+ Assert(rtoffset > 0);
+ break;
+ }
+ }
+
+ /*
+ * It's not an error if we don't find the plan name: that just
+ * means that we planned a subplan by this name but it ended up
+ * being a dummy subplan and so wasn't included in the final plan
+ * tree.
+ */
+ if (rtoffset == 0)
+ continue;
+ }
+
+ /*
+ * check->key.rti is the RTI that we saw prior to range-table
+ * flattening, so we must add the appropriate RT offset to get the
+ * final RTI.
+ */
+ flat_rti = check->key.rti + rtoffset;
+ Assert(flat_rti <= list_length(pstmt->rtable));
+
+ /* Assert that the string we compute now matches the previous one. */
+ rid_string = pgpa_identifier_string(&rt_identifiers[flat_rti - 1]);
+ Assert(strcmp(rid_string, check->rid_string) == 0);
+ }
+#endif
+}
+
+/*
+ * Convert a bitmapset to a C string of comma-separated integers.
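+ *
+ * For example, a set containing 1, 3, and 4 yields "1, 3, 4", while an
+ * empty set yields "none".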
+ */
+static char *
+pgpa_bms_to_cstring(Bitmapset *bms)
+{
+ StringInfoData buf;
+ int x = -1;
+
+ if (bms_is_empty(bms))
+ return "none";
+
+ initStringInfo(&buf);
+ while ((x = bms_next_member(bms, x)) >= 0)
+ {
+ if (buf.len > 0)
+ appendStringInfo(&buf, ", %d", x);
+ else
+ appendStringInfo(&buf, "%d", x);
+ }
+
+ return buf.data;
+}
+
+/*
+ * Convert a JoinType to a C string.
+ */
+static const char *
+pgpa_jointype_to_cstring(JoinType jointype)
+{
+ switch (jointype)
+ {
+ case JOIN_INNER:
+ return "inner";
+ case JOIN_LEFT:
+ return "left";
+ case JOIN_FULL:
+ return "full";
+ case JOIN_RIGHT:
+ return "right";
+ case JOIN_SEMI:
+ return "semi";
+ case JOIN_ANTI:
+ return "anti";
+ case JOIN_RIGHT_SEMI:
+ return "right semi";
+ case JOIN_RIGHT_ANTI:
+ return "right anti";
+ case JOIN_UNIQUE_OUTER:
+ return "unique outer";
+ case JOIN_UNIQUE_INNER:
+ return "unique inner";
+ }
+ return "???";
+}
diff --git a/contrib/pg_plan_advice/pgpa_planner.h b/contrib/pg_plan_advice/pgpa_planner.h
new file mode 100644
index 00000000000..7d40b910b00
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_planner.h
@@ -0,0 +1,17 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgpa_planner.h
+ * planner hooks
+ *
+ * Copyright (c) 2016-2025, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_planner.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PGPA_PLANNER_H
+#define PGPA_PLANNER_H
+
+extern void pgpa_planner_install_hooks(void);
+
+#endif
diff --git a/contrib/pg_plan_advice/pgpa_scan.c b/contrib/pg_plan_advice/pgpa_scan.c
new file mode 100644
index 00000000000..75d1a3efa36
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_scan.c
@@ -0,0 +1,288 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgpa_scan.c
+ * analysis of scans in Plan trees
+ *
+ * Copyright (c) 2016-2025, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_scan.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "pgpa_scan.h"
+#include "pgpa_walker.h"
+
+#include "nodes/parsenodes.h"
+#include "parser/parsetree.h"
+
+static pgpa_scan *pgpa_make_scan(pgpa_plan_walker_context *walker, Plan *plan,
+ pgpa_scan_strategy strategy,
+ Bitmapset *relids);
+
+static RTEKind unique_nonjoin_rtekind(Bitmapset *relids, List *rtable);
+
+/*
+ * Build a pgpa_scan object for a Plan node and update the plan walker
+ * context as appropriate. If this is an Append or MergeAppend scan, also
+ * build pgpa_scan for any scans that were consolidated into this one by
+ * Append/MergeAppend pull-up.
+ *
+ * If there is at least one ElidedNode for this plan node, pass the uppermost
+ * one as elided_node, else pass NULL.
+ *
+ * Set the 'beneath_any_gather' flag if we are underneath a Gather or
+ * Gather Merge node (except for a single-copy Gather node, for which
+ * GATHER or GATHER_MERGE advice should not be emitted).
+ *
+ * Set the 'within_join_problem' flag if we're inside of a join problem and
+ * not otherwise.
+ */
+pgpa_scan *
+pgpa_build_scan(pgpa_plan_walker_context *walker, Plan *plan,
+ ElidedNode *elided_node,
+ bool beneath_any_gather, bool within_join_problem)
+{
+ pgpa_scan_strategy strategy = PGPA_SCAN_ORDINARY;
+ Bitmapset *relids = NULL;
+ int rti = -1;
+ List *child_append_relid_sets = NIL;
+
+ if (elided_node != NULL)
+ {
+ NodeTag elided_type = elided_node->elided_type;
+
+ /*
+ * If setrefs processing elided an Append or MergeAppend node that had
+		 * only one surviving child, this is either a setop over subqueries
+		 * or a partitionwise operation (which might be a scan or a join in
+		 * reality, but here we don't care about the distinction and
+		 * consider it simply a scan).
+		 *
+		 * A setop over subqueries, or a trivial SubqueryScan that was elided,
+		 * is an "ordinary" scan, i.e. one for which we need not generate
+		 * advice, because the planner has not made any meaningful choice.
+ */
+ relids = elided_node->relids;
+ if ((elided_type == T_Append || elided_type == T_MergeAppend) &&
+ unique_nonjoin_rtekind(relids,
+ walker->pstmt->rtable) == RTE_RELATION)
+ strategy = PGPA_SCAN_PARTITIONWISE;
+ else
+ strategy = PGPA_SCAN_ORDINARY;
+
+ /* Join RTIs can be present, but advice never refers to them. */
+ relids = pgpa_filter_out_join_relids(relids, walker->pstmt->rtable);
+ }
+ else if ((rti = pgpa_scanrelid(plan)) != 0)
+ {
+ relids = bms_make_singleton(rti);
+
+ switch (nodeTag(plan))
+ {
+ case T_SeqScan:
+ strategy = PGPA_SCAN_SEQ;
+ break;
+ case T_BitmapHeapScan:
+ strategy = PGPA_SCAN_BITMAP_HEAP;
+ break;
+ case T_IndexScan:
+ strategy = PGPA_SCAN_INDEX;
+ break;
+ case T_IndexOnlyScan:
+ strategy = PGPA_SCAN_INDEX_ONLY;
+ break;
+ case T_TidScan:
+ case T_TidRangeScan:
+ strategy = PGPA_SCAN_TID;
+ break;
+ default:
+
+ /*
+ * This case includes a ForeignScan targeting a single
+ * relation; no other strategy is possible in that case, but
+ * see below, where things are different in multi-relation
+ * cases.
+ */
+ strategy = PGPA_SCAN_ORDINARY;
+ break;
+ }
+ }
+ else if ((relids = pgpa_relids(plan)) != NULL)
+ {
+ switch (nodeTag(plan))
+ {
+ case T_ForeignScan:
+
+ /*
+ * If multiple relations are being targeted by a single
+ * foreign scan, then the foreign join has been pushed to the
+ * remote side, and we want that to be reflected in the
+ * generated advice.
+ */
+ strategy = PGPA_SCAN_FOREIGN;
+ break;
+ case T_Append:
+
+ /*
+				 * Append nodes can represent partitionwise scans of a
+ * relation, but when they implement a set operation, they are
+ * just ordinary scans.
+ */
+ if (unique_nonjoin_rtekind(relids, walker->pstmt->rtable)
+ == RTE_RELATION)
+ strategy = PGPA_SCAN_PARTITIONWISE;
+ else
+ strategy = PGPA_SCAN_ORDINARY;
+
+ /* Be sure to account for pulled-up scans. */
+ child_append_relid_sets =
+ ((Append *) plan)->child_append_relid_sets;
+ break;
+ case T_MergeAppend:
+			/* Same logic here as for Append, above. */
+ if (unique_nonjoin_rtekind(relids, walker->pstmt->rtable)
+ == RTE_RELATION)
+ strategy = PGPA_SCAN_PARTITIONWISE;
+ else
+ strategy = PGPA_SCAN_ORDINARY;
+
+ /* Be sure to account for pulled-up scans. */
+ child_append_relid_sets =
+ ((MergeAppend *) plan)->child_append_relid_sets;
+ break;
+ default:
+ strategy = PGPA_SCAN_ORDINARY;
+ break;
+ }
+
+ /* Join RTIs can be present, but advice never refers to them. */
+ relids = pgpa_filter_out_join_relids(relids, walker->pstmt->rtable);
+ }
+
+ /*
+ * If this is an Append or MergeAppend node into which subordinate Append
+ * or MergeAppend paths were merged, each of those merged paths is
+ * effectively another scan for which we need to account.
+ */
+ foreach_node(Bitmapset, child_relids, child_append_relid_sets)
+ {
+ Bitmapset *child_nonjoin_relids;
+
+ child_nonjoin_relids =
+ pgpa_filter_out_join_relids(child_relids,
+ walker->pstmt->rtable);
+ (void) pgpa_make_scan(walker, plan, strategy,
+ child_nonjoin_relids);
+ }
+
+ /*
+ * If this plan node has no associated RTIs, it's not a scan. When the
+ * 'within_join_problem' flag is set, that's unexpected, so throw an
+ * error, else return quietly.
+ */
+ if (relids == NULL)
+ {
+ if (within_join_problem)
+ elog(ERROR, "plan node has no RTIs: %d", (int) nodeTag(plan));
+ return NULL;
+ }
+
+ /*
+ * Add the appropriate set of RTIs to walker->no_gather_scans.
+ *
+ * Add nothing if we're beneath a Gather or Gather Merge node, since
+ * NO_GATHER advice is clearly inappropriate in that situation.
+ *
+	 * Add nothing if this is an Append or MergeAppend node; we'll emit
+	 * NO_GATHER() for the underlying scans, which is good enough.
+ *
+ * Add nothing if this is an elided node. If it's an elided Append or
+ * MergeAppend node, the same argument applies as for a non-elided Append
+ * or MergeAppend. An elided SubqueryScan is likely to have underlying
+ * tables as well, but even if it doesn't, emitting NO_GATHER() for a
+ * non-RTE_RELATION won't work anyway, since get_relation_info() isn't
+ * called in such cases.
+ *
+ * In fact, we need to filter out any non-RTE_RELATION RTIs for exactly
+ * this reason, and avoid adding them to the no_gather_scans set.
+ */
+ if (!beneath_any_gather && elided_node == NULL &&
+ !IsA(plan, Append) && !IsA(plan, MergeAppend))
+ {
+ int no_gather_rti = -1;
+
+ while ((no_gather_rti = bms_next_member(relids, no_gather_rti)) >= 0)
+ {
+ RangeTblEntry *rte;
+
+ rte = rt_fetch(no_gather_rti, walker->pstmt->rtable);
+ if (rte->rtekind == RTE_RELATION)
+ walker->no_gather_scans =
+ bms_add_member(walker->no_gather_scans, no_gather_rti);
+ }
+ }
+
+ /* Caller tells us whether NO_GATHER() advice for this scan is needed. */
+ return pgpa_make_scan(walker, plan, strategy, relids);
+}
+
+/*
+ * Create a single pgpa_scan object and update the pgpa_plan_walker_context.
+ */
+static pgpa_scan *
+pgpa_make_scan(pgpa_plan_walker_context *walker, Plan *plan,
+ pgpa_scan_strategy strategy, Bitmapset *relids)
+{
+ pgpa_scan *scan;
+
+ /* Create the scan object. */
+ scan = palloc(sizeof(pgpa_scan));
+ scan->plan = plan;
+ scan->strategy = strategy;
+ scan->relids = relids;
+
+ /* Add it to the appropriate list. */
+ walker->scans[scan->strategy] = lappend(walker->scans[scan->strategy],
+ scan);
+
+ return scan;
+}
+
+/*
+ * Determine the unique rtekind of a set of relids.
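+ *
+ * RTE_JOIN entries are ignored. If the remaining RTEs are not all of the
+ * same kind, or if there are none at all, we raise an error.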
+ */
+static RTEKind
+unique_nonjoin_rtekind(Bitmapset *relids, List *rtable)
+{
+ int rti = -1;
+ bool first = true;
+ RTEKind rtekind;
+
+ Assert(relids != NULL);
+
+ while ((rti = bms_next_member(relids, rti)) >= 0)
+ {
+ RangeTblEntry *rte = rt_fetch(rti, rtable);
+
+ if (rte->rtekind == RTE_JOIN)
+ continue;
+
+ if (first)
+ {
+ rtekind = rte->rtekind;
+ first = false;
+ }
+ else if (rtekind != rte->rtekind)
+ elog(ERROR, "rtekind mismatch: %d vs. %d",
+ rtekind, rte->rtekind);
+ }
+
+ if (first)
+ elog(ERROR, "no non-RTE_JOIN RTEs found");
+
+ return rtekind;
+}
diff --git a/contrib/pg_plan_advice/pgpa_scan.h b/contrib/pg_plan_advice/pgpa_scan.h
new file mode 100644
index 00000000000..3bb8726ff1e
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_scan.h
@@ -0,0 +1,85 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgpa_scan.h
+ * analysis of scans in Plan trees
+ *
+ * For purposes of this module, a "scan" includes (1) single plan nodes that
+ * scan multiple RTIs, such as a degenerate Result node that replaces what
+ * would otherwise have been a join, and (2) Append and MergeAppend nodes
+ * implementing a partitionwise scan or a partitionwise join. Said
+ * differently, scans are the leaves of the join tree for a single join
+ * problem.
+ *
+ * Copyright (c) 2016-2025, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_scan.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PGPA_SCAN_H
+#define PGPA_SCAN_H
+
+#include "nodes/plannodes.h"
+
+typedef struct pgpa_plan_walker_context pgpa_plan_walker_context;
+
+/*
+ * Scan strategies.
+ *
+ * PGPA_SCAN_ORDINARY is any scan strategy that isn't interesting to us
+ * because there is no meaningful planner decision involved. For example,
+ * the only way to scan a subquery is a SubqueryScan, and the only way to
+ * scan a VALUES construct is a ValuesScan. We need not care exactly which
+ * type of planner node was used in such cases, because the same thing will
+ * happen when replanning.
+ *
+ * PGPA_SCAN_ORDINARY also includes Result nodes that correspond to scans
+ * or even joins that are proved empty. We don't know whether or not the scan
+ * or join will still be provably empty at replanning time, but if it is,
+ * then no scan-type advice is needed, and if it's not, we can't recommend
+ * a scan type based on the current plan.
+ *
+ * PGPA_SCAN_PARTITIONWISE also lumps together scans and joins: this can
+ * be either a partitionwise scan of a partitioned table or a partitionwise
+ * join between several partitioned tables. Note that all decisions about
+ * whether or not to use partitionwise join are meaningful: no matter what
+ * we decided this time, we could do more or fewer things partitionwise the
+ * next time.
+ *
+ * PGPA_SCAN_FOREIGN is only used when there's more than one relation involved;
+ * a single-table foreign scan is classified as ordinary, since there is no
+ * decision to make in that case.
+ *
+ * Other scan strategies map one-to-one to plan nodes.
+ */
+typedef enum
+{
+ PGPA_SCAN_ORDINARY = 0,
+ PGPA_SCAN_SEQ,
+ PGPA_SCAN_BITMAP_HEAP,
+ PGPA_SCAN_FOREIGN,
+ PGPA_SCAN_INDEX,
+ PGPA_SCAN_INDEX_ONLY,
+ PGPA_SCAN_PARTITIONWISE,
+ PGPA_SCAN_TID
+ /* update NUM_PGPA_SCAN_STRATEGY if you add anything here */
+} pgpa_scan_strategy;
+
+#define NUM_PGPA_SCAN_STRATEGY ((int) PGPA_SCAN_TID + 1)
+
+/*
+ * All of the details we need regarding a scan.
+ */
+typedef struct pgpa_scan
+{
+ Plan *plan;
+ pgpa_scan_strategy strategy;
+ Bitmapset *relids;
+} pgpa_scan;
+
+extern pgpa_scan *pgpa_build_scan(pgpa_plan_walker_context *walker, Plan *plan,
+ ElidedNode *elided_node,
+ bool beneath_any_gather,
+ bool within_join_problem);
+
+#endif /* PGPA_SCAN_H */
diff --git a/contrib/pg_plan_advice/pgpa_scanner.l b/contrib/pg_plan_advice/pgpa_scanner.l
new file mode 100644
index 00000000000..a887735f314
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_scanner.l
@@ -0,0 +1,297 @@
+%top{
+/*
+ * Scanner for plan advice
+ *
+ * Copyright (c) 2000-2025, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_scanner.l
+ */
+#include "postgres.h"
+
+#include "common/string.h"
+#include "nodes/miscnodes.h"
+#include "parser/scansup.h"
+
+#include "pgpa_ast.h"
+#include "pgpa_parser.h"
+
+/*
+ * Extra data that we pass around during scanning.
+ *
+ * 'litbuf' is used to implement the 'xd' exclusive state, which handles
+ * double-quoted identifiers.
+ */
+typedef struct pgpa_yy_extra_type
+{
+ StringInfoData litbuf;
+} pgpa_yy_extra_type;
+
+}
+
+%{
+/* LCOV_EXCL_START */
+
+#define YY_DECL \
+ extern int pgpa_yylex(union YYSTYPE *yylval_param, List **result, \
+ char **parse_error_msg_p, yyscan_t yyscanner)
+
+/* No reason to constrain amount of data slurped */
+#define YY_READ_BUF_SIZE 16777216
+
+/* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
+#undef fprintf
+#define fprintf(file, fmt, msg) fprintf_to_ereport(fmt, msg)
+
+static void
+fprintf_to_ereport(const char *fmt, const char *msg)
+{
+ ereport(ERROR, (errmsg_internal("%s", msg)));
+}
+%}
+
+%option reentrant
+%option bison-bridge
+%option 8bit
+%option never-interactive
+%option nodefault
+%option noinput
+%option nounput
+%option noyywrap
+%option noyyalloc
+%option noyyrealloc
+%option noyyfree
+%option warn
+%option prefix="pgpa_yy"
+%option extra-type="pgpa_yy_extra_type *"
+
+/*
+ * What follows is a severely stripped-down version of the core scanner. We
+ * only care about recognizing identifiers with or without identifier quoting
+ * (i.e. double-quoting), decimal integers, and a small handful of other
+ * things. Keep these rules in sync with src/backend/parser/scan.l. As in that
+ * file, we use an exclusive state called 'xc' for C-style comments, and an
+ * exclusive state called 'xd' for double-quoted identifiers.
+ */
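+
+/*
+ * As a tokenization example (whether the grammar accepts this exact
+ * sequence is a matter for the bison parser), the advice fragment
+ *
+ * SEQ_SCAN("Foo" 2)
+ *
+ * is scanned as TOK_TAG_SIMPLE, '(', TOK_IDENT, TOK_INTEGER, ')': the
+ * unquoted tag name is downcased and classified by pgpa_parse_advice_tag(),
+ * the double-quoted identifier is accumulated in litbuf with its case
+ * preserved, and the parentheses fall through to the catch-all
+ * single-character rule.
+ */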
+%x xc
+%x xd
+
+ident_start [A-Za-z\200-\377_]
+ident_cont [A-Za-z\200-\377_0-9\$]
+
+identifier {ident_start}{ident_cont}*
+
+decdigit [0-9]
+decinteger {decdigit}(_?{decdigit})*
+
+space [ \t\n\r\f\v]
+whitespace {space}+
+
+dquote \"
+xdstart {dquote}
+xdstop {dquote}
+xddouble {dquote}{dquote}
+xdinside [^"]+
+
+xcstart \/\*
+xcstop \*+\/
+xcinside [^*/]+
+
+%%
+
+{whitespace} { /* ignore */ }
+
+{identifier} {
+ char *str;
+ bool fail;
+ pgpa_advice_tag_type tag;
+
+ /*
+ * Unlike the core scanner, we don't truncate identifiers
+ * here. There is no obvious reason to do so.
+ */
+ str = downcase_identifier(yytext, yyleng, false, false);
+ yylval->str = str;
+
+ /*
+ * If it's not a tag, just return TOK_IDENT; else, return
+ * a token type based on how further parsing should
+ * proceed.
+ */
+ tag = pgpa_parse_advice_tag(str, &fail);
+ if (fail)
+ return TOK_IDENT;
+ else if (tag == PGPA_TAG_JOIN_ORDER)
+ return TOK_TAG_JOIN_ORDER;
+ else if (tag == PGPA_TAG_INDEX_SCAN ||
+ tag == PGPA_TAG_INDEX_ONLY_SCAN)
+ return TOK_TAG_INDEX;
+ else if (tag == PGPA_TAG_SEQ_SCAN ||
+ tag == PGPA_TAG_TID_SCAN ||
+ tag == PGPA_TAG_BITMAP_HEAP_SCAN ||
+ tag == PGPA_TAG_NO_GATHER)
+ return TOK_TAG_SIMPLE;
+ else
+ return TOK_TAG_GENERIC;
+ }
+
+{decinteger} {
+ char *endptr;
+
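+ /*
+ * The {decinteger} pattern above admits underscore digit
+ * separators, but strtoint() does not accept them, so e.g.
+ * "1_000" will fail the endptr check below and be reported
+ * as an error.
+ */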
+ errno = 0;
+ yylval->integer = strtoint(yytext, &endptr, 10);
+ if (*endptr != '\0' || errno == ERANGE)
+ pgpa_yyerror(result, parse_error_msg_p, yyscanner,
+ "integer out of range");
+ return TOK_INTEGER;
+ }
+
+{xcstart} {
+ BEGIN(xc);
+ }
+
+{xdstart} {
+ BEGIN(xd);
+ resetStringInfo(&yyextra->litbuf);
+ }
+
+. { return yytext[0]; }
+
+<xc>{xcstop} {
+ BEGIN(INITIAL);
+ }
+
+<xc>{xcinside} {
+ /* discard multiple characters without slash or asterisk */
+ }
+
+<xc>. {
+ /*
+ * Discard any single character. flex prefers longer
+ * matches, so this rule will never be picked when we could
+ * have matched xcstop.
+ *
+ * NB: At present, we don't bother to support nested
+ * C-style comments here, but this logic could be extended
+ * if that restriction poses a problem.
+ */
+ }
+
+<xc><<EOF>> {
+ BEGIN(INITIAL);
+ pgpa_yyerror(result, parse_error_msg_p, yyscanner,
+ "unterminated comment");
+ }
+
+<xd>{xdstop} {
+ BEGIN(INITIAL);
+ if (yyextra->litbuf.len == 0)
+ pgpa_yyerror(result, parse_error_msg_p, yyscanner,
+ "zero-length delimited identifier");
+ yylval->str = pstrdup(yyextra->litbuf.data);
+ return TOK_IDENT;
+ }
+
+<xd>{xddouble} {
+ appendStringInfoChar(&yyextra->litbuf, '"');
+ }
+
+<xd>{xdinside} {
+ appendBinaryStringInfo(&yyextra->litbuf, yytext, yyleng);
+ }
+
+<xd><<EOF>> {
+ BEGIN(INITIAL);
+ pgpa_yyerror(result, parse_error_msg_p, yyscanner,
+ "unterminated quoted identifier");
+ }
+
+%%
+
+/* LCOV_EXCL_STOP */
+
+/*
+ * Handler for errors while scanning or parsing advice.
+ *
+ * bison passes the error message to us via 'message', and the context is
+ * available via the 'yytext' macro. We assemble those values into a final
+ * error text and then arrange to pass it back to the caller of pgpa_yyparse()
+ * by storing it into *parse_error_msg_p.
+ */
+void
+pgpa_yyerror(List **result, char **parse_error_msg_p, yyscan_t yyscanner,
+ const char *message)
+{
+ struct yyguts_t *yyg = (struct yyguts_t *) yyscanner; /* needed for yytext
+ * macro */
+
+ /* report only the first error in a parse operation */
+ if (*parse_error_msg_p)
+ return;
+
+ if (yytext[0])
+ *parse_error_msg_p = psprintf("%s at or near \"%s\"", message, yytext);
+ else
+ *parse_error_msg_p = psprintf("%s at end of input", message);
+}
+
+/*
+ * Initialize the advice scanner.
+ *
+ * This should be called before parsing begins.
+ */
+void
+pgpa_scanner_init(const char *str, yyscan_t *yyscannerp)
+{
+ yyscan_t yyscanner;
+ pgpa_yy_extra_type *yyext = palloc0_object(pgpa_yy_extra_type);
+
+ if (yylex_init(yyscannerp) != 0)
+ elog(ERROR, "yylex_init() failed: %m");
+
+ yyscanner = *yyscannerp;
+
+ initStringInfo(&yyext->litbuf);
+ pgpa_yyset_extra(yyext, yyscanner);
+
+ yy_scan_string(str, yyscanner);
+}
+
+/*
+ * Shut down the advice scanner.
+ *
+ * This should be called after parsing is complete.
+ */
+void
+pgpa_scanner_finish(yyscan_t yyscanner)
+{
+ yylex_destroy(yyscanner);
+}
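+
+/*
+ * The expected call sequence, then, is pgpa_scanner_init(), followed by
+ * pgpa_yyparse(), followed by pgpa_scanner_finish().
+ */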
+
+/*
+ * Interface functions to make flex use palloc() instead of malloc().
+ * It'd be better to make these static, but flex insists otherwise.
+ */
+
+void *
+yyalloc(yy_size_t size, yyscan_t yyscanner)
+{
+ return palloc(size);
+}
+
+void *
+yyrealloc(void *ptr, yy_size_t size, yyscan_t yyscanner)
+{
+ if (ptr)
+ return repalloc(ptr, size);
+ else
+ return palloc(size);
+}
+
+void
+yyfree(void *ptr, yyscan_t yyscanner)
+{
+ if (ptr)
+ pfree(ptr);
+}
diff --git a/contrib/pg_plan_advice/pgpa_trove.c b/contrib/pg_plan_advice/pgpa_trove.c
new file mode 100644
index 00000000000..e924959c010
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_trove.c
@@ -0,0 +1,516 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgpa_trove.c
+ * All of the advice given for a particular query, appropriately
+ * organized for convenient access.
+ *
+ * This name comes from the English expression "trove of advice", which
+ * means a collection of wisdom. This slightly unusual term is chosen to
+ * avoid naming confusion; for example, "collection of advice" would
+ * invite confusion with pgpa_collector.c. Note that, while we don't know
+ * whether the provided advice is actually wise, it's not our job to
+ * question the user's choices.
+ *
+ * The goal of this module is to make it easy to locate the specific
+ * bits of advice that pertain to any given part of a query, or to
+ * determine that there are none.
+ *
+ * Copyright (c) 2016-2025, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_trove.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "pgpa_trove.h"
+
+#include "common/hashfn_unstable.h"
+
+/*
+ * An advice trove is organized into a series of "slices", each of which
+ * contains information about one topic e.g. scan methods. Each slice consists
+ * of an array of trove entries plus a hash table that we can use to determine
+ * which ones are relevant to a particular part of the query.
+ */
+typedef struct pgpa_trove_slice
+{
+ unsigned nallocated;
+ unsigned nused;
+ pgpa_trove_entry *entries;
+ struct pgpa_trove_entry_hash *hash;
+} pgpa_trove_slice;
+
+/*
+ * Scan advice is stored into 'scan'; join advice is stored into 'join'; and
+ * advice that can apply to both cases is stored into 'rel'. This lets callers
+ * ask just for what's relevant. These slices correspond to the possible values
+ * of pgpa_trove_lookup_type.
+ */
+struct pgpa_trove
+{
+ pgpa_trove_slice join;
+ pgpa_trove_slice rel;
+ pgpa_trove_slice scan;
+};
+
+/*
+ * We're going to build a hash table to allow clients of this module to find
+ * relevant advice for a given part of the query quickly. However, we're going
+ * to use only three of the five key fields as hash keys. There are two reasons
+ * for this.
+ *
+ * First, it's allowable to set partition_schema to NULL to match a partition
+ * with the correct name in any schema.
+ *
+ * Second, we expect the "occurrence" and "partition_schema" portions of the
+ * relation identifiers to be mostly uninteresting. Most of the time, the
+ * occurrence field will be 1 and the partition_schema values will all be the
+ * same. Even when there is some variation, the absolute number of entries
+ * that have the same values for all three of these key fields should be
+ * quite small.
+ */
+typedef struct
+{
+ const char *alias_name;
+ const char *partition_name;
+ const char *plan_name;
+} pgpa_trove_entry_key;
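+
+/*
+ * For example (with hypothetical names), advice targeting partition "p1" of
+ * a relation with alias "foo" in the subquery named "sub" would be hashed
+ * under the key {"foo", "p1", "sub"}; the occurrence number and any
+ * partition schema are checked only after the hash lookup, by
+ * pgpa_trove_slice_lookup().
+ */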
+
+typedef struct
+{
+ pgpa_trove_entry_key key;
+ int status;
+ Bitmapset *indexes;
+} pgpa_trove_entry_element;
+
+static uint32 pgpa_trove_entry_hash_key(pgpa_trove_entry_key key);
+
+static inline bool
+pgpa_trove_entry_compare_key(pgpa_trove_entry_key a, pgpa_trove_entry_key b)
+{
+ if (strcmp(a.alias_name, b.alias_name) != 0)
+ return false;
+
+ if (!strings_equal_or_both_null(a.partition_name, b.partition_name))
+ return false;
+
+ if (!strings_equal_or_both_null(a.plan_name, b.plan_name))
+ return false;
+
+ return true;
+}
+
+#define SH_PREFIX pgpa_trove_entry
+#define SH_ELEMENT_TYPE pgpa_trove_entry_element
+#define SH_KEY_TYPE pgpa_trove_entry_key
+#define SH_KEY key
+#define SH_HASH_KEY(tb, key) pgpa_trove_entry_hash_key(key)
+#define SH_EQUAL(tb, a, b) pgpa_trove_entry_compare_key(a, b)
+#define SH_SCOPE static inline
+#define SH_DECLARE
+#define SH_DEFINE
+#include "lib/simplehash.h"
+
+static void pgpa_init_trove_slice(pgpa_trove_slice *tslice);
+static void pgpa_trove_add_to_slice(pgpa_trove_slice *tslice,
+ pgpa_advice_tag_type tag,
+ pgpa_advice_target *target);
+static void pgpa_trove_add_to_hash(pgpa_trove_entry_hash *hash,
+ pgpa_advice_target *target,
+ int index);
+static Bitmapset *pgpa_trove_slice_lookup(pgpa_trove_slice *tslice,
+ pgpa_identifier *rid);
+
+/*
+ * Build a trove of advice from a list of advice items.
+ *
+ * Caller can obtain a list of advice items to pass to this function by
+ * calling pgpa_parse().
+ */
+pgpa_trove *
+pgpa_build_trove(List *advice_items)
+{
+ pgpa_trove *trove = palloc_object(pgpa_trove);
+
+ pgpa_init_trove_slice(&trove->join);
+ pgpa_init_trove_slice(&trove->rel);
+ pgpa_init_trove_slice(&trove->scan);
+
+ foreach_ptr(pgpa_advice_item, item, advice_items)
+ {
+ switch (item->tag)
+ {
+ case PGPA_TAG_JOIN_ORDER:
+ {
+ pgpa_advice_target *target;
+
+ /*
+ * For most advice types, each element in the top-level
+ * list is a separate target, but it's most convenient to
+ * regard the entirety of a JOIN_ORDER specification as a
+ * single target. Since it wasn't represented that way
+ * during parsing, build a surrogate object now.
+ */
+ target = palloc0_object(pgpa_advice_target);
+ target->ttype = PGPA_TARGET_ORDERED_LIST;
+ target->children = item->targets;
+
+ pgpa_trove_add_to_slice(&trove->join,
+ item->tag, target);
+ }
+ break;
+
+ case PGPA_TAG_BITMAP_HEAP_SCAN:
+ case PGPA_TAG_INDEX_ONLY_SCAN:
+ case PGPA_TAG_INDEX_SCAN:
+ case PGPA_TAG_SEQ_SCAN:
+ case PGPA_TAG_TID_SCAN:
+
+ /*
+ * Scan advice.
+ */
+ foreach_ptr(pgpa_advice_target, target, item->targets)
+ {
+ /*
+ * For now, all of our scan types target single relations,
+ * but in the future this might not be true, e.g. a custom
+ * scan could replace a join.
+ */
+ Assert(target->ttype == PGPA_TARGET_IDENTIFIER);
+ pgpa_trove_add_to_slice(&trove->scan,
+ item->tag, target);
+ }
+ break;
+
+ case PGPA_TAG_FOREIGN_JOIN:
+ case PGPA_TAG_HASH_JOIN:
+ case PGPA_TAG_MERGE_JOIN_MATERIALIZE:
+ case PGPA_TAG_MERGE_JOIN_PLAIN:
+ case PGPA_TAG_NESTED_LOOP_MATERIALIZE:
+ case PGPA_TAG_NESTED_LOOP_MEMOIZE:
+ case PGPA_TAG_NESTED_LOOP_PLAIN:
+ case PGPA_TAG_SEMIJOIN_NON_UNIQUE:
+ case PGPA_TAG_SEMIJOIN_UNIQUE:
+
+ /*
+ * Join strategy advice.
+ */
+ foreach_ptr(pgpa_advice_target, target, item->targets)
+ {
+ pgpa_trove_add_to_slice(&trove->join,
+ item->tag, target);
+ }
+ break;
+
+ case PGPA_TAG_PARTITIONWISE:
+ case PGPA_TAG_GATHER:
+ case PGPA_TAG_GATHER_MERGE:
+ case PGPA_TAG_NO_GATHER:
+
+ /*
+ * Advice about a RelOptInfo relevant to both scans and joins.
+ */
+ foreach_ptr(pgpa_advice_target, target, item->targets)
+ {
+ pgpa_trove_add_to_slice(&trove->rel,
+ item->tag, target);
+ }
+ break;
+ }
+ }
+
+ return trove;
+}
+
+/*
+ * Search a trove of advice for relevant entries.
+ *
+ * All parameters are input parameters except for *result, which is an output
+ * parameter used to return results to the caller.
+ */
+void
+pgpa_trove_lookup(pgpa_trove *trove, pgpa_trove_lookup_type type,
+ int nrids, pgpa_identifier *rids, pgpa_trove_result *result)
+{
+ pgpa_trove_slice *tslice;
+ Bitmapset *indexes;
+
+ Assert(nrids > 0);
+
+ if (type == PGPA_TROVE_LOOKUP_SCAN)
+ tslice = &trove->scan;
+ else if (type == PGPA_TROVE_LOOKUP_JOIN)
+ tslice = &trove->join;
+ else
+ tslice = &trove->rel;
+
+ indexes = pgpa_trove_slice_lookup(tslice, &rids[0]);
+ for (int i = 1; i < nrids; ++i)
+ {
+ Bitmapset *other_indexes;
+
+ /*
+ * If the caller is asking about two relations that aren't part of the
+ * same subquery, they've messed up.
+ */
+ Assert(strings_equal_or_both_null(rids[0].plan_name,
+ rids[i].plan_name));
+
+ other_indexes = pgpa_trove_slice_lookup(tslice, &rids[i]);
+ indexes = bms_union(indexes, other_indexes);
+ }
+
+ result->entries = tslice->entries;
+ result->indexes = indexes;
+}
+
+/*
+ * Return all entries in a trove slice to the caller.
+ *
+ * The first two arguments are input arguments, and the remainder are output
+ * arguments.
+ */
+void
+pgpa_trove_lookup_all(pgpa_trove *trove, pgpa_trove_lookup_type type,
+ pgpa_trove_entry **entries, int *nentries)
+{
+ pgpa_trove_slice *tslice;
+
+ if (type == PGPA_TROVE_LOOKUP_SCAN)
+ tslice = &trove->scan;
+ else if (type == PGPA_TROVE_LOOKUP_JOIN)
+ tslice = &trove->join;
+ else
+ tslice = &trove->rel;
+
+ *entries = tslice->entries;
+ *nentries = tslice->nused;
+}
+
+/*
+ * Convert a trove entry to an item of plan advice that would produce it.
+ */
+char *
+pgpa_cstring_trove_entry(pgpa_trove_entry *entry)
+{
+ StringInfoData buf;
+
+ initStringInfo(&buf);
+ appendStringInfoString(&buf, pgpa_cstring_advice_tag(entry->tag));
+
+ /* JOIN_ORDER tags are transformed by pgpa_build_trove; undo that here */
+ if (entry->tag != PGPA_TAG_JOIN_ORDER)
+ appendStringInfoChar(&buf, '(');
+ else
+ Assert(entry->target->ttype == PGPA_TARGET_ORDERED_LIST);
+
+ pgpa_format_advice_target(&buf, entry->target);
+
+ if (entry->target->itarget != NULL)
+ {
+ appendStringInfoChar(&buf, ' ');
+ pgpa_format_index_target(&buf, entry->target->itarget);
+ }
+
+ if (entry->tag != PGPA_TAG_JOIN_ORDER)
+ appendStringInfoChar(&buf, ')');
+
+ return buf.data;
+}
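+
+/*
+ * For example (assuming pgpa_cstring_advice_tag() renders tags in their
+ * advice-string spelling), a trove entry with tag PGPA_TAG_SEQ_SCAN whose
+ * target names the alias "foo" would come back as "SEQ_SCAN(foo)".
+ */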
+
+/*
+ * Set PGPA_TE_* flags on a set of trove entries.
+ */
+void
+pgpa_trove_set_flags(pgpa_trove_entry *entries, Bitmapset *indexes, int flags)
+{
+ int i = -1;
+
+ while ((i = bms_next_member(indexes, i)) >= 0)
+ {
+ pgpa_trove_entry *entry = &entries[i];
+
+ entry->flags |= flags;
+ }
+}
+
+/*
+ * Append a string representation of the specified PGPA_TE_* flags to the
+ * given StringInfo.
+ */
+void
+pgpa_trove_append_flags(StringInfo buf, int flags)
+{
+ if ((flags & PGPA_TE_MATCH_FULL) != 0)
+ {
+ Assert((flags & PGPA_TE_MATCH_PARTIAL) != 0);
+ appendStringInfoString(buf, "matched");
+ }
+ else if ((flags & PGPA_TE_MATCH_PARTIAL) != 0)
+ appendStringInfoString(buf, "partially matched");
+ else
+ appendStringInfoString(buf, "not matched");
+ if ((flags & PGPA_TE_INAPPLICABLE) != 0)
+ appendStringInfoString(buf, ", inapplicable");
+ if ((flags & PGPA_TE_CONFLICTING) != 0)
+ appendStringInfoString(buf, ", conflicting");
+ if ((flags & PGPA_TE_FAILED) != 0)
+ appendStringInfoString(buf, ", failed");
+
+/*
+ * Add a new advice target to an existing pgpa_trove_slice object.
+ */
+static void
+pgpa_trove_add_to_slice(pgpa_trove_slice *tslice,
+ pgpa_advice_tag_type tag,
+ pgpa_advice_target *target)
+{
+ pgpa_trove_entry *entry;
+
+ if (tslice->nused >= tslice->nallocated)
+ {
+ int new_allocated;
+
+ new_allocated = tslice->nallocated * 2;
+ tslice->entries = repalloc_array(tslice->entries, pgpa_trove_entry,
+ new_allocated);
+ tslice->nallocated = new_allocated;
+ }
+
+ entry = &tslice->entries[tslice->nused];
+ entry->tag = tag;
+ entry->target = target;
+ entry->flags = 0;
+
+ pgpa_trove_add_to_hash(tslice->hash, target, tslice->nused);
+
+ tslice->nused++;
+}
+
+/*
+ * Update the hash table for a newly-added advice target.
+ */
+static void
+pgpa_trove_add_to_hash(pgpa_trove_entry_hash *hash, pgpa_advice_target *target,
+ int index)
+{
+ pgpa_trove_entry_key key;
+ pgpa_trove_entry_element *element;
+ bool found;
+
+ /* For non-identifiers, add entries for all descendants. */
+ if (target->ttype != PGPA_TARGET_IDENTIFIER)
+ {
+ foreach_ptr(pgpa_advice_target, child_target, target->children)
+ {
+ pgpa_trove_add_to_hash(hash, child_target, index);
+ }
+ return;
+ }
+
+ /* Sanity checks. */
+ Assert(target->rid.occurrence > 0);
+ Assert(target->rid.alias_name != NULL);
+
+ /* Add an entry for this relation identifier. */
+ key.alias_name = target->rid.alias_name;
+ key.partition_name = target->rid.partrel;
+ key.plan_name = target->rid.plan_name;
+ element = pgpa_trove_entry_insert(hash, key, &found);
+ if (!found)
+ element->indexes = NULL;
+ element->indexes = bms_add_member(element->indexes, index);
+}
+
+/*
+ * Create and initialize a new pgpa_trove_slice object.
+ */
+static void
+pgpa_init_trove_slice(pgpa_trove_slice *tslice)
+{
+ /*
+ * Ideally, we'd make tslice->nallocated big enough that the array and
+ * hash table could hold all of the advice items in this trove slice, but
+ * a generous default value is bad for performance, because
+ * pgpa_init_trove_slice() has to zero an amount of memory proportional to
+ * tslice->nallocated. Hence, we keep the starting value quite small, on
+ * the theory that advice strings will often be relatively short.
+ */
+ tslice->nallocated = 16;
+ tslice->nused = 0;
+ tslice->entries = palloc_array(pgpa_trove_entry, tslice->nallocated);
+ tslice->hash = pgpa_trove_entry_create(CurrentMemoryContext,
+ tslice->nallocated, NULL);
+}
+
+/*
+ * Fast hash function for a key consisting of alias_name, partition_name,
+ * and plan_name.
+ */
+static uint32
+pgpa_trove_entry_hash_key(pgpa_trove_entry_key key)
+{
+ fasthash_state hs;
+ int sp_len;
+
+ fasthash_init(&hs, 0);
+
+ /* alias_name must not be NULL */
+ sp_len = fasthash_accum_cstring(&hs, key.alias_name);
+
+ /* partition_name and plan_name, however, can be NULL */
+ if (key.partition_name != NULL)
+ sp_len += fasthash_accum_cstring(&hs, key.partition_name);
+ if (key.plan_name != NULL)
+ sp_len += fasthash_accum_cstring(&hs, key.plan_name);
+
+ /*
+ * hashfn_unstable.h recommends using the string length as the tweak.
+ * It's not clear what the best thing to do is when there are multiple
+ * strings, so for now we just use the total of all of the lengths.
+ */
+ return fasthash_final32(&hs, sp_len);
+}
+
+/*
+ * Look for matching entries.
+ */
+static Bitmapset *
+pgpa_trove_slice_lookup(pgpa_trove_slice *tslice, pgpa_identifier *rid)
+{
+ pgpa_trove_entry_key key;
+ pgpa_trove_entry_element *element;
+ Bitmapset *result = NULL;
+
+ Assert(rid->occurrence >= 1);
+
+ key.alias_name = rid->alias_name;
+ key.partition_name = rid->partrel;
+ key.plan_name = rid->plan_name;
+
+ element = pgpa_trove_entry_lookup(tslice->hash, key);
+
+ if (element != NULL)
+ {
+ int i = -1;
+
+ while ((i = bms_next_member(element->indexes, i)) >= 0)
+ {
+ pgpa_trove_entry *entry = &tslice->entries[i];
+
+ /*
+ * We know that this target or one of its descendants matches the
+ * identifier on the three key fields above, but we don't know
+ * which descendant or whether the occurrence and schema also
+ * match.
+ */
+ if (pgpa_identifier_matches_target(rid, entry->target))
+ result = bms_add_member(result, i);
+ }
+ }
+
+ return result;
+}
diff --git a/contrib/pg_plan_advice/pgpa_trove.h b/contrib/pg_plan_advice/pgpa_trove.h
new file mode 100644
index 00000000000..a1b75af724a
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_trove.h
@@ -0,0 +1,114 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgpa_trove.h
+ * All of the advice given for a particular query, appropriately
+ * organized for convenient access.
+ *
+ * Copyright (c) 2016-2025, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_trove.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PGPA_TROVE_H
+#define PGPA_TROVE_H
+
+#include "pgpa_ast.h"
+
+#include "nodes/bitmapset.h"
+
+typedef struct pgpa_trove pgpa_trove;
+
+/*
+ * Flags that can be set on a pgpa_trove_entry to indicate what happened when
+ * trying to plan using advice.
+ *
+ * PGPA_TE_MATCH_PARTIAL means that we found some part of the query that at
+ * least partially matched the target; e.g. given JOIN_ORDER(a b), this would
+ * be set if we ever saw any joinrel including either "a" or "b".
+ *
+ * PGPA_TE_MATCH_FULL means that we found an exact match for the target; e.g.
+ * given JOIN_ORDER(a b), this would be set if we saw a joinrel containing
+ * exactly "a" and "b" and nothing else.
+ *
+ * PGPA_TE_INAPPLICABLE means that the advice doesn't properly apply to the
+ * target; e.g. INDEX_SCAN(foo bar_idx) would be so marked if bar_idx does not
+ * exist on foo. The fact that this bit has been set does not mean that the
+ * advice had no effect.
+ *
+ * PGPA_TE_CONFLICTING means that a conflict was detected between what this
+ * advice wants and what some other plan advice wants; e.g. JOIN_ORDER(a b)
+ * would conflict with HASH_JOIN(a), because the former requires "a" to be the
+ * outer table while the latter requires it to be the inner table.
+ *
+ * PGPA_TE_FAILED means that the resulting plan did not conform to the advice.
+ */
+#define PGPA_TE_MATCH_PARTIAL 0x0001
+#define PGPA_TE_MATCH_FULL 0x0002
+#define PGPA_TE_INAPPLICABLE 0x0004
+#define PGPA_TE_CONFLICTING 0x0008
+#define PGPA_TE_FAILED 0x0010
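+
+/*
+ * Note that PGPA_TE_MATCH_FULL is expected to be accompanied by
+ * PGPA_TE_MATCH_PARTIAL; pgpa_trove_append_flags() asserts as much.
+ */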
+
+/*
+ * Each entry in a trove of advice represents the application of a tag to
+ * a single target.
+ */
+typedef struct pgpa_trove_entry
+{
+ pgpa_advice_tag_type tag;
+ pgpa_advice_target *target;
+ int flags;
+} pgpa_trove_entry;
+
+/*
+ * What kind of information does the caller want to find in a trove?
+ *
+ * PGPA_TROVE_LOOKUP_SCAN means we're looking for scan advice.
+ *
+ * PGPA_TROVE_LOOKUP_JOIN means we're looking for join-related advice.
+ * This includes join order advice, join method advice, and semijoin-uniqueness
+ * advice.
+ *
+ * PGPA_TROVE_LOOKUP_REL means we're looking for general advice about a
+ * RelOptInfo that may correspond to either a scan or a join. This includes
+ * gather-related advice and partitionwise advice. Note that partitionwise
+ * advice might seem like join advice, but that's not a helpful way of viewing
+ * the matter because (1) partitionwise advice is also relevant at the scan
+ * level and (2) other types of join advice affect only what we do in
+ * join_path_setup_hook, while partitionwise advice affects what we do in
+ * joinrel_setup_hook.
+ */
+typedef enum pgpa_trove_lookup_type
+{
+ PGPA_TROVE_LOOKUP_JOIN,
+ PGPA_TROVE_LOOKUP_REL,
+ PGPA_TROVE_LOOKUP_SCAN
+} pgpa_trove_lookup_type;
+
+/*
+ * This struct is used to store the result of a trove lookup. For each member
+ * of "indexes", the entry at the corresponding offset within "entries" is one
+ * of the results.
+ */
+typedef struct pgpa_trove_result
+{
+ pgpa_trove_entry *entries;
+ Bitmapset *indexes;
+} pgpa_trove_result;
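+
+/*
+ * A minimal usage sketch:
+ *
+ * pgpa_trove_result result;
+ * int i = -1;
+ *
+ * pgpa_trove_lookup(trove, PGPA_TROVE_LOOKUP_SCAN, nrids, rids, &result);
+ * while ((i = bms_next_member(result.indexes, i)) >= 0)
+ * ... examine result.entries[i] ...
+ */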
+
+extern pgpa_trove *pgpa_build_trove(List *advice_items);
+extern void pgpa_trove_lookup(pgpa_trove *trove,
+ pgpa_trove_lookup_type type,
+ int nrids,
+ pgpa_identifier *rids,
+ pgpa_trove_result *result);
+extern void pgpa_trove_lookup_all(pgpa_trove *trove,
+ pgpa_trove_lookup_type type,
+ pgpa_trove_entry **entries,
+ int *nentries);
+extern char *pgpa_cstring_trove_entry(pgpa_trove_entry *entry);
+extern void pgpa_trove_set_flags(pgpa_trove_entry *entries,
+ Bitmapset *indexes, int flags);
+extern void pgpa_trove_append_flags(StringInfo buf, int flags);
+
+#endif /* PGPA_TROVE_H */
diff --git a/contrib/pg_plan_advice/pgpa_walker.c b/contrib/pg_plan_advice/pgpa_walker.c
new file mode 100644
index 00000000000..210d30891b2
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_walker.c
@@ -0,0 +1,1006 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgpa_walker.c
+ * Plan tree iteration
+ *
+ * Copyright (c) 2016-2025, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_walker.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "pgpa_join.h"
+#include "pgpa_scan.h"
+#include "pgpa_walker.h"
+
+#include "nodes/plannodes.h"
+#include "parser/parsetree.h"
+#include "utils/lsyscache.h"
+
+static void pgpa_walk_recursively(pgpa_plan_walker_context *walker, Plan *plan,
+ bool within_join_problem,
+ pgpa_join_unroller *join_unroller,
+ List *active_query_features,
+ bool beneath_any_gather);
+static Bitmapset *pgpa_process_unrolled_join(pgpa_plan_walker_context *walker,
+ pgpa_unrolled_join *ujoin);
+
+static pgpa_query_feature *pgpa_add_feature(pgpa_plan_walker_context *walker,
+ pgpa_qf_type type,
+ Plan *plan);
+
+static void pgpa_qf_add_rti(List *active_query_features, Index rti);
+static void pgpa_qf_add_rtis(List *active_query_features, Bitmapset *relids);
+static void pgpa_qf_add_plan_rtis(List *active_query_features, Plan *plan,
+ List *rtable);
+
+static bool pgpa_walker_join_order_matches(pgpa_unrolled_join *ujoin,
+ Index rtable_length,
+ pgpa_identifier *rt_identifiers,
+ pgpa_advice_target *target,
+ bool toplevel);
+static bool pgpa_walker_join_order_matches_member(pgpa_join_member *member,
+ Index rtable_length,
+ pgpa_identifier *rt_identifiers,
+ pgpa_advice_target *target);
+static pgpa_scan *pgpa_walker_find_scan(pgpa_plan_walker_context *walker,
+ pgpa_scan_strategy strategy,
+ Bitmapset *relids);
+static bool pgpa_walker_index_target_matches_plan(pgpa_index_target *itarget,
+ Plan *plan);
+static bool pgpa_walker_contains_feature(pgpa_plan_walker_context *walker,
+ pgpa_qf_type type,
+ Bitmapset *relids);
+static bool pgpa_walker_contains_join(pgpa_plan_walker_context *walker,
+ pgpa_join_strategy strategy,
+ Bitmapset *relids);
+static bool pgpa_walker_contains_no_gather(pgpa_plan_walker_context *walker,
+ Bitmapset *relids);
+
+/*
+ * Top-level entrypoint for the plan tree walk.
+ *
+ * Populates walker based on a traversal of the Plan trees in pstmt.
+ *
+ * sj_unique_rels is a list of pgpa_sj_unique_rel objects, one for each
+ * relation we considered making unique as part of semijoin planning.
+ */
+void
+pgpa_plan_walker(pgpa_plan_walker_context *walker, PlannedStmt *pstmt,
+ List *sj_unique_rels)
+{
+ ListCell *lc;
+ List *sj_unique_rtis = NIL;
+ List *sj_nonunique_qfs = NIL;
+
+ /* Initialization. */
+ memset(walker, 0, sizeof(pgpa_plan_walker_context));
+ walker->pstmt = pstmt;
+
+ /* Walk the main plan tree. */
+ pgpa_walk_recursively(walker, pstmt->planTree, false, NULL, NIL, false);
+
+ /* Main plan tree walk won't reach subplans, so walk those. */
+ foreach(lc, pstmt->subplans)
+ {
+ Plan *plan = lfirst(lc);
+
+ if (plan != NULL)
+ pgpa_walk_recursively(walker, plan, false, NULL, NIL, false);
+ }
+
+ /* Adjust RTIs from sj_unique_rels for the flattened range table. */
+ foreach_ptr(pgpa_sj_unique_rel, ur, sj_unique_rels)
+ {
+ int rtindex = -1;
+ int rtoffset = 0;
+ bool dummy = false;
+ Bitmapset *relids = NULL;
+
+ /* If this is a subplan, find the range table offset. */
+ if (ur->plan_name != NULL)
+ {
+ foreach_node(SubPlanRTInfo, rtinfo, pstmt->subrtinfos)
+ {
+ if (strcmp(ur->plan_name, rtinfo->plan_name) == 0)
+ {
+ rtoffset = rtinfo->rtoffset;
+ dummy = rtinfo->dummy;
+ break;
+ }
+ }
+
+ if (rtoffset == 0)
+ elog(ERROR, "no rtoffset for plan %s", ur->plan_name);
+ }
+
+ /* If this entry pertains to a dummy subquery, ignore it. */
+ if (dummy)
+ continue;
+
+ /* Offset each entry from the original set. */
+ while ((rtindex = bms_next_member(ur->relids, rtindex)) >= 0)
+ relids = bms_add_member(relids, rtindex + rtoffset);
+
+ /* Store the resulting set. */
+ sj_unique_rtis = lappend(sj_unique_rtis, relids);
+ }
+
+ /*
+ * Remove any non-unique semijoin query features for which making the rel
+ * unique wasn't considered.
+ */
+ foreach_ptr(pgpa_query_feature, qf,
+ walker->query_features[PGPAQF_SEMIJOIN_NON_UNIQUE])
+ {
+ if (list_member(sj_unique_rtis, qf->relids))
+ sj_nonunique_qfs = lappend(sj_nonunique_qfs, qf);
+ }
+ walker->query_features[PGPAQF_SEMIJOIN_NON_UNIQUE] = sj_nonunique_qfs;
+
+ /*
+ * If we find any cases where analysis of the Plan tree shows that the
+ * semijoin was made unique but this possibility was never observed to be
+ * considered during planning, then we have a bug somewhere.
+ */
+ foreach_ptr(pgpa_query_feature, qf,
+ walker->query_features[PGPAQF_SEMIJOIN_UNIQUE])
+ {
+ if (!list_member(sj_unique_rtis, qf->relids))
+ {
+ StringInfoData buf;
+
+ initStringInfo(&buf);
+ outBitmapset(&buf, qf->relids);
+ elog(ERROR,
+ "unique semijoin found for relids %s but not observed during planning",
+ buf.data);
+ }
+ }
+}
+
+/*
+ * Main workhorse for the plan tree walk.
+ *
+ * If within_join_problem is true, we encountered a join at some higher level
+ * of the tree walk and haven't yet descended out of the portion of the plan
+ * tree that is part of that same join problem. We're no longer in the same
+ * join problem if (1) we cross into a different subquery or (2) we descend
+ * through an Append or MergeAppend node, below which any further joins would
+ * be partitionwise joins planned separately from the outer join problem.
+ *
+ * If join_unroller != NULL, the join unroller code expects us to find a join
+ * that should be unrolled into that object. This implies that we're within a
+ * join problem, but the reverse is not true: when we've traversed all the
+ * joins but are still looking for the scan that is the leaf of the join tree,
+ * join_unroller will be NULL but within_join_problem will be true.
+ *
+ * Each element of active_query_features corresponds to some item of advice
+ * that needs to enumerate all the relations it affects. We add RTIs we find
+ * during tree traversal to each of these query features.
+ *
+ * If beneath_any_gather == true, some higher level of the tree traversal found
+ * a Gather or Gather Merge node.
+ */
+static void
+pgpa_walk_recursively(pgpa_plan_walker_context *walker, Plan *plan,
+ bool within_join_problem,
+ pgpa_join_unroller *join_unroller,
+ List *active_query_features,
+ bool beneath_any_gather)
+{
+ pgpa_join_unroller *outer_join_unroller = NULL;
+ pgpa_join_unroller *inner_join_unroller = NULL;
+ bool join_unroller_toplevel = false;
+ List *pushdown_query_features = NIL;
+ ListCell *lc;
+ List *extraplans = NIL;
+ List *elided_nodes = NIL;
+
+ Assert(within_join_problem || join_unroller == NULL);
+
+ /*
+ * If this is a Gather or Gather Merge node, directly add it to the list
+ * of currently-active query features. (Exception: Disregard single_copy
+ * Gather nodes. These are created by debug_parallel_query, and having
+ * them affect the plan advice is counterproductive, as the result will be
+ * to advise the use of a real Gather node, rather than a single copy
+ * one.)
+ *
+ * Otherwise, check the future_query_features list to see whether this was
+ * previously identified as a plan node that needs to be treated as a
+ * query feature.
+ *
+ * Note that the caller also retains a pointer to active_query_features, so we
+ * can't destructively modify it without making a copy.
+ */
+ if (IsA(plan, Gather) && !((Gather *) plan)->single_copy)
+ {
+ active_query_features =
+ lappend(list_copy(active_query_features),
+ pgpa_add_feature(walker, PGPAQF_GATHER, plan));
+ beneath_any_gather = true;
+ }
+ else if (IsA(plan, GatherMerge))
+ {
+ active_query_features =
+ lappend(list_copy(active_query_features),
+ pgpa_add_feature(walker, PGPAQF_GATHER_MERGE, plan));
+ beneath_any_gather = true;
+ }
+ else
+ {
+ foreach_ptr(pgpa_query_feature, qf, walker->future_query_features)
+ {
+ if (qf->plan == plan)
+ {
+ active_query_features = list_copy(active_query_features);
+ active_query_features = lappend(active_query_features, qf);
+ walker->future_query_features =
+ list_delete_ptr(walker->future_query_features, qf);
+ break;
+ }
+ }
+ }
+
+ /*
+ * Find all elided nodes for this Plan node.
+ */
+ foreach_node(ElidedNode, n, walker->pstmt->elidedNodes)
+ {
+ if (n->plan_node_id == plan->plan_node_id)
+ elided_nodes = lappend(elided_nodes, n);
+ }
+
+ /* If we found any elided_nodes, handle them. */
+ if (elided_nodes != NIL)
+ {
+ int num_elided_nodes = list_length(elided_nodes);
+ ElidedNode *last_elided_node;
+
+ /*
+ * RTIs for the final -- and thus logically uppermost -- elided node
+ * should be collected for query features passed down by the caller.
+ * However, elided nodes act as barriers to query features, which
+ * means that (1) the remaining elided nodes, if any, should be
+ * ignored for purposes of query features and (2) the list of active
+ * query features should be reset to empty so that we do not add RTIs
+ * from the plan node that is logically beneath the elided node to the
+ * query features passed down from the caller.
+ */
+ last_elided_node = list_nth(elided_nodes, num_elided_nodes - 1);
+ pgpa_qf_add_rtis(active_query_features,
+ pgpa_filter_out_join_relids(last_elided_node->relids,
+ walker->pstmt->rtable));
+ active_query_features = NIL;
+
+ /*
+ * If we're within a join problem, the join_unroller is responsible
+ * for building the scan for the final elided node, so throw it out.
+ */
+ if (within_join_problem)
+ elided_nodes = list_truncate(elided_nodes, num_elided_nodes - 1);
+
+ /* Build scans for all (or the remaining) elided nodes. */
+ foreach_node(ElidedNode, elided_node, elided_nodes)
+ {
+ (void) pgpa_build_scan(walker, plan, elided_node,
+ beneath_any_gather, within_join_problem);
+ }
+
+ /*
+ * If there were any elided nodes, then everything beneath those nodes
+ * is not part of the same join problem.
+ *
+ * In more detail, if an Append or MergeAppend was elided, then a
+ * partitionwise join was chosen and only a single child survived; if
+ * a SubqueryScan was elided, the subquery was planned without
+ * flattening it into the parent.
+ */
+ within_join_problem = false;
+ join_unroller = NULL;
+ }
+
+ /*
+ * If we're within a join problem, the join unroller is responsible for
+ * building any required scan for this node. If not, we do it here.
+ */
+ if (!within_join_problem)
+ (void) pgpa_build_scan(walker, plan, NULL, beneath_any_gather, false);
+
+ /*
+ * If this join needs to be unrolled but there's no join unroller already
+ * available, create one.
+ */
+ if (join_unroller == NULL && pgpa_is_join(plan))
+ {
+ join_unroller = pgpa_create_join_unroller();
+ join_unroller_toplevel = true;
+ within_join_problem = true;
+ }
+
+ /*
+ * If this join is to be unrolled, pgpa_unroll_join() will return the join
+ * unroller object that should be passed down when we recurse into the
+ * outer and inner sides of the plan.
+ */
+ if (join_unroller != NULL)
+ pgpa_unroll_join(walker, plan, beneath_any_gather, join_unroller,
+ &outer_join_unroller, &inner_join_unroller);
+
+ /* Add RTIs from the plan node to all active query features. */
+ pgpa_qf_add_plan_rtis(active_query_features, plan, walker->pstmt->rtable);
+
+ /*
+ * Recurse into the outer and inner subtrees.
+ *
+ * As an exception, if this is a ForeignScan, don't recurse. postgres_fdw
+ * sometimes stores an EPQ recheck plan in plan->lefttree, but that's going
+ * to mention the same set of relations as the ForeignScan itself, and we
+ * have no way to emit advice targeting the EPQ case vs. the non-EPQ case.
+ * Moreover, it's not entirely clear what other FDWs might do with the
+ * left and right subtrees. Maybe some better handling is needed here, but
+ * for now, we just punt.
+ */
+ if (!IsA(plan, ForeignScan))
+ {
+ if (plan->lefttree != NULL)
+ pgpa_walk_recursively(walker, plan->lefttree, within_join_problem,
+ outer_join_unroller, active_query_features,
+ beneath_any_gather);
+ if (plan->righttree != NULL)
+ pgpa_walk_recursively(walker, plan->righttree, within_join_problem,
+ inner_join_unroller, active_query_features,
+ beneath_any_gather);
+ }
+
+ /*
+ * If we created a join unroller up above, then it's also our job to use
+ * it to build the final pgpa_unrolled_join, and to destroy the object.
+ */
+ if (join_unroller_toplevel)
+ {
+ pgpa_unrolled_join *ujoin;
+
+ ujoin = pgpa_build_unrolled_join(walker, join_unroller);
+ walker->toplevel_unrolled_joins =
+ lappend(walker->toplevel_unrolled_joins, ujoin);
+ pgpa_destroy_join_unroller(join_unroller);
+ (void) pgpa_process_unrolled_join(walker, ujoin);
+ }
+
+ /*
+ * Some plan types can have additional children. Nodes like Append that
+ * can have any number of children store them in a List; a SubqueryScan
+ * just has a field for a single additional Plan.
+ */
+ switch (nodeTag(plan))
+ {
+ case T_Append:
+ {
+ Append *aplan = (Append *) plan;
+
+ extraplans = aplan->appendplans;
+ if (bms_is_empty(aplan->apprelids))
+ pushdown_query_features = active_query_features;
+ }
+ break;
+ case T_MergeAppend:
+ {
+ MergeAppend *maplan = (MergeAppend *) plan;
+
+ extraplans = maplan->mergeplans;
+ if (bms_is_empty(maplan->apprelids))
+ pushdown_query_features = active_query_features;
+ }
+ break;
+ case T_BitmapAnd:
+ extraplans = ((BitmapAnd *) plan)->bitmapplans;
+ break;
+ case T_BitmapOr:
+ extraplans = ((BitmapOr *) plan)->bitmapplans;
+ break;
+ case T_SubqueryScan:
+
+ /*
+ * We don't pass down active_query_features across here, because
+ * those are specific to a subquery level.
+ */
+ pgpa_walk_recursively(walker, ((SubqueryScan *) plan)->subplan,
+ false, NULL, NIL, beneath_any_gather);
+ break;
+ case T_CustomScan:
+ extraplans = ((CustomScan *) plan)->custom_plans;
+ break;
+ default:
+ break;
+ }
+
+ /* If we found a list of extra children, iterate over it. */
+ foreach(lc, extraplans)
+ {
+ Plan *subplan = lfirst(lc);
+
+ pgpa_walk_recursively(walker, subplan, false, NULL, pushdown_query_features,
+ beneath_any_gather);
+ }
+}
+
+/*
+ * Perform final processing of a newly-constructed pgpa_unrolled_join. This
+ * only needs to be called for toplevel pgpa_unrolled_join objects, since it
+ * recurses to sub-joins as needed.
+ *
+ * Our goal is to add the set of inner relids to the relevant join_strategies
+ * list, and to do the same for any sub-joins. To that end, the return value
+ * is the set of relids found beneath the join, but it is expected that
+ * the toplevel caller will ignore this.
+ */
+static Bitmapset *
+pgpa_process_unrolled_join(pgpa_plan_walker_context *walker,
+ pgpa_unrolled_join *ujoin)
+{
+ Bitmapset *all_relids = bms_copy(ujoin->outer.scan->relids);
+
+ /* If this fails, we didn't unroll properly. */
+ Assert(ujoin->outer.unrolled_join == NULL);
+
+ for (int k = 0; k < ujoin->ninner; ++k)
+ {
+ pgpa_join_member *member = &ujoin->inner[k];
+ Bitmapset *relids;
+
+ if (member->unrolled_join != NULL)
+ relids = pgpa_process_unrolled_join(walker,
+ member->unrolled_join);
+ else
+ {
+ Assert(member->scan != NULL);
+ relids = member->scan->relids;
+ }
+ walker->join_strategies[ujoin->strategy[k]] =
+ lappend(walker->join_strategies[ujoin->strategy[k]], relids);
+ all_relids = bms_add_members(all_relids, relids);
+ }
+
+ return all_relids;
+}
+
+/*
+ * Arrange for the given plan node to be treated as a query feature when the
+ * tree walk reaches it.
+ *
+ * Make sure to only use this for nodes that the tree walk can't have reached
+ * yet!
+ */
+void
+pgpa_add_future_feature(pgpa_plan_walker_context *walker,
+ pgpa_qf_type type, Plan *plan)
+{
+ pgpa_query_feature *qf = pgpa_add_feature(walker, type, plan);
+
+ walker->future_query_features =
+ lappend(walker->future_query_features, qf);
+}
+
+/*
+ * Return the last of any elided nodes associated with this plan node ID.
+ *
+ * The last elided node is the one that would have been uppermost in the plan
+ * tree had it not been removed during setrefs processing.
+ */
+ElidedNode *
+pgpa_last_elided_node(PlannedStmt *pstmt, Plan *plan)
+{
+ ElidedNode *elided_node = NULL;
+
+ foreach_node(ElidedNode, n, pstmt->elidedNodes)
+ {
+ if (n->plan_node_id == plan->plan_node_id)
+ elided_node = n;
+ }
+
+ return elided_node;
+}
+
+/*
+ * Certain plan nodes can refer to a set of RTIs. Extract and return the set.
+ */
+Bitmapset *
+pgpa_relids(Plan *plan)
+{
+ if (IsA(plan, Result))
+ return ((Result *) plan)->relids;
+ else if (IsA(plan, ForeignScan))
+ return ((ForeignScan *) plan)->fs_relids;
+ else if (IsA(plan, Append))
+ return ((Append *) plan)->apprelids;
+ else if (IsA(plan, MergeAppend))
+ return ((MergeAppend *) plan)->apprelids;
+
+ return NULL;
+}
+
+/*
+ * Extract the scanned RTI from a plan node.
+ *
+ * Returns 0 if there isn't one.
+ */
+Index
+pgpa_scanrelid(Plan *plan)
+{
+ switch (nodeTag(plan))
+ {
+ case T_SeqScan:
+ case T_SampleScan:
+ case T_BitmapHeapScan:
+ case T_TidScan:
+ case T_TidRangeScan:
+ case T_SubqueryScan:
+ case T_FunctionScan:
+ case T_TableFuncScan:
+ case T_ValuesScan:
+ case T_CteScan:
+ case T_NamedTuplestoreScan:
+ case T_WorkTableScan:
+ case T_ForeignScan:
+ case T_CustomScan:
+ case T_IndexScan:
+ case T_IndexOnlyScan:
+ return ((Scan *) plan)->scanrelid;
+ default:
+ return 0;
+ }
+}
+
+/*
+ * Construct a new Bitmapset containing non-RTE_JOIN members of 'relids'.
+ */
+Bitmapset *
+pgpa_filter_out_join_relids(Bitmapset *relids, List *rtable)
+{
+ int rti = -1;
+ Bitmapset *result = NULL;
+
+ while ((rti = bms_next_member(relids, rti)) >= 0)
+ {
+ RangeTblEntry *rte = rt_fetch(rti, rtable);
+
+ if (rte->rtekind != RTE_JOIN)
+ result = bms_add_member(result, rti);
+ }
+
+ return result;
+}
+
+/*
+ * Create a pgpa_query_feature and add it to the list of all query features
+ * for this plan.
+ */
+static pgpa_query_feature *
+pgpa_add_feature(pgpa_plan_walker_context *walker,
+ pgpa_qf_type type, Plan *plan)
+{
+ pgpa_query_feature *qf = palloc0_object(pgpa_query_feature);
+
+ qf->type = type;
+ qf->plan = plan;
+
+ walker->query_features[qf->type] =
+ lappend(walker->query_features[qf->type], qf);
+
+ return qf;
+}
+
+/*
+ * Add a single RTI to each active query feature.
+ */
+static void
+pgpa_qf_add_rti(List *active_query_features, Index rti)
+{
+ foreach_ptr(pgpa_query_feature, qf, active_query_features)
+ {
+ qf->relids = bms_add_member(qf->relids, rti);
+ }
+}
+
+/*
+ * Add a set of RTIs to each active query feature.
+ */
+static void
+pgpa_qf_add_rtis(List *active_query_features, Bitmapset *relids)
+{
+ foreach_ptr(pgpa_query_feature, qf, active_query_features)
+ {
+ qf->relids = bms_add_members(qf->relids, relids);
+ }
+}
+
+/*
+ * Add RTIs directly contained in a plan node to each active query feature,
+ * but filter out any join RTIs, since advice doesn't mention those.
+ */
+static void
+pgpa_qf_add_plan_rtis(List *active_query_features, Plan *plan, List *rtable)
+{
+ Bitmapset *relids;
+ Index rti;
+
+ if ((relids = pgpa_relids(plan)) != NULL)
+ {
+ relids = pgpa_filter_out_join_relids(relids, rtable);
+ pgpa_qf_add_rtis(active_query_features, relids);
+ }
+ else if ((rti = pgpa_scanrelid(plan)) != 0)
+ pgpa_qf_add_rti(active_query_features, rti);
+}
+
+/*
+ * If we generated plan advice using the provided walker object and array
+ * of identifiers, would we generate the specified tag/target combination?
+ *
+ * If yes, the plan conforms to the advice; if no, it does not. Note that
+ * we have no way of knowing whether the planner was forced to emit a plan
+ * that conformed to the advice or just happened to do so.
+ */
+bool
+pgpa_walker_would_advise(pgpa_plan_walker_context *walker,
+ pgpa_identifier *rt_identifiers,
+ pgpa_advice_tag_type tag,
+ pgpa_advice_target *target)
+{
+ Index rtable_length = list_length(walker->pstmt->rtable);
+ Bitmapset *relids = NULL;
+
+ if (tag == PGPA_TAG_JOIN_ORDER)
+ {
+ foreach_ptr(pgpa_unrolled_join, ujoin, walker->toplevel_unrolled_joins)
+ {
+ if (pgpa_walker_join_order_matches(ujoin, rtable_length,
+ rt_identifiers, target, true))
+ return true;
+ }
+
+ return false;
+ }
+
+ if (target->ttype == PGPA_TARGET_IDENTIFIER)
+ {
+ Index rti;
+
+ rti = pgpa_compute_rti_from_identifier(rtable_length, rt_identifiers,
+ &target->rid);
+ if (rti == 0)
+ return false;
+ relids = bms_make_singleton(rti);
+ }
+ else
+ {
+ Assert(target->ttype == PGPA_TARGET_ORDERED_LIST);
+ foreach_ptr(pgpa_advice_target, child_target, target->children)
+ {
+ Index rti;
+
+ Assert(child_target->ttype == PGPA_TARGET_IDENTIFIER);
+ rti = pgpa_compute_rti_from_identifier(rtable_length,
+ rt_identifiers,
+ &child_target->rid);
+ if (rti == 0)
+ return false;
+ relids = bms_add_member(relids, rti);
+ }
+ }
+
+ switch (tag)
+ {
+ case PGPA_TAG_JOIN_ORDER:
+ /* should have been handled above */
+ pg_unreachable();
+ break;
+ case PGPA_TAG_BITMAP_HEAP_SCAN:
+ return pgpa_walker_find_scan(walker,
+ PGPA_SCAN_BITMAP_HEAP,
+ relids) != NULL;
+ case PGPA_TAG_FOREIGN_JOIN:
+ return pgpa_walker_find_scan(walker,
+ PGPA_SCAN_FOREIGN,
+ relids) != NULL;
+ case PGPA_TAG_INDEX_ONLY_SCAN:
+ {
+ pgpa_scan *scan;
+
+ scan = pgpa_walker_find_scan(walker, PGPA_SCAN_INDEX_ONLY,
+ relids);
+ if (scan == NULL)
+ return false;
+
+ return pgpa_walker_index_target_matches_plan(target->itarget, scan->plan);
+ }
+ case PGPA_TAG_INDEX_SCAN:
+ {
+ pgpa_scan *scan;
+
+ scan = pgpa_walker_find_scan(walker, PGPA_SCAN_INDEX,
+ relids);
+ if (scan == NULL)
+ return false;
+
+ return pgpa_walker_index_target_matches_plan(target->itarget, scan->plan);
+ }
+ case PGPA_TAG_PARTITIONWISE:
+ return pgpa_walker_find_scan(walker,
+ PGPA_SCAN_PARTITIONWISE,
+ relids) != NULL;
+ case PGPA_TAG_SEQ_SCAN:
+ return pgpa_walker_find_scan(walker,
+ PGPA_SCAN_SEQ,
+ relids) != NULL;
+ case PGPA_TAG_TID_SCAN:
+ return pgpa_walker_find_scan(walker,
+ PGPA_SCAN_TID,
+ relids) != NULL;
+ case PGPA_TAG_GATHER:
+ return pgpa_walker_contains_feature(walker,
+ PGPAQF_GATHER,
+ relids);
+ case PGPA_TAG_GATHER_MERGE:
+ return pgpa_walker_contains_feature(walker,
+ PGPAQF_GATHER_MERGE,
+ relids);
+ case PGPA_TAG_SEMIJOIN_NON_UNIQUE:
+ return pgpa_walker_contains_feature(walker,
+ PGPAQF_SEMIJOIN_NON_UNIQUE,
+ relids);
+ case PGPA_TAG_SEMIJOIN_UNIQUE:
+ return pgpa_walker_contains_feature(walker,
+ PGPAQF_SEMIJOIN_UNIQUE,
+ relids);
+ case PGPA_TAG_HASH_JOIN:
+ return pgpa_walker_contains_join(walker,
+ JSTRAT_HASH_JOIN,
+ relids);
+ case PGPA_TAG_MERGE_JOIN_MATERIALIZE:
+ return pgpa_walker_contains_join(walker,
+ JSTRAT_MERGE_JOIN_MATERIALIZE,
+ relids);
+ case PGPA_TAG_MERGE_JOIN_PLAIN:
+ return pgpa_walker_contains_join(walker,
+ JSTRAT_MERGE_JOIN_PLAIN,
+ relids);
+ case PGPA_TAG_NESTED_LOOP_MATERIALIZE:
+ return pgpa_walker_contains_join(walker,
+ JSTRAT_NESTED_LOOP_MATERIALIZE,
+ relids);
+ case PGPA_TAG_NESTED_LOOP_MEMOIZE:
+ return pgpa_walker_contains_join(walker,
+ JSTRAT_NESTED_LOOP_MEMOIZE,
+ relids);
+ case PGPA_TAG_NESTED_LOOP_PLAIN:
+ return pgpa_walker_contains_join(walker,
+ JSTRAT_NESTED_LOOP_PLAIN,
+ relids);
+ case PGPA_TAG_NO_GATHER:
+ return pgpa_walker_contains_no_gather(walker, relids);
+ }
+
+ /* should not get here */
+ return false;
+}
+
+/*
+ * Does the index target match the Plan?
+ *
+ * Should only be called when we know that itarget mandates an Index Scan or
+ * Index Only Scan and this corresponds to the type of Plan. Here, our job is
+ * just to check whether it's the same index.
+ */
+static bool
+pgpa_walker_index_target_matches_plan(pgpa_index_target *itarget, Plan *plan)
+{
+ Oid indexoid = InvalidOid;
+
+ /* Retrieve the index OID from the plan. */
+ if (IsA(plan, IndexScan))
+ indexoid = ((IndexScan *) plan)->indexid;
+ else if (IsA(plan, IndexOnlyScan))
+ indexoid = ((IndexOnlyScan *) plan)->indexid;
+ else
+ elog(ERROR, "unrecognized node type: %d", (int) nodeTag(plan));
+
+ /* Check whether schema name matches, if specified in index target. */
+ if (itarget->indnamespace != NULL)
+ {
+ Oid nspoid = get_rel_namespace(indexoid);
+ char *relnamespace = get_namespace_name_or_temp(nspoid);
+
+ if (strcmp(itarget->indnamespace, relnamespace) != 0)
+ return false;
+ }
+
+ /* Check whether relation name matches. */
+ return (strcmp(itarget->indname, get_rel_name(indexoid)) == 0);
+}
+
+/*
+ * Does an unrolled join match the join order specified by an advice target?
+ */
+static bool
+pgpa_walker_join_order_matches(pgpa_unrolled_join *ujoin,
+ Index rtable_length,
+ pgpa_identifier *rt_identifiers,
+ pgpa_advice_target *target,
+ bool toplevel)
+{
+ int nchildren = list_length(target->children);
+
+ Assert(target->ttype == PGPA_TARGET_ORDERED_LIST);
+
+ /* At toplevel, we allow a prefix match. */
+ if (toplevel)
+ {
+ if (nchildren > ujoin->ninner + 1)
+ return false;
+ }
+ else
+ {
+ if (nchildren != ujoin->ninner + 1)
+ return false;
+ }
+
+ /* Outermost rel must match. */
+ if (!pgpa_walker_join_order_matches_member(&ujoin->outer,
+ rtable_length,
+ rt_identifiers,
+ linitial(target->children)))
+ return false;
+
+ /* Each inner rel must match. */
+ for (int n = 0; n < nchildren - 1; ++n)
+ {
+ pgpa_advice_target *child_target = list_nth(target->children, n + 1);
+
+ if (!pgpa_walker_join_order_matches_member(&ujoin->inner[n],
+ rtable_length,
+ rt_identifiers,
+ child_target))
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Does one member of an unrolled join match an advice target?
+ */
+static bool
+pgpa_walker_join_order_matches_member(pgpa_join_member *member,
+ Index rtable_length,
+ pgpa_identifier *rt_identifiers,
+ pgpa_advice_target *target)
+{
+ Bitmapset *relids = NULL;
+
+ if (member->unrolled_join != NULL)
+ {
+ if (target->ttype != PGPA_TARGET_ORDERED_LIST)
+ return false;
+ return pgpa_walker_join_order_matches(member->unrolled_join,
+ rtable_length,
+ rt_identifiers,
+ target,
+ false);
+ }
+
+ Assert(member->scan != NULL);
+ switch (target->ttype)
+ {
+ case PGPA_TARGET_ORDERED_LIST:
+ /* Could only match an unrolled join */
+ return false;
+
+ case PGPA_TARGET_UNORDERED_LIST:
+ {
+ foreach_ptr(pgpa_advice_target, child_target, target->children)
+ {
+ Index rti;
+
+ rti = pgpa_compute_rti_from_identifier(rtable_length,
+ rt_identifiers,
+ &child_target->rid);
+ if (rti == 0)
+ return false;
+ relids = bms_add_member(relids, rti);
+ }
+ break;
+ }
+
+ case PGPA_TARGET_IDENTIFIER:
+ {
+ Index rti;
+
+ rti = pgpa_compute_rti_from_identifier(rtable_length,
+ rt_identifiers,
+ &target->rid);
+ if (rti == 0)
+ return false;
+ relids = bms_make_singleton(rti);
+ break;
+ }
+ }
+
+ return bms_equal(member->scan->relids, relids);
+}
+
+/*
+ * Find the scan where the walker says that the given scan strategy should be
+ * used for the given relid set, if one exists.
+ *
+ * Returns the pgpa_scan object, or NULL if none was found.
+ */
+static pgpa_scan *
+pgpa_walker_find_scan(pgpa_plan_walker_context *walker,
+ pgpa_scan_strategy strategy,
+ Bitmapset *relids)
+{
+ List *scans = walker->scans[strategy];
+
+ foreach_ptr(pgpa_scan, scan, scans)
+ {
+ if (bms_equal(scan->relids, relids))
+ return scan;
+ }
+
+ return NULL;
+}
+
+/*
+ * Does this walker say that the given query feature applies to the given
+ * relid set?
+ */
+static bool
+pgpa_walker_contains_feature(pgpa_plan_walker_context *walker,
+ pgpa_qf_type type,
+ Bitmapset *relids)
+{
+ List *query_features = walker->query_features[type];
+
+ foreach_ptr(pgpa_query_feature, qf, query_features)
+ {
+ if (bms_equal(qf->relids, relids))
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Does the walker say that the given join strategy should be used for the
+ * given relid set?
+ */
+static bool
+pgpa_walker_contains_join(pgpa_plan_walker_context *walker,
+ pgpa_join_strategy strategy,
+ Bitmapset *relids)
+{
+ List *join_strategies = walker->join_strategies[strategy];
+
+ foreach_ptr(Bitmapset, jsrelids, join_strategies)
+ {
+ if (bms_equal(jsrelids, relids))
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Does the walker say that the given relids should be marked as NO_GATHER?
+ */
+static bool
+pgpa_walker_contains_no_gather(pgpa_plan_walker_context *walker,
+ Bitmapset *relids)
+{
+ return bms_is_subset(relids, walker->no_gather_scans);
+}
diff --git a/contrib/pg_plan_advice/pgpa_walker.h b/contrib/pg_plan_advice/pgpa_walker.h
new file mode 100644
index 00000000000..b91a36ca3dd
--- /dev/null
+++ b/contrib/pg_plan_advice/pgpa_walker.h
@@ -0,0 +1,141 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgpa_walker.h
+ * Plan tree iteration
+ *
+ * Copyright (c) 2016-2025, PostgreSQL Global Development Group
+ *
+ * contrib/pg_plan_advice/pgpa_walker.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PGPA_WALKER_H
+#define PGPA_WALKER_H
+
+#include "pgpa_ast.h"
+#include "pgpa_join.h"
+#include "pgpa_scan.h"
+
+/*
+ * When generating advice, we should emit either SEMIJOIN_UNIQUE advice or
+ * SEMIJOIN_NON_UNIQUE advice for each semijoin depending on whether we chose
+ * to implement it as a semijoin or whether we instead chose to make the
+ * nullable side unique and then perform an inner join. When the make-unique
+ * strategy is not chosen, it's not easy to tell from the final plan tree
+ * whether it was considered. That's awkward, because we don't want to emit
+ * useless SEMIJOIN_NON_UNIQUE advice when there was no decision to be made.
+ *
+ * To avoid that, during planning, we create a pgpa_sj_unique_rel for each
+ * relation that we considered making unique for purposes of semijoin planning.
+ */
+typedef struct pgpa_sj_unique_rel
+{
+ char *plan_name;
+ Bitmapset *relids;
+} pgpa_sj_unique_rel;
+
+/*
+ * We use the term "query feature" to refer to plan nodes that are interesting
+ * in the following way: to generate advice, we'll need to know the set of
+ * same-subquery, non-join RTIs occurring at or below that plan node, without
+ * admixture of parent and child RTIs.
+ *
+ * For example, Gather nodes, designated by PGPAQF_GATHER, and Gather Merge
+ * nodes, designated by PGPAQF_GATHER_MERGE, are query features, because we'll
+ * want to admit some kind of advice that describes the portion of the plan
+ * tree that appears beneath those nodes.
+ *
+ * Each semijoin can be implemented either by directly performing a semijoin,
+ * or by making one side unique and then performing a normal join. Either way,
+ * we use a query feature to notice what decision was made, so that we can
+ * describe it by enumerating the RTIs on that side of the join.
+ *
+ * To elaborate on the "no admixture of parent and child RTIs" rule, in all of
+ * these cases, if the entirety of an inheritance hierarchy appears beneath
+ * the query feature, we only want to name the parent table. But it's also
+ * possible to have cases where we must name child tables. This is particularly
+ * likely to happen when partitionwise join is in use, but could happen for
+ * Gather or Gather Merge even without that, if one of those appears below
+ * an Append or MergeAppend node for a single table.
+ */
+typedef enum pgpa_qf_type
+{
+ PGPAQF_GATHER,
+ PGPAQF_GATHER_MERGE,
+ PGPAQF_SEMIJOIN_NON_UNIQUE,
+ PGPAQF_SEMIJOIN_UNIQUE
+ /* update NUM_PGPA_QF_TYPES if you add anything here */
+} pgpa_qf_type;
+
+#define NUM_PGPA_QF_TYPES ((int) PGPAQF_SEMIJOIN_UNIQUE + 1)
+
+/*
+ * For each query feature, we keep track of the feature type and the set of
+ * relids that we found underneath the relevant plan node. See the comments
+ * on pgpa_qf_type, above, for additional details.
+ */
+typedef struct pgpa_query_feature
+{
+ pgpa_qf_type type;
+ Plan *plan;
+ Bitmapset *relids;
+} pgpa_query_feature;
+
+/*
+ * Context object for plan tree walk.
+ *
+ * pstmt is the PlannedStmt we're studying.
+ *
+ * scans is an array of lists of pgpa_scan objects. The array is indexed by
+ * the scan's pgpa_scan_strategy.
+ *
+ * no_gather_scans is the set of scan RTIs that do not appear beneath any
+ * Gather or Gather Merge node.
+ *
+ * toplevel_unrolled_joins is a list of all pgpa_unrolled_join objects that
+ * are not a child of some other pgpa_unrolled_join.
+ *
+ * join_strategies is an array of lists of Bitmapset objects. Each Bitmapset
+ * is the set of relids that appears on the inner side of some join (excluding
+ * RTIs from partition children and subqueries). The array is indexed by
+ * pgpa_join_strategy.
+ *
+ * query_features is an array of lists of pgpa_query_feature objects, indexed
+ * by pgpa_qf_type.
+ *
+ * future_query_features is only used during the plan tree walk and should
+ * be empty when the tree walk concludes. It is a list of pgpa_query_feature
+ * objects for Plan nodes that the plan tree walk has not yet encountered;
+ * when encountered, they will be moved to the list of active query features
+ * that is propagated via the call stack.
+ */
+typedef struct pgpa_plan_walker_context
+{
+ PlannedStmt *pstmt;
+ List *scans[NUM_PGPA_SCAN_STRATEGY];
+ Bitmapset *no_gather_scans;
+ List *toplevel_unrolled_joins;
+ List *join_strategies[NUM_PGPA_JOIN_STRATEGY];
+ List *query_features[NUM_PGPA_QF_TYPES];
+ List *future_query_features;
+} pgpa_plan_walker_context;
+
+extern void pgpa_plan_walker(pgpa_plan_walker_context *walker,
+ PlannedStmt *pstmt,
+ List *sj_unique_rels);
+
+extern void pgpa_add_future_feature(pgpa_plan_walker_context *walker,
+ pgpa_qf_type type,
+ Plan *plan);
+
+extern ElidedNode *pgpa_last_elided_node(PlannedStmt *pstmt, Plan *plan);
+extern Bitmapset *pgpa_relids(Plan *plan);
+extern Index pgpa_scanrelid(Plan *plan);
+extern Bitmapset *pgpa_filter_out_join_relids(Bitmapset *relids, List *rtable);
+
+extern bool pgpa_walker_would_advise(pgpa_plan_walker_context *walker,
+ pgpa_identifier *rt_identifiers,
+ pgpa_advice_tag_type tag,
+ pgpa_advice_target *target);
+
+#endif
diff --git a/contrib/pg_plan_advice/sql/gather.sql b/contrib/pg_plan_advice/sql/gather.sql
new file mode 100644
index 00000000000..776666bf196
--- /dev/null
+++ b/contrib/pg_plan_advice/sql/gather.sql
@@ -0,0 +1,86 @@
+LOAD 'pg_plan_advice';
+SET max_parallel_workers_per_gather = 1;
+SET parallel_setup_cost = 0;
+SET parallel_tuple_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET debug_parallel_query = off;
+
+CREATE TABLE gt_dim (id serial primary key, dim text)
+ WITH (autovacuum_enabled = false);
+INSERT INTO gt_dim (dim) SELECT random()::text FROM generate_series(1,100) g;
+VACUUM ANALYZE gt_dim;
+
+CREATE TABLE gt_fact (
+ id int not null,
+ dim_id integer not null references gt_dim (id)
+) WITH (autovacuum_enabled = false);
+INSERT INTO gt_fact
+ SELECT g, (g%3)+1 FROM generate_series(1,100000) g;
+VACUUM ANALYZE gt_fact;
+
+-- By default, we expect Gather Merge with a parallel hash join.
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+
+-- Force Gather or Gather Merge of both relations together.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'gather_merge((f d))';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+SET LOCAL pg_plan_advice.advice = 'gather((f d))';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+COMMIT;
+
+-- Force a separate Gather or Gather Merge operation for each relation.
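+-- (A nested list, as above, requests a single Gather or Gather Merge over
+-- the whole join, while a flat list requests one per listed relation.)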
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'gather_merge(f d)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+SET LOCAL pg_plan_advice.advice = 'gather(f d)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+SET LOCAL pg_plan_advice.advice = 'gather((d d/d.d))';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+COMMIT;
+
+-- Force a Gather or Gather Merge on one relation but no parallelism on the
+-- other.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'gather_merge(f) no_gather(d)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+SET LOCAL pg_plan_advice.advice = 'gather_merge(d) no_gather(f)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+SET LOCAL pg_plan_advice.advice = 'gather(f) no_gather(d)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+SET LOCAL pg_plan_advice.advice = 'gather(d) no_gather(f)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+COMMIT;
+
+-- Force no Gather or Gather Merge use at all.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'no_gather(f d)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+COMMIT;
+
+-- Can't force Gather Merge without the ORDER BY clause, but just Gather is OK.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'gather_merge((f d))';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id;
+SET LOCAL pg_plan_advice.advice = 'gather((f d))';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id;
+COMMIT;
+
+-- Test conflicting advice.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'gather((f d)) no_gather(f)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM gt_fact f JOIN gt_dim d ON f.dim_id = d.id ORDER BY d.id;
+COMMIT;
diff --git a/contrib/pg_plan_advice/sql/join_order.sql b/contrib/pg_plan_advice/sql/join_order.sql
new file mode 100644
index 00000000000..5e16e54efad
--- /dev/null
+++ b/contrib/pg_plan_advice/sql/join_order.sql
@@ -0,0 +1,145 @@
+LOAD 'pg_plan_advice';
+SET max_parallel_workers_per_gather = 0;
+
+CREATE TABLE jo_dim1 (id integer primary key, dim1 text, val1 int)
+ WITH (autovacuum_enabled = false);
+INSERT INTO jo_dim1 (id, dim1, val1)
+ SELECT g, 'some filler text ' || g, (g % 3) + 1
+ FROM generate_series(1,100) g;
+VACUUM ANALYZE jo_dim1;
+CREATE TABLE jo_dim2 (id integer primary key, dim2 text, val2 int)
+ WITH (autovacuum_enabled = false);
+INSERT INTO jo_dim2 (id, dim2, val2)
+ SELECT g, 'some filler text ' || g, (g % 7) + 1
+ FROM generate_series(1,1000) g;
+VACUUM ANALYZE jo_dim2;
+
+CREATE TABLE jo_fact (
+ id int primary key,
+ dim1_id integer not null references jo_dim1 (id),
+ dim2_id integer not null references jo_dim2 (id)
+) WITH (autovacuum_enabled = false);
+INSERT INTO jo_fact
+ SELECT g, (g%100)+1, (g%100)+1 FROM generate_series(1,100000) g;
+VACUUM ANALYZE jo_fact;
+
+-- We expect to join to d2 first and then d1, since the condition on d2
+-- is more selective.
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_fact f
+ LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id
+ LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id
+ WHERE val1 = 1 AND val2 = 1;
+
+-- Force a few different join orders. Some of these are very inefficient,
+-- but the planner considers them all viable.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'join_order(f d1 d2)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_fact f
+ LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id
+ LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id
+ WHERE val1 = 1 AND val2 = 1;
+SET LOCAL pg_plan_advice.advice = 'join_order(f d2 d1)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_fact f
+ LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id
+ LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id
+ WHERE val1 = 1 AND val2 = 1;
+SET LOCAL pg_plan_advice.advice = 'join_order(d1 f d2)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_fact f
+ LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id
+ LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id
+ WHERE val1 = 1 AND val2 = 1;
+SET LOCAL pg_plan_advice.advice = 'join_order(f (d1 d2))';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_fact f
+ LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id
+ LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id
+ WHERE val1 = 1 AND val2 = 1;
+SET LOCAL pg_plan_advice.advice = 'join_order(f {d1 d2})';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_fact f
+ LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id
+ LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id
+ WHERE val1 = 1 AND val2 = 1;
+COMMIT;
+
+-- Force a join order by mentioning just a prefix of the join list.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'join_order(d2)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_fact f
+ LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id
+ LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id
+ WHERE val1 = 1 AND val2 = 1;
+SET LOCAL pg_plan_advice.advice = 'join_order(d2 d1)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_fact f
+ LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id
+ LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id
+ WHERE val1 = 1 AND val2 = 1;
+COMMIT;
+
+-- jo_fact is not partitioned, but let's try pretending that it is and
+-- verifying that the advice does not apply.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'join_order(f/d1 d1 d2)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_fact f
+ LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id
+ LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id
+ WHERE val1 = 1 AND val2 = 1;
+SET LOCAL pg_plan_advice.advice = 'join_order(f/d1 (d1 d2))';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_fact f
+ LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id
+ LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id
+ WHERE val1 = 1 AND val2 = 1;
+COMMIT;
+
+-- The unusual formulation of this query is intended to prevent the query
+-- planner from reducing the FULL JOIN to some other join type, so that we
+-- can test what happens with a join type that cannot be reordered.
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_dim1 d1
+ INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0)
+ ON d1.id = f.dim1_id OR f.dim1_id IS NULL;
+
+-- We should not be able to force the planner to join f to d1 first, because
+-- that is not a valid join order, but we should be able to force the planner
+-- to make either d2 or f the driving table.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'join_order(f d1 d2)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_dim1 d1
+ INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0)
+ ON d1.id = f.dim1_id OR f.dim1_id IS NULL;
+SET LOCAL pg_plan_advice.advice = 'join_order(f d2 d1)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_dim1 d1
+ INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0)
+ ON d1.id = f.dim1_id OR f.dim1_id IS NULL;
+SET LOCAL pg_plan_advice.advice = 'join_order(d2 f d1)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_dim1 d1
+ INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0)
+ ON d1.id = f.dim1_id OR f.dim1_id IS NULL;
+COMMIT;
+
+-- Two incompatible join orders should conflict. In the second case,
+-- the conflict is implicit: if d1 is on the inner side of a join of any
+-- type, it cannot also be the driving table.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'join_order(f) join_order(d1)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_dim1 d1
+ INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0)
+ ON d1.id = f.dim1_id OR f.dim1_id IS NULL;
+SET LOCAL pg_plan_advice.advice = 'join_order(d1) hash_join(d1)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM jo_dim1 d1
+ INNER JOIN (jo_fact f FULL JOIN jo_dim2 d2 ON f.dim2_id + 0 = d2.id + 0)
+ ON d1.id = f.dim1_id OR f.dim1_id IS NULL;
+COMMIT;
diff --git a/contrib/pg_plan_advice/sql/join_strategy.sql b/contrib/pg_plan_advice/sql/join_strategy.sql
new file mode 100644
index 00000000000..edd5c4c0e14
--- /dev/null
+++ b/contrib/pg_plan_advice/sql/join_strategy.sql
@@ -0,0 +1,84 @@
+LOAD 'pg_plan_advice';
+SET max_parallel_workers_per_gather = 0;
+
+CREATE TABLE join_dim (id serial primary key, dim text)
+ WITH (autovacuum_enabled = false);
+INSERT INTO join_dim (dim) SELECT random()::text FROM generate_series(1,100) g;
+VACUUM ANALYZE join_dim;
+
+CREATE TABLE join_fact (
+ id int primary key,
+ dim_id integer not null references join_dim (id)
+) WITH (autovacuum_enabled = false);
+INSERT INTO join_fact
+ SELECT g, (g%3)+1 FROM generate_series(1,100000) g;
+CREATE INDEX join_fact_dim_id ON join_fact (dim_id);
+VACUUM ANALYZE join_fact;
+
+-- We expect a hash join by default.
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+
+-- Try forcing each join method in turn with join_dim as the inner table.
+-- All of these should work except for MERGE_JOIN_MATERIALIZE; that will
+-- fail, because the planner knows that join_dim (id) is unique, and will
+-- refuse to add mark/restore overhead.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'HASH_JOIN(d)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+SET LOCAL pg_plan_advice.advice = 'MERGE_JOIN_MATERIALIZE(d)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+SET LOCAL pg_plan_advice.advice = 'MERGE_JOIN_PLAIN(d)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_MATERIALIZE(d)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_MEMOIZE(d)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_PLAIN(d)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+COMMIT;
+
+-- Now try forcing each join method in turn with join_fact as the inner
+-- table. All of these should work.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'HASH_JOIN(f)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+SET LOCAL pg_plan_advice.advice = 'MERGE_JOIN_MATERIALIZE(f)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+SET LOCAL pg_plan_advice.advice = 'MERGE_JOIN_PLAIN(f)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_MATERIALIZE(f)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_MEMOIZE(f)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_PLAIN(f)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+COMMIT;
+
+-- Non-working cases. We can't force a foreign join between these tables,
+-- because they aren't foreign tables. We also can't use two different
+-- strategies on the same table, nor can we put both tables on the inner
+-- side of the same join.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'FOREIGN_JOIN((f d))';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_PLAIN(f) NESTED_LOOP_MATERIALIZE(f)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+SET LOCAL pg_plan_advice.advice = 'NESTED_LOOP_PLAIN(f d)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+COMMIT;
diff --git a/contrib/pg_plan_advice/sql/local_collector.sql b/contrib/pg_plan_advice/sql/local_collector.sql
new file mode 100644
index 00000000000..db1e23488d4
--- /dev/null
+++ b/contrib/pg_plan_advice/sql/local_collector.sql
@@ -0,0 +1,44 @@
+CREATE EXTENSION pg_plan_advice;
+SET debug_parallel_query = off;
+
+-- Try clearing advice before we've collected any.
+SELECT pg_clear_collected_local_advice();
+
+-- Set a small advice collection limit so that we'll exceed it.
+SET pg_plan_advice.local_collection_limit = 2;
+
+-- Enable the collector.
+SET pg_plan_advice.local_collector = on;
+
+-- Set up a dummy table.
+CREATE TABLE dummy_table (a int primary key, b text)
+ WITH (autovacuum_enabled = false, parallel_workers = 0);
+
+-- Test queries.
+SELECT * FROM dummy_table a, dummy_table b;
+SELECT * FROM dummy_table;
+
+-- Should return the advice from the second test query.
+SELECT advice FROM pg_get_collected_local_advice() ORDER BY id DESC LIMIT 1;
+
+-- Now try clearing advice again.
+SELECT pg_clear_collected_local_advice();
+
+-- Raise the collection limit so that the collector uses multiple chunks.
+SET pg_plan_advice.local_collection_limit = 2000;
+
+-- Push a bunch of queries through the collector.
+DO $$
+BEGIN
+ FOR x IN 1..2000 LOOP
+ EXECUTE 'SELECT * FROM dummy_table';
+ END LOOP;
+END
+$$;
+
+-- Check that the collector worked.
+SELECT COUNT(*) FROM pg_get_collected_local_advice();
+
+-- And clear one more time, to verify that this doesn't cause a problem
+-- even with a larger number of entries.
+SELECT pg_clear_collected_local_advice();
diff --git a/contrib/pg_plan_advice/sql/partitionwise.sql b/contrib/pg_plan_advice/sql/partitionwise.sql
new file mode 100644
index 00000000000..c51456dbbb5
--- /dev/null
+++ b/contrib/pg_plan_advice/sql/partitionwise.sql
@@ -0,0 +1,99 @@
+LOAD 'pg_plan_advice';
+SET max_parallel_workers_per_gather = 0;
+SET enable_partitionwise_join = true;
+
+CREATE TABLE pt1 (id integer primary key, dim1 text, val1 int)
+ PARTITION BY RANGE (id);
+CREATE TABLE pt1a PARTITION OF pt1 FOR VALUES FROM (1) to (1001)
+ WITH (autovacuum_enabled = false);
+CREATE TABLE pt1b PARTITION OF pt1 FOR VALUES FROM (1001) to (2001)
+ WITH (autovacuum_enabled = false);
+CREATE TABLE pt1c PARTITION OF pt1 FOR VALUES FROM (2001) to (3001)
+ WITH (autovacuum_enabled = false);
+INSERT INTO pt1 (id, dim1, val1)
+ SELECT g, 'some filler text ' || g, (g % 3) + 1
+ FROM generate_series(1,3000) g;
+VACUUM ANALYZE pt1;
+
+CREATE TABLE pt2 (id integer primary key, dim2 text, val2 int)
+ PARTITION BY RANGE (id);
+CREATE TABLE pt2a PARTITION OF pt2 FOR VALUES FROM (1) to (1001)
+ WITH (autovacuum_enabled = false);
+CREATE TABLE pt2b PARTITION OF pt2 FOR VALUES FROM (1001) to (2001)
+ WITH (autovacuum_enabled = false);
+CREATE TABLE pt2c PARTITION OF pt2 FOR VALUES FROM (2001) to (3001)
+ WITH (autovacuum_enabled = false);
+INSERT INTO pt2 (id, dim2, val2)
+ SELECT g, 'some other text ' || g, (g % 5) + 1
+ FROM generate_series(1,3000,2) g;
+VACUUM ANALYZE pt2;
+
+CREATE TABLE pt3 (id integer primary key, dim3 text, val3 int)
+ PARTITION BY RANGE (id);
+CREATE TABLE pt3a PARTITION OF pt3 FOR VALUES FROM (1) to (1001)
+ WITH (autovacuum_enabled = false);
+CREATE TABLE pt3b PARTITION OF pt3 FOR VALUES FROM (1001) to (2001)
+ WITH (autovacuum_enabled = false);
+CREATE TABLE pt3c PARTITION OF pt3 FOR VALUES FROM (2001) to (3001)
+ WITH (autovacuum_enabled = false);
+INSERT INTO pt3 (id, dim3, val3)
+ SELECT g, 'a third random text ' || g, (g % 7) + 1
+ FROM generate_series(1,3000,3) g;
+VACUUM ANALYZE pt3;
+
+CREATE TABLE ptmismatch (id integer primary key, dimm text, valm int)
+ PARTITION BY RANGE (id);
+CREATE TABLE ptmismatcha PARTITION OF ptmismatch
+ FOR VALUES FROM (1) to (1501)
+ WITH (autovacuum_enabled = false);
+CREATE TABLE ptmismatchb PARTITION OF ptmismatch
+ FOR VALUES FROM (1501) to (3001)
+ WITH (autovacuum_enabled = false);
+INSERT INTO ptmismatch (id, dimm, valm)
+ SELECT g, 'yet another text ' || g, (g % 2) + 1
+ FROM generate_series(1,3000) g;
+VACUUM ANALYZE ptmismatch;
+
+EXPLAIN (PLAN_ADVICE, COSTS OFF)
+SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id
+ AND val1 = 1 AND val2 = 1 AND val3 = 1;
+
+-- Suppress partitionwise join, or do it just partially.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'PARTITIONWISE(pt1 pt2 pt3)';
+EXPLAIN (PLAN_ADVICE, COSTS OFF)
+SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id
+ AND val1 = 1 AND val2 = 1 AND val3 = 1;
+SET LOCAL pg_plan_advice.advice = 'PARTITIONWISE((pt1 pt2) pt3)';
+EXPLAIN (PLAN_ADVICE, COSTS OFF)
+SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id
+ AND val1 = 1 AND val2 = 1 AND val3 = 1;
+COMMIT;
+
+-- Test conflicting advice.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'PARTITIONWISE((pt1 pt2) (pt1 pt3))';
+EXPLAIN (PLAN_ADVICE, COSTS OFF)
+SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id
+ AND val1 = 1 AND val2 = 1 AND val3 = 1;
+COMMIT;
+
+-- Can't force a partitionwise join with a mismatched table.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'PARTITIONWISE((pt1 ptmismatch))';
+EXPLAIN (PLAN_ADVICE, COSTS OFF)
+SELECT * FROM pt1, ptmismatch WHERE pt1.id = ptmismatch.id;
+COMMIT;
+
+-- Force join order for a particular branch of the partitionwise join with
+-- and without mentioning the schema name.
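+-- (The alias/[schema.]table form names one partition of a partitioned
+-- alias, so pt3/pt3a is the branch of pt3 that scans partition pt3a.)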
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'JOIN_ORDER(pt3/public.pt3a pt2/public.pt2a pt1/public.pt1a)';
+EXPLAIN (PLAN_ADVICE, COSTS OFF)
+SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id
+ AND val1 = 1 AND val2 = 1 AND val3 = 1;
+SET LOCAL pg_plan_advice.advice = 'JOIN_ORDER(pt3/pt3a pt2/pt2a pt1/pt1a)';
+EXPLAIN (PLAN_ADVICE, COSTS OFF)
+SELECT * FROM pt1, pt2, pt3 WHERE pt1.id = pt2.id AND pt2.id = pt3.id
+ AND val1 = 1 AND val2 = 1 AND val3 = 1;
+COMMIT;
diff --git a/contrib/pg_plan_advice/sql/prepared.sql b/contrib/pg_plan_advice/sql/prepared.sql
new file mode 100644
index 00000000000..3ec30eedee5
--- /dev/null
+++ b/contrib/pg_plan_advice/sql/prepared.sql
@@ -0,0 +1,37 @@
+LOAD 'pg_plan_advice';
+SET max_parallel_workers_per_gather = 0;
+
+CREATE TABLE ptab (id integer, val text) WITH (autovacuum_enabled = false);
+
+SET pg_plan_advice.always_store_advice_details = false;
+
+-- Not prepared, so advice should be generated.
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM ptab;
+
+-- Prepared, so advice should not be generated.
+PREPARE pt1 AS SELECT * FROM ptab;
+EXPLAIN (COSTS OFF, PLAN_ADVICE) EXECUTE pt1;
+
+SET pg_plan_advice.always_store_advice_details = true;
+
+-- Prepared, but always_store_advice_details = true, so should show advice.
+PREPARE pt2 AS SELECT * FROM ptab;
+EXPLAIN (COSTS OFF, PLAN_ADVICE) EXECUTE pt2;
+
+-- Not prepared, so feedback should be generated.
+SET pg_plan_advice.always_store_advice_details = false;
+SET pg_plan_advice.advice = 'SEQ_SCAN(ptab)';
+EXPLAIN (COSTS OFF)
+SELECT * FROM ptab;
+
+-- Prepared, so advice should not be generated.
+PREPARE pt3 AS SELECT * FROM ptab;
+EXPLAIN (COSTS OFF) EXECUTE pt3;
+
+SET pg_plan_advice.always_store_advice_details = true;
+
+-- Prepared, but always_store_advice_details = true, so should show feedback.
+PREPARE pt4 AS SELECT * FROM ptab;
+EXPLAIN (COSTS OFF, PLAN_ADVICE) EXECUTE pt4;
+
diff --git a/contrib/pg_plan_advice/sql/scan.sql b/contrib/pg_plan_advice/sql/scan.sql
new file mode 100644
index 00000000000..4fc494c7d8e
--- /dev/null
+++ b/contrib/pg_plan_advice/sql/scan.sql
@@ -0,0 +1,195 @@
+LOAD 'pg_plan_advice';
+SET max_parallel_workers_per_gather = 0;
+SET seq_page_cost = 0.1;
+SET random_page_cost = 0.1;
+SET cpu_tuple_cost = 0;
+SET cpu_index_tuple_cost = 0;
+
+CREATE TABLE scan_table (a int primary key, b text)
+ WITH (autovacuum_enabled = false);
+INSERT INTO scan_table
+ SELECT g, 'some text ' || g FROM generate_series(1, 100000) g;
+CREATE INDEX scan_table_b ON scan_table USING brin (b);
+VACUUM ANALYZE scan_table;
+
+-- Sequential scan
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table;
+
+-- Index scan
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+
+-- Index-only scan
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1;
+
+-- Bitmap heap scan
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table
+ WHERE b > 'some text 8';
+
+-- TID scan
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE ctid = '(0,1)';
+
+-- TID range scan
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table
+ WHERE ctid > '(1,1)' AND ctid < '(2,1)';
+
+-- Try forcing each of our test queries to use the scan type they
+-- wanted to use anyway. This should succeed.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(scan_table)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table;
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(scan_table scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1;
+SET LOCAL pg_plan_advice.advice = 'BITMAP_HEAP_SCAN(scan_table)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table
+ WHERE b > 'some text 8';
+SET LOCAL pg_plan_advice.advice = 'TID_SCAN(scan_table)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE ctid = '(0,1)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table
+ WHERE ctid > '(1,1)' AND ctid < '(2,1)';
+COMMIT;
+
+-- Try to force a full scan of the table to use some other scan type. All
+-- of these will fail. An index scan or bitmap heap scan could potentially
+-- generate the correct answer, but the planner does not even consider these
+-- possibilities due to the lack of a WHERE clause.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table;
+SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(scan_table scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table;
+SET LOCAL pg_plan_advice.advice = 'BITMAP_HEAP_SCAN(scan_table)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table;
+SET LOCAL pg_plan_advice.advice = 'TID_SCAN(scan_table)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table;
+COMMIT;
+
+-- Try again to force index use. This should now succeed for the INDEX_SCAN
+-- and BITMAP_HEAP_SCAN, but the INDEX_ONLY_SCAN can't be forced because the
+-- query fetches columns not included in the index.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a > 0;
+SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(scan_table scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a > 0;
+SET LOCAL pg_plan_advice.advice = 'BITMAP_HEAP_SCAN(scan_table)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a > 0;
+COMMIT;
+
+-- We can force a primary key lookup to use a sequential scan, but we
+-- can't force it to use an index-only scan (due to the column list)
+-- or a TID scan (due to the absence of a TID qual).
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(scan_table)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(scan_table scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+SET LOCAL pg_plan_advice.advice = 'TID_SCAN(scan_table)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+COMMIT;
+
+-- We can forcibly downgrade an index-only scan to an index scan, but we can't
+-- force the use of an index that the planner thinks is inapplicable.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table public.scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_b)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+COMMIT;
+
+-- We can force the use of a sequential scan in place of a bitmap heap scan,
+-- but a plain index scan on a BRIN index is not possible.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(scan_table)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table
+ WHERE b > 'some text 8';
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_b)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table
+  WHERE b > 'some text 8';
+COMMIT;
+
+-- We can force the use of a sequential scan rather than a TID scan or
+-- TID range scan.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(scan_table)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE ctid = '(0,1)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table
+ WHERE ctid > '(1,1)' AND ctid < '(2,1)';
+COMMIT;
+
+-- Test more complex scenarios with index scans.
+BEGIN;
+-- Should still work if we mention the schema.
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table public.scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+-- But not if we mention the wrong schema.
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table cilbup.scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+-- It's OK to repeat the same advice.
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey scan_table scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+-- But it doesn't work if the index target is even notionally different.
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table scan_table_pkey scan_table public.scan_table_pkey)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT * FROM scan_table WHERE a = 1;
+COMMIT;
+
+-- Test assorted incorrect advice.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(nothing)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1;
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(nothing whatsoever)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1;
+SET LOCAL pg_plan_advice.advice = 'INDEX_SCAN(scan_table bogus)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1;
+SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(nothing whatsoever)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1;
+SET LOCAL pg_plan_advice.advice = 'INDEX_ONLY_SCAN(scan_table bogus)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE) SELECT a FROM scan_table WHERE a = 1;
+COMMIT;
+
+-- Test our ability to refer to multiple instances of the same alias.
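+-- (The #N suffix selects the Nth occurrence of an alias, so s#2 names the
+-- second scan_table s in the query.)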
+BEGIN;
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM (generate_series(1,10) g LEFT JOIN scan_table s ON g = s.a) x
+ LEFT JOIN scan_table s ON g = s.a;
+SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM (generate_series(1,10) g LEFT JOIN scan_table s ON g = s.a) x
+ LEFT JOIN scan_table s ON g = s.a;
+SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s#2)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM (generate_series(1,10) g LEFT JOIN scan_table s ON g = s.a) x
+ LEFT JOIN scan_table s ON g = s.a;
+SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s) SEQ_SCAN(s#2)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM (generate_series(1,10) g LEFT JOIN scan_table s ON g = s.a) x
+ LEFT JOIN scan_table s ON g = s.a;
+COMMIT;
+
+-- Test our ability to refer to scans within a subquery.
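+-- (The @name suffix qualifies an alias by its enclosing subquery's alias;
+-- a subquery without an alias is addressed as @unnamed_subquery.)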
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0) x;
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0);
+BEGIN;
+-- Should not match.
+SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0) x;
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0);
+-- Should match first query only.
+SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s@x)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0) x;
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0);
+-- Should match second query only.
+SET LOCAL pg_plan_advice.advice = 'SEQ_SCAN(s@unnamed_subquery)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0) x;
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM (SELECT * FROM scan_table s WHERE a = 1 OFFSET 0);
+COMMIT;
diff --git a/contrib/pg_plan_advice/sql/semijoin.sql b/contrib/pg_plan_advice/sql/semijoin.sql
new file mode 100644
index 00000000000..5a4ae52d1d9
--- /dev/null
+++ b/contrib/pg_plan_advice/sql/semijoin.sql
@@ -0,0 +1,118 @@
+LOAD 'pg_plan_advice';
+SET max_parallel_workers_per_gather = 0;
+
+CREATE TABLE sj_wide (
+ id integer primary key,
+ val1 integer,
+ padding text storage plain
+) WITH (autovacuum_enabled = false);
+INSERT INTO sj_wide
+ SELECT g, g%10+1, repeat(' ', 300) FROM generate_series(1, 1000) g;
+CREATE INDEX ON sj_wide (val1);
+VACUUM ANALYZE sj_wide;
+
+CREATE TABLE sj_narrow (
+ id integer primary key,
+ val1 integer
+) WITH (autovacuum_enabled = false);
+INSERT INTO sj_narrow
+ SELECT g, g%10+1 FROM generate_series(1, 1000) g;
+CREATE INDEX ON sj_narrow (val1);
+VACUUM ANALYZE sj_narrow;
+
+-- We expect this to make the VALUES list unique and use index lookups to
+-- find the rows in sj_wide, so as to avoid a full scan of sj_wide.
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM sj_wide
+ WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
+
+-- If we ask for a unique semijoin, we should get the same plan as with
+-- no advice. If we ask for a non-unique semijoin, we should see a Semi
+-- Join operation in the plan tree.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'semijoin_unique("*VALUES*")';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM sj_wide
+ WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
+SET LOCAL pg_plan_advice.advice = 'semijoin_non_unique("*VALUES*")';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM sj_wide
+ WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
+COMMIT;
+
+-- Because this table is narrower than the previous one, a sequential scan
+-- is less expensive, and we choose a straightforward Semi Join plan by
+-- default. (Note that this is also very sensitive to the length of the IN
+-- list, which affects how many index lookups the alternative plan will need.)
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM sj_narrow
+ WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
+
+-- Here, we expect advising a unique semijoin to switch to the same plan that
+-- we got with sj_wide, while advising a non-unique semijoin should not change
+-- the plan.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'semijoin_unique("*VALUES*")';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM sj_narrow
+ WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
+SET LOCAL pg_plan_advice.advice = 'semijoin_non_unique("*VALUES*")';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM sj_narrow
+ WHERE (id, val1) IN (VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
+COMMIT;
+
+-- In the above example, we made the outer side of the join unique, but here,
+-- we should make the inner side unique.
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM generate_series(1,1000) g
+ WHERE g in (select val1 from sj_narrow);
+
+-- We should be able to force a plan with or without the make-unique strategy,
+-- with either side as the driving table.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'semijoin_unique(sj_narrow)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM generate_series(1,1000) g
+ WHERE g in (select val1 from sj_narrow);
+SET LOCAL pg_plan_advice.advice = 'semijoin_non_unique(sj_narrow)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM generate_series(1,1000) g
+ WHERE g in (select val1 from sj_narrow);
+SET LOCAL pg_plan_advice.advice = 'semijoin_unique(sj_narrow) join_order(sj_narrow)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM generate_series(1,1000) g
+ WHERE g in (select val1 from sj_narrow);
+SET LOCAL pg_plan_advice.advice = 'semijoin_non_unique(sj_narrow) join_order(sj_narrow)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM generate_series(1,1000) g
+ WHERE g in (select val1 from sj_narrow);
+COMMIT;
+
+-- However, mentioning the wrong side of the join should result in an advice
+-- failure.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'semijoin_unique(g)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM generate_series(1,1000) g
+ WHERE g in (select val1 from sj_narrow);
+SET LOCAL pg_plan_advice.advice = 'semijoin_non_unique(g)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM generate_series(1,1000) g
+ WHERE g in (select val1 from sj_narrow);
+COMMIT;
+
+-- Test conflicting advice.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'semijoin_unique(sj_narrow) semijoin_non_unique(sj_narrow)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM generate_series(1,1000) g
+ WHERE g in (select val1 from sj_narrow);
+COMMIT;
+
+-- Try applying SEMIJOIN_UNIQUE() to a non-semijoin.
+BEGIN;
+SET LOCAL pg_plan_advice.advice = 'semijoin_unique(g)';
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+SELECT * FROM generate_series(1,1000) g, sj_narrow s WHERE g = s.val1;
+COMMIT;
diff --git a/contrib/pg_plan_advice/sql/syntax.sql b/contrib/pg_plan_advice/sql/syntax.sql
new file mode 100644
index 00000000000..56a5d54e2b5
--- /dev/null
+++ b/contrib/pg_plan_advice/sql/syntax.sql
@@ -0,0 +1,68 @@
+LOAD 'pg_plan_advice';
+
+-- An empty string is allowed. Empty target lists are allowed for most advice
+-- tags, but not for JOIN_ORDER. "Supplied Plan Advice" should be omitted in
+-- text format when there is no actual advice, but not in non-text format.
+SET pg_plan_advice.advice = '';
+EXPLAIN (COSTS OFF) SELECT 1;
+SET pg_plan_advice.advice = 'SEQ_SCAN()';
+EXPLAIN (COSTS OFF) SELECT 1;
+SET pg_plan_advice.advice = 'NESTED_LOOP_PLAIN()';
+EXPLAIN (COSTS OFF, FORMAT JSON) SELECT 1;
+SET pg_plan_advice.advice = 'JOIN_ORDER()';
+
+-- Test assorted variations in capitalization, whitespace, and which parts of
+-- the relation identifier are included. These should all work.
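+-- (Putting the pieces together, an identifier has the general shape
+-- alias[#occurrence][/[schema.]relation][@subquery].)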
+SET pg_plan_advice.advice = 'SEQ_SCAN(x)';
+EXPLAIN (COSTS OFF) SELECT 1;
+SET pg_plan_advice.advice = 'seq_scan(x@y)';
+EXPLAIN (COSTS OFF) SELECT 1;
+SET pg_plan_advice.advice = 'SEQ_scan(x#2)';
+EXPLAIN (COSTS OFF) SELECT 1;
+SET pg_plan_advice.advice = 'SEQ_SCAN (x/y)';
+EXPLAIN (COSTS OFF) SELECT 1;
+SET pg_plan_advice.advice = ' SEQ_SCAN ( x / y . z ) ';
+EXPLAIN (COSTS OFF) SELECT 1;
+SET pg_plan_advice.advice = 'SEQ_SCAN("x"#2/"y"."z"@"t")';
+EXPLAIN (COSTS OFF) SELECT 1;
+
+-- Syntax errors.
+SET pg_plan_advice.advice = 'SEQUENTIAL_SCAN(x)';
+SET pg_plan_advice.advice = 'SEQ_SCAN';
+SET pg_plan_advice.advice = 'SEQ_SCAN(';
+SET pg_plan_advice.advice = 'SEQ_SCAN("';
+SET pg_plan_advice.advice = 'SEQ_SCAN("")';
+SET pg_plan_advice.advice = 'SEQ_SCAN("a"';
+SET pg_plan_advice.advice = 'SEQ_SCAN(#';
+SET pg_plan_advice.advice = '()';
+SET pg_plan_advice.advice = '123';
+
+-- Tags like SEQ_SCAN and NO_GATHER don't allow sublists at all; other tags,
+-- except for JOIN_ORDER, allow at most one level of sublist. Hence, these
+-- examples should error out.
+SET pg_plan_advice.advice = 'SEQ_SCAN((x))';
+SET pg_plan_advice.advice = 'GATHER(((x)))';
+
+-- Legal comments.
+SET pg_plan_advice.advice = '/**/';
+EXPLAIN (COSTS OFF) SELECT 1;
+SET pg_plan_advice.advice = 'HASH_JOIN(_)/***/';
+EXPLAIN (COSTS OFF) SELECT 1;
+SET pg_plan_advice.advice = '/* comment */ HASH_JOIN(/*x*/y)';
+EXPLAIN (COSTS OFF) SELECT 1;
+SET pg_plan_advice.advice = '/* comment */ HASH_JOIN(y//*x*/z)';
+EXPLAIN (COSTS OFF) SELECT 1;
+
+-- Unterminated comments.
+SET pg_plan_advice.advice = '/*';
+SET pg_plan_advice.advice = 'JOIN_ORDER("fOO") /* oops';
+
+-- Nested comments are not supported, so the first of these is legal and
+-- the second is not.
+SET pg_plan_advice.advice = '/*/*/';
+EXPLAIN (COSTS OFF) SELECT 1;
+SET pg_plan_advice.advice = '/*/* stuff */*/';
+
+-- Foreign join requires multiple relation identifiers.
+SET pg_plan_advice.advice = 'FOREIGN_JOIN(a)';
+SET pg_plan_advice.advice = 'FOREIGN_JOIN((a))';
diff --git a/contrib/pg_plan_advice/t/001_regress.pl b/contrib/pg_plan_advice/t/001_regress.pl
new file mode 100644
index 00000000000..67595cddf75
--- /dev/null
+++ b/contrib/pg_plan_advice/t/001_regress.pl
@@ -0,0 +1,148 @@
+# Copyright (c) 2021-2025, PostgreSQL Global Development Group
+
+# Run the core regression tests under pg_plan_advice to check for problems.
+use strict;
+use warnings FATAL => 'all';
+
+use Cwd qw(abs_path);
+use File::Basename qw(dirname);
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# Initialize the primary node
+my $node = PostgreSQL::Test::Cluster->new('main');
+$node->init();
+
+# Set up our desired configuration.
+#
+# We run with pg_plan_advice.shared_collection_limit set to ensure that the
+# plan tree walker code runs against every query in the regression tests. If
+# we're unable to properly analyze any of those plan trees, this test should fail.
+#
+# We set pg_plan_advice.advice to an advice string that will cause the advice
+# trove to be populated with a few entries of various sorts, but which we do
+# not expect to match anything in the regression test queries. This way, the
+# planner hooks will be called, improving code coverage, but no plans should
+# actually change.
+#
+# pg_plan_advice.always_explain_supplied_advice=false is needed to avoid breaking
+# regression test queries that use EXPLAIN. In the real world, it seems like
+# users will want EXPLAIN output to show supplied advice so that it's clear
+# whether normal planner behavior has been altered, but here that's undesirable.
+# (The specific limit and advice values below are illustrative placeholders;
+# the GUCs themselves are the ones described in the comment above.)
+$node->append_conf('postgresql.conf', <<EOF);
+shared_preload_libraries = 'pg_plan_advice'
+pg_plan_advice.shared_collection_limit = 100000
+pg_plan_advice.advice = 'SEQ_SCAN(no_such_alias) HASH_JOIN(no_such_alias) JOIN_ORDER(no_such_alias other_missing_alias)'
+pg_plan_advice.always_explain_supplied_advice = false
+EOF
+
+$node->start;
+
+my $srcdir = abs_path("../..");
+
+# --dlpath is needed to be able to find the location of regress.so
+# and any libraries the regression tests require.
+my $dlpath = dirname($ENV{REGRESS_SHLIB});
+
+# --outputdir points to the path where to place the output files.
+my $outputdir = $PostgreSQL::Test::Utils::tmp_check;
+
+# --inputdir points to the path of the input files.
+my $inputdir = "$srcdir/src/test/regress";
+
+# Run the tests.
+my $rc =
+ system($ENV{PG_REGRESS} . " "
+ . "--bindir= "
+ . "--dlpath=\"$dlpath\" "
+ . "--host=" . $node->host . " "
+ . "--port=" . $node->port . " "
+ . "--schedule=$srcdir/src/test/regress/parallel_schedule "
+ . "--max-concurrent-tests=20 "
+ . "--inputdir=\"$inputdir\" "
+ . "--outputdir=\"$outputdir\"");
+
+# Dump out the regression diffs file, if there is one
+if ($rc != 0)
+{
+ my $diffs = "$outputdir/regression.diffs";
+ if (-e $diffs)
+ {
+ print "=== dumping $diffs ===\n";
+ print slurp_file($diffs);
+ print "=== EOF ===\n";
+ }
+}
+
+# Report results
+is($rc, 0, 'regression tests pass');
+
+# Create the extension so we can access the collector
+$node->safe_psql('postgres', 'CREATE EXTENSION pg_plan_advice');
+
+# Verify that a large amount of advice was collected
+# (pg_get_collected_shared_advice and its advice column are assumed here, by
+# analogy with pg_get_collected_local_advice used elsewhere in this patch.)
+my $all_query_count = $node->safe_psql('postgres', <<EOS);
+SELECT COUNT(*) FROM pg_get_collected_shared_advice()
+EOS
+cmp_ok($all_query_count, '>', 20000, "copious advice collected");
+
+# Verify that lots of different advice strings were collected
+my $distinct_query_count = $node->safe_psql('postgres', <<EOS);
+SELECT COUNT(DISTINCT advice) FROM pg_get_collected_shared_advice()
+EOS
+cmp_ok($distinct_query_count, '>', 3000, "diverse advice collected");
+
+# We want to test for the presence of our known tags in the collected advice.
+# Put all tags into the hash that follows; map any tags that aren't tested
+# by the core regression tests to 0, and others to 1.
+my %tag_map = (
+ BITMAP_HEAP_SCAN => 1,
+ FOREIGN_JOIN => 0,
+ GATHER => 1,
+ GATHER_MERGE => 1,
+ HASH_JOIN => 1,
+ INDEX_ONLY_SCAN => 1,
+ INDEX_SCAN => 1,
+ JOIN_ORDER => 1,
+ MERGE_JOIN_MATERIALIZE => 1,
+ MERGE_JOIN_PLAIN => 1,
+ NESTED_LOOP_MATERIALIZE => 1,
+ NESTED_LOOP_MEMOIZE => 1,
+ NESTED_LOOP_PLAIN => 1,
+ NO_GATHER => 1,
+ PARTITIONWISE => 1,
+ SEMIJOIN_NON_UNIQUE => 1,
+ SEMIJOIN_UNIQUE => 1,
+ SEQ_SCAN => 1,
+ TID_SCAN => 1,
+);
+for my $tag (sort keys %tag_map)
+{
+ my $checkit = $tag_map{$tag};
+
+ # Search for the given tag. This is not entirely robust: it could get thrown
+ # off by a table alias such as "FOREIGN_JOIN(", but that probably won't
+ # happen in the core regression tests.
+	my $tag_count = $node->safe_psql('postgres', <<EOS);
+SELECT COUNT(*) FROM pg_get_collected_shared_advice()
+WHERE advice LIKE '%$tag(%'
+EOS
+	cmp_ok($tag_count, '>', 10, "multiple uses of $tag") if $checkit;
+
+ # Regardless, note the exact count in the log, for human consumption.
+ note("found $tag_count advice strings containing $tag");
+}
+
+# Trigger a partial cleanup of the shared advice collector, and then a full
+# cleanup.
+$node->safe_psql('postgres', <<EOS);
+-- (exact cleanup calls assumed; a partial and then a full cleanup are
+-- intended, per the comment above)
+SELECT pg_clear_collected_shared_advice();
+EOS
+
+done_testing();
diff --git a/contrib/pg_trgm/expected/pg_utf8_trgm.out b/contrib/pg_trgm/expected/pg_utf8_trgm.out
new file mode 100644
--- /dev/null
+++ b/contrib/pg_trgm/expected/pg_utf8_trgm.out
@@ -0,0 +1,8 @@
+SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+-- Index 50 translations of the word "Mathematics"
+CREATE TEMP TABLE mb (s text);
+\copy mb from 'data/trgm_utf8.data'
+CREATE INDEX ON mb USING gist(s gist_trgm_ops);
diff --git a/contrib/pg_trgm/expected/pg_utf8_trgm_1.out b/contrib/pg_trgm/expected/pg_utf8_trgm_1.out
new file mode 100644
index 00000000000..8505c4fa552
--- /dev/null
+++ b/contrib/pg_trgm/expected/pg_utf8_trgm_1.out
@@ -0,0 +1,3 @@
+SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
+\if :skip_test
+\quit
diff --git a/contrib/pg_trgm/meson.build b/contrib/pg_trgm/meson.build
index 3cc299d5eaa..3ecf95ba862 100644
--- a/contrib/pg_trgm/meson.build
+++ b/contrib/pg_trgm/meson.build
@@ -39,6 +39,7 @@ tests += {
'regress': {
'sql': [
'pg_trgm',
+ 'pg_utf8_trgm',
'pg_word_trgm',
'pg_strict_word_trgm',
],
diff --git a/contrib/pg_trgm/sql/pg_utf8_trgm.sql b/contrib/pg_trgm/sql/pg_utf8_trgm.sql
new file mode 100644
index 00000000000..0dd962ced83
--- /dev/null
+++ b/contrib/pg_trgm/sql/pg_utf8_trgm.sql
@@ -0,0 +1,9 @@
+SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+
+-- Index 50 translations of the word "Mathematics"
+CREATE TEMP TABLE mb (s text);
+\copy mb from 'data/trgm_utf8.data'
+CREATE INDEX ON mb USING gist(s gist_trgm_ops);
diff --git a/contrib/pg_trgm/trgm.h b/contrib/pg_trgm/trgm.h
index ca017585369..ca23aad4dd9 100644
--- a/contrib/pg_trgm/trgm.h
+++ b/contrib/pg_trgm/trgm.h
@@ -47,7 +47,7 @@ typedef char trgm[3];
} while(0)
extern int (*CMPTRGM) (const void *a, const void *b);
-#define ISWORDCHR(c) (t_isalnum(c))
+#define ISWORDCHR(c, len) (t_isalnum_with_len(c, len))
#define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && (isalnum( *(unsigned char*)(a) ) || *(unsigned char*)(a)==' ') )
#define ISPRINTABLETRGM(t) ( ISPRINTABLECHAR( ((char*)(t)) ) && ISPRINTABLECHAR( ((char*)(t))+1 ) && ISPRINTABLECHAR( ((char*)(t))+2 ) )
diff --git a/contrib/pg_trgm/trgm_gist.c b/contrib/pg_trgm/trgm_gist.c
index 2f0d61985a5..685275a0f9b 100644
--- a/contrib/pg_trgm/trgm_gist.c
+++ b/contrib/pg_trgm/trgm_gist.c
@@ -701,10 +701,13 @@ gtrgm_penalty(PG_FUNCTION_ARGS)
if (ISARRKEY(newval))
{
char *cache = (char *) fcinfo->flinfo->fn_extra;
- TRGM *cachedVal = (TRGM *) (cache + MAXALIGN(siglen));
+ TRGM *cachedVal = NULL;
Size newvalsize = VARSIZE(newval);
BITVECP sign;
+ if (cache != NULL)
+ cachedVal = (TRGM *) (cache + MAXALIGN(siglen));
+
/*
* Cache the sign data across multiple calls with the same newval.
*/
diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c
index 81182a15e07..5fba594b61f 100644
--- a/contrib/pg_trgm/trgm_op.c
+++ b/contrib/pg_trgm/trgm_op.c
@@ -66,6 +66,78 @@ typedef uint8 TrgmBound;
#define WORD_SIMILARITY_STRICT 0x02 /* force bounds of extent to match
* word bounds */
+/*
+ * A growable array of trigrams
+ *
+ * The actual array of trigrams is in 'datum'. Note that the other fields in
+ * 'datum', i.e. datum->flags and the varlena length, are not kept up to date
+ * when items are added to the growable array. We merely reserve the space
+ * for them here. You must fill those other fields before using 'datum' as a
+ * proper TRGM datum.
+ */
+typedef struct
+{
+ TRGM *datum; /* trigram array */
+ int length; /* number of trigrams in the array */
+ int allocated; /* allocated size of 'datum' (# of trigrams) */
+} growable_trgm_array;
+
+/*
+ * Allocate a new growable array.
+ *
+ * 'slen' is the size of the source string that we're extracting the trigrams
+ * from. It is used to choose the initial size of the array.
+ */
+static void
+init_trgm_array(growable_trgm_array *arr, int slen)
+{
+ size_t init_size;
+
+ /*
+ * In the extreme case, the input string consists entirely of one
+ * character words, like "a b c", where each word is expanded to two
+ * trigrams. This is not a strict upper bound though, because when
+ * IGNORECASE is defined, we convert the input string to lowercase before
+ * extracting the trigrams, which in rare cases can expand one input
+ * character into multiple characters.
+ */
+ init_size = (size_t) slen + 1;
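+	/* e.g. "a b c": slen = 5, and 3 words * 2 trigrams each = 6 = slen + 1 */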
+
+ /*
+ * Guard against possible overflow in the palloc request. (We don't worry
+ * about the additive constants, since palloc can detect requests that are
+ * a little above MaxAllocSize --- we just need to prevent integer
+ * overflow in the multiplications.)
+ */
+ if (init_size > MaxAllocSize / sizeof(trgm))
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("out of memory")));
+
+ arr->datum = palloc(CALCGTSIZE(ARRKEY, init_size));
+ arr->allocated = init_size;
+ arr->length = 0;
+}
+
+/* Make sure the array can hold at least 'needed' more trigrams */
+static void
+enlarge_trgm_array(growable_trgm_array *arr, int needed)
+{
+ size_t new_needed = (size_t) arr->length + needed;
+
+ if (new_needed > arr->allocated)
+ {
+ /* Guard against possible overflow, like in init_trgm_array */
+ if (new_needed > MaxAllocSize / sizeof(trgm))
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("out of memory")));
+
+ arr->datum = repalloc(arr->datum, CALCGTSIZE(ARRKEY, new_needed));
+ arr->allocated = new_needed;
+ }
+}
+
/*
* Module load callback
*/
@@ -220,22 +292,31 @@ comp_trgm(const void *a, const void *b)
* endword points to the character after word
*/
static char *
-find_word(char *str, int lenstr, char **endword, int *charlen)
+find_word(char *str, int lenstr, char **endword)
{
char *beginword = str;
+ const char *endstr = str + lenstr;
- while (beginword - str < lenstr && !ISWORDCHR(beginword))
- beginword += pg_mblen(beginword);
+ while (beginword < endstr)
+ {
+ int clen = pg_mblen_range(beginword, endstr);
- if (beginword - str >= lenstr)
+ if (ISWORDCHR(beginword, clen))
+ break;
+ beginword += clen;
+ }
+
+ if (beginword >= endstr)
return NULL;
*endword = beginword;
- *charlen = 0;
- while (*endword - str < lenstr && ISWORDCHR(*endword))
+ while (*endword < endstr)
{
- *endword += pg_mblen(*endword);
- (*charlen)++;
+ int clen = pg_mblen_range(*endword, endstr);
+
+ if (!ISWORDCHR(*endword, clen))
+ break;
+ *endword += clen;
}
return beginword;
@@ -269,78 +350,138 @@ compact_trigram(trgm *tptr, char *str, int bytelen)
}
/*
- * Adds trigrams from words (already padded).
+ * Adds trigrams from the word in 'str' (already padded if necessary).
*/
-static trgm *
-make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
+static void
+make_trigrams(growable_trgm_array *dst, char *str, int bytelen)
{
+ trgm *tptr;
char *ptr = str;
- if (charlen < 3)
- return tptr;
+ if (bytelen < 3)
+ return;
- if (bytelen > charlen)
- {
- /* Find multibyte character boundaries and apply compact_trigram */
- int lenfirst = pg_mblen(str),
- lenmiddle = pg_mblen(str + lenfirst),
- lenlast = pg_mblen(str + lenfirst + lenmiddle);
+ /* max number of trigrams = strlen - 2 */
+ enlarge_trgm_array(dst, bytelen - 2);
+ tptr = GETARR(dst->datum) + dst->length;
- while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen)
+ if (pg_encoding_max_length(GetDatabaseEncoding()) == 1)
+ {
+ while (ptr < str + bytelen - 2)
{
- compact_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast);
-
- ptr += lenfirst;
+ CPTRGM(tptr, ptr);
+ ptr++;
tptr++;
-
- lenfirst = lenmiddle;
- lenmiddle = lenlast;
- lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
}
}
else
{
- /* Fast path when there are no multibyte characters */
- Assert(bytelen == charlen);
+ int lenfirst,
+ lenmiddle,
+ lenlast;
+ char *endptr;
- while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ )
+ /*
+ * Fast path as long as there are no multibyte characters
+ */
+ if (!IS_HIGHBIT_SET(ptr[0]) && !IS_HIGHBIT_SET(ptr[1]))
{
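+			/*
+			 * ptr[0] and ptr[1] are known to be ASCII here; as the window
+			 * slides one byte at a time, only the incoming byte (ptr[2])
+			 * needs to be checked.
+			 */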
- CPTRGM(tptr, ptr);
- ptr++;
+ while (!IS_HIGHBIT_SET(ptr[2]))
+ {
+ CPTRGM(tptr, ptr);
+ ptr++;
+ tptr++;
+
+ if (ptr == str + bytelen - 2)
+ goto done;
+ }
+
+ lenfirst = 1;
+ lenmiddle = 1;
+ lenlast = pg_mblen_unbounded(ptr + 2);
+ }
+ else
+ {
+ lenfirst = pg_mblen_unbounded(ptr);
+ if (ptr + lenfirst >= str + bytelen)
+ goto done;
+ lenmiddle = pg_mblen_unbounded(ptr + lenfirst);
+ if (ptr + lenfirst + lenmiddle >= str + bytelen)
+ goto done;
+ lenlast = pg_mblen_unbounded(ptr + lenfirst + lenmiddle);
+ }
+
+ /*
+ * Slow path to handle any remaining multibyte characters
+ *
+ * As we go, 'ptr' points to the beginning of the current
+ * three-character string and 'endptr' points to just past it.
+ */
+ endptr = ptr + lenfirst + lenmiddle + lenlast;
+ while (endptr <= str + bytelen)
+ {
+ compact_trigram(tptr, ptr, endptr - ptr);
tptr++;
+
+ /* Advance to the next character */
+ if (endptr == str + bytelen)
+ break;
+ ptr += lenfirst;
+ lenfirst = lenmiddle;
+ lenmiddle = lenlast;
+ lenlast = pg_mblen_unbounded(endptr);
+ endptr += lenlast;
}
}
- return tptr;
+done:
+ dst->length = tptr - GETARR(dst->datum);
+ Assert(dst->length <= dst->allocated);
}
/*
* Make array of trigrams without sorting and removing duplicate items.
*
- * trg: where to return the array of trigrams.
+ * dst: where to return the array of trigrams.
* str: source string, of length slen bytes.
- * bounds: where to return bounds of trigrams (if needed).
- *
- * Returns length of the generated array.
+ * bounds_p: where to return bounds of trigrams (if needed).
*/
-static int
-generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
+static void
+generate_trgm_only(growable_trgm_array *dst, char *str, int slen, TrgmBound **bounds_p)
{
- trgm *tptr;
+ size_t buflen;
char *buf;
- int charlen,
- bytelen;
+ int bytelen;
char *bword,
*eword;
+ TrgmBound *bounds = NULL;
+ int bounds_allocated = 0;
- if (slen + LPADDING + RPADDING < 3 || slen == 0)
- return 0;
+ init_trgm_array(dst, slen);
- tptr = trg;
+ /*
+ * If requested, allocate an array for the bounds, with the same size as
+ * the trigram array.
+ */
+ if (bounds_p)
+ {
+ bounds_allocated = dst->allocated;
+ bounds = *bounds_p = palloc0_array(TrgmBound, bounds_allocated);
+ }
- /* Allocate a buffer for case-folded, blank-padded words */
- buf = (char *) palloc(slen * pg_database_encoding_max_length() + 4);
+ if (slen + LPADDING + RPADDING < 3 || slen == 0)
+ return;
+ /*
+ * Allocate a buffer for case-folded, blank-padded words.
+ *
+ * As an initial guess, allocate a buffer large enough to hold the
+ * original string with padding, which is always enough when compiled with
+ * !IGNORECASE. If the case-folding produces a string longer than the
+ * original, we'll grow the buffer.
+ */
+ buflen = (size_t) slen + 4;
+ buf = (char *) palloc(buflen);
if (LPADDING > 0)
{
*buf = ' ';
@@ -349,52 +490,59 @@ generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
}
eword = str;
- while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
+ while ((bword = find_word(eword, slen - (eword - str), &eword)) != NULL)
{
+ int oldlen;
+
+ /* Convert word to lower case before extracting trigrams from it */
#ifdef IGNORECASE
- bword = str_tolower(bword, eword - bword, DEFAULT_COLLATION_OID);
- bytelen = strlen(bword);
+ {
+ char *lowered;
+
+ lowered = str_tolower(bword, eword - bword, DEFAULT_COLLATION_OID);
+ bytelen = strlen(lowered);
+
+ /* grow the buffer if necessary */
+ if (bytelen > buflen - 4)
+ {
+ pfree(buf);
+ buflen = (size_t) bytelen + 4;
+ buf = (char *) palloc(buflen);
+ if (LPADDING > 0)
+ {
+ *buf = ' ';
+ if (LPADDING > 1)
+ *(buf + 1) = ' ';
+ }
+ }
+ memcpy(buf + LPADDING, lowered, bytelen);
+ pfree(lowered);
+ }
#else
bytelen = eword - bword;
-#endif
-
memcpy(buf + LPADDING, bword, bytelen);
-
-#ifdef IGNORECASE
- pfree(bword);
#endif
buf[LPADDING + bytelen] = ' ';
buf[LPADDING + bytelen + 1] = ' ';
/* Calculate trigrams marking their bounds if needed */
+ oldlen = dst->length;
+ make_trigrams(dst, buf, bytelen + LPADDING + RPADDING);
if (bounds)
- bounds[tptr - trg] |= TRGM_BOUND_LEFT;
- tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
- charlen + LPADDING + RPADDING);
- if (bounds)
- bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT;
+ {
+ if (bounds_allocated < dst->length)
+ {
+ bounds = repalloc0_array(bounds, TrgmBound, bounds_allocated, dst->allocated);
+ bounds_allocated = dst->allocated;
+ }
+
+ bounds[oldlen] |= TRGM_BOUND_LEFT;
+ bounds[dst->length - 1] |= TRGM_BOUND_RIGHT;
+ }
}
pfree(buf);
-
- return tptr - trg;
-}
-
-/*
- * Guard against possible overflow in the palloc requests below. (We
- * don't worry about the additive constants, since palloc can detect
- * requests that are a little above MaxAllocSize --- we just need to
- * prevent integer overflow in the multiplications.)
- */
-static void
-protect_out_of_mem(int slen)
-{
- if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) ||
- (Size) slen >= (MaxAllocSize / pg_database_encoding_max_length()))
- ereport(ERROR,
- (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
- errmsg("out of memory")));
}
/*
@@ -408,19 +556,14 @@ TRGM *
generate_trgm(char *str, int slen)
{
TRGM *trg;
+ growable_trgm_array arr;
int len;
- protect_out_of_mem(slen);
-
- trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
+ generate_trgm_only(&arr, str, slen, NULL);
+ len = arr.length;
+ trg = arr.datum;
trg->flag = ARRKEY;
- len = generate_trgm_only(GETARR(trg), str, slen, NULL);
- SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
-
- if (len == 0)
- return trg;
-
/*
* Make trigrams unique.
*/
@@ -675,8 +818,8 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
{
bool *found;
pos_trgm *ptrg;
- trgm *trg1;
- trgm *trg2;
+ growable_trgm_array trg1;
+ growable_trgm_array trg2;
int len1,
len2,
len,
@@ -685,27 +828,21 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
ulen1;
int *trg2indexes;
float4 result;
- TrgmBound *bounds;
-
- protect_out_of_mem(slen1 + slen2);
+ TrgmBound *bounds = NULL;
/* Make positional trigrams */
- trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3);
- trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3);
- if (flags & WORD_SIMILARITY_STRICT)
- bounds = (TrgmBound *) palloc0(sizeof(TrgmBound) * (slen2 / 2 + 1) * 3);
- else
- bounds = NULL;
- len1 = generate_trgm_only(trg1, str1, slen1, NULL);
- len2 = generate_trgm_only(trg2, str2, slen2, bounds);
+ generate_trgm_only(&trg1, str1, slen1, NULL);
+ len1 = trg1.length;
+ generate_trgm_only(&trg2, str2, slen2, (flags & WORD_SIMILARITY_STRICT) ? &bounds : NULL);
+ len2 = trg2.length;
- ptrg = make_positional_trgm(trg1, len1, trg2, len2);
+ ptrg = make_positional_trgm(GETARR(trg1.datum), len1, GETARR(trg2.datum), len2);
len = len1 + len2;
qsort(ptrg, len, sizeof(pos_trgm), comp_ptrgm);
- pfree(trg1);
- pfree(trg2);
+ pfree(trg1.datum);
+ pfree(trg2.datum);
/*
* Merge positional trigrams array: enumerate each trigram and find its
@@ -761,20 +898,20 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
* str: source string, of length lenstr bytes (need not be null-terminated)
* buf: where to return the substring (must be long enough)
* *bytelen: receives byte length of the found substring
- * *charlen: receives character length of the found substring
*
* Returns pointer to end+1 of the found substring in the source string.
- * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
+ * Returns NULL if no word found (in which case buf and *bytelen are not set)
*
* If the found word is bounded by non-word characters or string boundaries
* then this function will include corresponding padding spaces into buf.
*/
static const char *
get_wildcard_part(const char *str, int lenstr,
- char *buf, int *bytelen, int *charlen)
+ char *buf, int *bytelen)
{
const char *beginword = str;
const char *endword;
+ const char *endstr = str + lenstr;
char *s = buf;
bool in_leading_wildcard_meta = false;
bool in_trailing_wildcard_meta = false;
@@ -787,11 +924,13 @@ get_wildcard_part(const char *str, int lenstr,
* from this loop to the next one, since we may exit at a word character
* that is in_escape.
*/
- while (beginword - str < lenstr)
+ while (beginword < endstr)
{
+ clen = pg_mblen_range(beginword, endstr);
+
if (in_escape)
{
- if (ISWORDCHR(beginword))
+ if (ISWORDCHR(beginword, clen))
break;
in_escape = false;
in_leading_wildcard_meta = false;
@@ -802,12 +941,12 @@ get_wildcard_part(const char *str, int lenstr,
in_escape = true;
else if (ISWILDCARDCHAR(beginword))
in_leading_wildcard_meta = true;
- else if (ISWORDCHR(beginword))
+ else if (ISWORDCHR(beginword, clen))
break;
else
in_leading_wildcard_meta = false;
}
- beginword += pg_mblen(beginword);
+ beginword += clen;
}
/*
@@ -820,18 +959,13 @@ get_wildcard_part(const char *str, int lenstr,
* Add left padding spaces if preceding character wasn't wildcard
* meta-character.
*/
- *charlen = 0;
if (!in_leading_wildcard_meta)
{
if (LPADDING > 0)
{
*s++ = ' ';
- (*charlen)++;
if (LPADDING > 1)
- {
*s++ = ' ';
- (*charlen)++;
- }
}
}
@@ -840,15 +974,14 @@ get_wildcard_part(const char *str, int lenstr,
* string boundary. Strip escapes during copy.
*/
endword = beginword;
- while (endword - str < lenstr)
+ while (endword < endstr)
{
- clen = pg_mblen(endword);
+ clen = pg_mblen_range(endword, endstr);
if (in_escape)
{
- if (ISWORDCHR(endword))
+ if (ISWORDCHR(endword, clen))
{
memcpy(s, endword, clen);
- (*charlen)++;
s += clen;
}
else
@@ -873,10 +1006,9 @@ get_wildcard_part(const char *str, int lenstr,
in_trailing_wildcard_meta = true;
break;
}
- else if (ISWORDCHR(endword))
+ else if (ISWORDCHR(endword, clen))
{
memcpy(s, endword, clen);
- (*charlen)++;
s += clen;
}
else
@@ -894,12 +1026,8 @@ get_wildcard_part(const char *str, int lenstr,
if (RPADDING > 0)
{
*s++ = ' ';
- (*charlen)++;
if (RPADDING > 1)
- {
*s++ = ' ';
- (*charlen)++;
- }
}
}
@@ -918,24 +1046,21 @@ TRGM *
generate_wildcard_trgm(const char *str, int slen)
{
TRGM *trg;
- char *buf,
- *buf2;
- trgm *tptr;
+ growable_trgm_array arr;
+ char *buf;
int len,
- charlen,
bytelen;
const char *eword;
- protect_out_of_mem(slen);
-
- trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
- trg->flag = ARRKEY;
- SET_VARSIZE(trg, TRGMHDRSIZE);
-
if (slen + LPADDING + RPADDING < 3 || slen == 0)
+ {
+ trg = (TRGM *) palloc(TRGMHDRSIZE);
+ trg->flag = ARRKEY;
+ SET_VARSIZE(trg, TRGMHDRSIZE);
return trg;
+ }
- tptr = GETARR(trg);
+ init_trgm_array(&arr, slen);
/* Allocate a buffer for blank-padded, but not yet case-folded, words */
buf = palloc_array(char, slen + 4);
@@ -945,39 +1070,41 @@ generate_wildcard_trgm(const char *str, int slen)
*/
eword = str;
while ((eword = get_wildcard_part(eword, slen - (eword - str),
- buf, &bytelen, &charlen)) != NULL)
+ buf, &bytelen)) != NULL)
{
+ char *word;
+
#ifdef IGNORECASE
- buf2 = str_tolower(buf, bytelen, DEFAULT_COLLATION_OID);
- bytelen = strlen(buf2);
+ word = str_tolower(buf, bytelen, DEFAULT_COLLATION_OID);
+ bytelen = strlen(word);
#else
- buf2 = buf;
+ word = buf;
#endif
/*
* count trigrams
*/
- tptr = make_trigrams(tptr, buf2, bytelen, charlen);
+ make_trigrams(&arr, word, bytelen);
#ifdef IGNORECASE
- pfree(buf2);
+ pfree(word);
#endif
}
pfree(buf);
- if ((len = tptr - GETARR(trg)) == 0)
- return trg;
-
/*
* Make trigrams unique.
*/
+ trg = arr.datum;
+ len = arr.length;
if (len > 1)
{
qsort(GETARR(trg), len, sizeof(trgm), comp_trgm);
len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
}
+ trg->flag = ARRKEY;
SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
return trg;
diff --git a/contrib/pg_trgm/trgm_regexp.c b/contrib/pg_trgm/trgm_regexp.c
index 1d1b5fe304d..efee4cf5fb4 100644
--- a/contrib/pg_trgm/trgm_regexp.c
+++ b/contrib/pg_trgm/trgm_regexp.c
@@ -483,7 +483,7 @@ static TRGM *createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph,
static void RE_compile(regex_t *regex, text *text_re,
int cflags, Oid collation);
static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA);
-static bool convertPgWchar(pg_wchar c, trgm_mb_char *result);
+static int convertPgWchar(pg_wchar c, trgm_mb_char *result);
static void transformGraph(TrgmNFA *trgmNFA);
static void processState(TrgmNFA *trgmNFA, TrgmState *state);
static void addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key);
@@ -807,10 +807,11 @@ getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
for (j = 0; j < charsCount; j++)
{
trgm_mb_char c;
+ int clen = convertPgWchar(chars[j], &c);
- if (!convertPgWchar(chars[j], &c))
+ if (!clen)
continue; /* ok to ignore it altogether */
- if (ISWORDCHR(c.bytes))
+ if (ISWORDCHR(c.bytes, clen))
colorInfo->wordChars[colorInfo->wordCharsCount++] = c;
else
colorInfo->containsNonWord = true;
@@ -822,13 +823,15 @@ getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
/*
* Convert pg_wchar to multibyte format.
- * Returns false if the character should be ignored completely.
+ * Returns 0 if the character should be ignored completely, else returns its
+ * byte length.
*/
-static bool
+static int
convertPgWchar(pg_wchar c, trgm_mb_char *result)
{
/* "s" has enough space for a multibyte character and a trailing NUL */
char s[MAX_MULTIBYTE_CHAR_LEN + 1];
+ int clen;
/*
* We can ignore the NUL character, since it can never appear in a PG text
@@ -836,11 +839,11 @@ convertPgWchar(pg_wchar c, trgm_mb_char *result)
* reconstructing trigrams.
*/
if (c == 0)
- return false;
+ return 0;
/* Do the conversion, making sure the result is NUL-terminated */
memset(s, 0, sizeof(s));
- pg_wchar2mb_with_len(&c, s, 1);
+ clen = pg_wchar2mb_with_len(&c, s, 1);
/*
* In IGNORECASE mode, we can ignore uppercase characters. We assume that
@@ -857,12 +860,12 @@ convertPgWchar(pg_wchar c, trgm_mb_char *result)
*/
#ifdef IGNORECASE
{
- char *lowerCased = str_tolower(s, strlen(s), DEFAULT_COLLATION_OID);
+ char *lowerCased = str_tolower(s, clen, DEFAULT_COLLATION_OID);
if (strcmp(lowerCased, s) != 0)
{
pfree(lowerCased);
- return false;
+ return 0;
}
pfree(lowerCased);
}
@@ -870,7 +873,7 @@ convertPgWchar(pg_wchar c, trgm_mb_char *result)
/* Fill result with exactly MAX_MULTIBYTE_CHAR_LEN bytes */
memcpy(result->bytes, s, MAX_MULTIBYTE_CHAR_LEN);
- return true;
+ return clen;
}
diff --git a/contrib/pg_visibility/expected/pg_visibility.out b/contrib/pg_visibility/expected/pg_visibility.out
index e10f1706015..d26f0ab7589 100644
--- a/contrib/pg_visibility/expected/pg_visibility.out
+++ b/contrib/pg_visibility/expected/pg_visibility.out
@@ -207,7 +207,7 @@ select pg_truncate_visibility_map('test_partition');
-- test the case where vacuum phase I does not need to modify the heap buffer
-- and only needs to set the VM
-create table test_vac_unmodified_heap(a int);
+create temp table test_vac_unmodified_heap(a int);
insert into test_vac_unmodified_heap values (1);
vacuum (freeze) test_vac_unmodified_heap;
select pg_visibility_map_summary('test_vac_unmodified_heap');
diff --git a/contrib/pg_visibility/sql/pg_visibility.sql b/contrib/pg_visibility/sql/pg_visibility.sql
index 57af8a0c5b6..0888adb96a6 100644
--- a/contrib/pg_visibility/sql/pg_visibility.sql
+++ b/contrib/pg_visibility/sql/pg_visibility.sql
@@ -97,7 +97,7 @@ select pg_truncate_visibility_map('test_partition');
-- test the case where vacuum phase I does not need to modify the heap buffer
-- and only needs to set the VM
-create table test_vac_unmodified_heap(a int);
+create temp table test_vac_unmodified_heap(a int);
insert into test_vac_unmodified_heap values (1);
vacuum (freeze) test_vac_unmodified_heap;
select pg_visibility_map_summary('test_vac_unmodified_heap');
diff --git a/contrib/pgcrypto/Makefile b/contrib/pgcrypto/Makefile
index 69afa375011..17d2b0c5ed1 100644
--- a/contrib/pgcrypto/Makefile
+++ b/contrib/pgcrypto/Makefile
@@ -44,7 +44,8 @@ REGRESS = init md5 sha1 hmac-md5 hmac-sha1 blowfish rijndael \
sha2 des 3des cast5 \
crypt-des crypt-md5 crypt-blowfish crypt-xdes \
pgp-armor pgp-decrypt pgp-encrypt pgp-encrypt-md5 $(CF_PGP_TESTS) \
- pgp-pubkey-decrypt pgp-pubkey-encrypt pgp-info crypt-shacrypt
+ pgp-pubkey-decrypt pgp-pubkey-encrypt pgp-pubkey-session \
+ pgp-info crypt-shacrypt
ifdef USE_PGXS
PG_CONFIG = pg_config
diff --git a/contrib/pgcrypto/crypt-sha.c b/contrib/pgcrypto/crypt-sha.c
index 7ec21771a83..e8f32bc3896 100644
--- a/contrib/pgcrypto/crypt-sha.c
+++ b/contrib/pgcrypto/crypt-sha.c
@@ -328,7 +328,7 @@ px_crypt_shacrypt(const char *pw, const char *salt, char *passwd, unsigned dstle
ereport(ERROR,
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("invalid character in salt string: \"%.*s\"",
- pg_mblen(ep), ep));
+ pg_mblen_cstr(ep), ep));
}
else
{
diff --git a/contrib/pgcrypto/expected/pgp-decrypt.out b/contrib/pgcrypto/expected/pgp-decrypt.out
index eb049ba9d44..8ce6466f2e9 100644
--- a/contrib/pgcrypto/expected/pgp-decrypt.out
+++ b/contrib/pgcrypto/expected/pgp-decrypt.out
@@ -315,7 +315,7 @@ SaV9L04ky1qECNDx3XjnoKLC+H7IOQ==
\xda39a3ee5e6b4b0d3255bfef95601890afd80709
(1 row)
-select digest(pgp_sym_decrypt(dearmor('
+select digest(pgp_sym_decrypt_bytea(dearmor('
-----BEGIN PGP MESSAGE-----
Comment: dat3.aes.sha1.mdc.s2k3.z0
@@ -387,6 +387,28 @@ ERROR: Wrong key or corrupt data
select pgp_sym_decrypt(pgp_sym_encrypt_bytea('P', 'key'), 'key', 'debug=1');
NOTICE: dbg: parse_literal_data: data type=b
ERROR: Not text data
+-- NUL byte in text decrypt. Ciphertext source:
+-- printf 'a\x00\xc' | gpg --homedir /nonexistent \
+-- --personal-compress-preferences uncompressed --textmode \
+-- --personal-cipher-preferences aes --no-emit-version --batch \
+-- --symmetric --passphrase key --armor
+do $$
+begin
+ perform pgp_sym_decrypt(dearmor('
+-----BEGIN PGP MESSAGE-----
+
+jA0EBwMCXLc8pozB10Fg0jQBVUID59TLvWutJp0j6eh9ZgjqIRzdYaIymFB8y4XH
+vu0YlJP5D5BX7yqZ+Pry7TlDmiFO
+=rV7z
+-----END PGP MESSAGE-----
+'), 'key', 'debug=1');
+exception when others then
+ raise '%',
+ regexp_replace(sqlerrm, 'encoding "[^"]*"', 'encoding [REDACTED]');
+end
+$$;
+ERROR: invalid byte sequence for encoding [REDACTED]: 0x00
+CONTEXT: PL/pgSQL function inline_code_block line 12 at RAISE
-- Decryption with a certain incorrect key yields an apparent BZip2-compressed
-- plaintext. Ciphertext source: iterative pgp_sym_encrypt('secret', 'key')
-- until the random prefix gave rise to that property.
diff --git a/contrib/pgcrypto/expected/pgp-decrypt_1.out b/contrib/pgcrypto/expected/pgp-decrypt_1.out
index 80a4c48613d..ee57ad43cb7 100644
--- a/contrib/pgcrypto/expected/pgp-decrypt_1.out
+++ b/contrib/pgcrypto/expected/pgp-decrypt_1.out
@@ -311,7 +311,7 @@ SaV9L04ky1qECNDx3XjnoKLC+H7IOQ==
\xda39a3ee5e6b4b0d3255bfef95601890afd80709
(1 row)
-select digest(pgp_sym_decrypt(dearmor('
+select digest(pgp_sym_decrypt_bytea(dearmor('
-----BEGIN PGP MESSAGE-----
Comment: dat3.aes.sha1.mdc.s2k3.z0
@@ -383,6 +383,28 @@ ERROR: Wrong key or corrupt data
select pgp_sym_decrypt(pgp_sym_encrypt_bytea('P', 'key'), 'key', 'debug=1');
NOTICE: dbg: parse_literal_data: data type=b
ERROR: Not text data
+-- NUL byte in text decrypt. Ciphertext source:
+-- printf 'a\x00\xc' | gpg --homedir /nonexistent \
+-- --personal-compress-preferences uncompressed --textmode \
+-- --personal-cipher-preferences aes --no-emit-version --batch \
+-- --symmetric --passphrase key --armor
+do $$
+begin
+ perform pgp_sym_decrypt(dearmor('
+-----BEGIN PGP MESSAGE-----
+
+jA0EBwMCXLc8pozB10Fg0jQBVUID59TLvWutJp0j6eh9ZgjqIRzdYaIymFB8y4XH
+vu0YlJP5D5BX7yqZ+Pry7TlDmiFO
+=rV7z
+-----END PGP MESSAGE-----
+'), 'key', 'debug=1');
+exception when others then
+ raise '%',
+ regexp_replace(sqlerrm, 'encoding "[^"]*"', 'encoding [REDACTED]');
+end
+$$;
+ERROR: invalid byte sequence for encoding [REDACTED]: 0x00
+CONTEXT: PL/pgSQL function inline_code_block line 12 at RAISE
-- Decryption with a certain incorrect key yields an apparent BZip2-compressed
-- plaintext. Ciphertext source: iterative pgp_sym_encrypt('secret', 'key')
-- until the random prefix gave rise to that property.
diff --git a/contrib/pgcrypto/expected/pgp-pubkey-session.out b/contrib/pgcrypto/expected/pgp-pubkey-session.out
new file mode 100644
index 00000000000..f724d98eb24
--- /dev/null
+++ b/contrib/pgcrypto/expected/pgp-pubkey-session.out
@@ -0,0 +1,47 @@
+-- Test for overflow with session key at decrypt.
+-- Data automatically generated by scripts/pgp_session_data.py.
+-- See this file for details explaining how this data is generated.
+SELECT pgp_pub_decrypt_bytea(
+'\xc1c04c030000000000000000020800a46f5b9b1905b49457a6485474f71ed9b46c2527e1
+da08e1f7871e12c3d38828f2076b984a595bf60f616599ca5729d547de06a258bfbbcd30
+94a321e4668cd43010f0ca8ecf931e5d39bda1152c50c367b11c723f270729245d3ebdbd
+0694d320c5a5aa6a405fb45182acb3d7973cbce398e0c5060af7603cfd9ed186ebadd616
+3b50ae42bea5f6d14dda24e6d4687b434c175084515d562e896742b0ba9a1c87d5642e10
+a5550379c71cc490a052ada483b5d96526c0a600fc51755052aa77fdf72f7b4989b920e7
+b90f4b30787a46482670d5caecc7a515a926055ad5509d135702ce51a0e4c1033f2d939d
+8f0075ec3428e17310da37d3d2d7ad1ce99adcc91cd446c366c402ae1ee38250343a7fcc
+0f8bc28020e603d7a4795ef0dcc1c04c030000000000000000020800a46f5b9b1905b494
+57a6485474f71ed9b46c2527e1da08e1f7871e12c3d38828f2076b984a595bf60f616599
+ca5729d547de06a258bfbbcd3094a321e4668cd43010f0ca8ecf931e5d39bda1152c50c3
+67b11c723f270729245d3ebdbd0694d320c5a5aa6a405fb45182acb3d7973cbce398e0c5
+060af7603cfd9ed186ebadd6163b50ae42bea5f6d14dda24e6d4687b434c175084515d56
+2e896742b0ba9a1c87d5642e10a5550379c71cc490a052ada483b5d96526c0a600fc5175
+5052aa77fdf72f7b4989b920e7b90f4b30787a46482670d5caecc7a515a926055ad5509d
+135702ce51a0e4c1033f2d939d8f0075ec3428e17310da37d3d2d7ad1ce99adc'::bytea,
+'\xc7c2d8046965d657020800eef8bf1515adb1a3ee7825f75c668ea8dd3e3f9d13e958f6ad
+9c55adc0c931a4bb00abe1d52cf7bb0c95d537949d277a5292ede375c6b2a67a3bf7d19f
+f975bb7e7be35c2d8300dacba360a0163567372f7dc24000cc7cb6170bedc8f3b1f98c12
+07a6cb4de870a4bc61319b139dcc0e20c368fd68f8fd346d2c0b69c5aed560504e2ec6f1
+23086fe3c5540dc4dd155c0c67257c4ada862f90fe172ace344089da8135e92aca5c2709
+f1c1bc521798bb8c0365841496e709bd184132d387e0c9d5f26dc00fd06c3a76ef66a75c
+138285038684707a847b7bd33cfbefbf1d336be954a8048946af97a66352adef8e8b5ae4
+c4748c6f2510265b7a8267bc370dbb00110100010007ff7e72d4f95d2d39901ac12ca5c5
+18e767e719e72340c3fab51c8c5ab1c40f31db8eaffe43533fa61e2dbca2c3f4396c0847
+e5434756acbb1f68128f4136bb135710c89137d74538908dac77967de9e821c559700dd9
+de5a2727eec1f5d12d5d74869dd1de45ed369d94a8814d23861dd163f8c27744b26b98f0
+239c2e6dd1e3493b8cc976fdc8f9a5e250f715aa4c3d7d5f237f8ee15d242e8fa941d1a0
+ed9550ab632d992a97518d142802cb0a97b251319bf5742db8d9d8cbaa06cdfba2d75bc9
+9d77a51ff20bd5ba7f15d7af6e85b904de2855d19af08d45f39deb85403033c69c767a8e
+74a343b1d6c8911d34ea441ac3850e57808ed3d885835cbe6c79d10400ef16256f3d5c4c
+3341516a2d2aa888df81b603f48a27f3666b40f992a857c1d11ff639cd764a9b42d5a1f8
+58b4aeee36b85508bb5e8b91ef88a7737770b330224479d9b44eae8c631bc43628b69549
+507c0a1af0be0dd7696015abea722b571eb35eefc4ab95595378ec12814727443f625fcd
+183bb9b3bccf53b54dd0e5e7a50400ffe08537b2d4e6074e4a1727b658cfccdec8962302
+25e300c05690de45f7065c3d40d86f544a64d51a3e94424f9851a16d1322ebdb41fa8a45
+3131f3e2dc94e858e6396722643df382680f815e53bcdcde5da622f50530a83b217f1103
+cdd6e5e9babe1e415bbff28d44bd18c95f43bbd04afeb2a2a99af38a571c7540de21df03
+ff62c0a33d9143dd3f639893f47732c11c5a12c6052d1935f4d507b7ae1f76ab0e9a69b8
+7305a7f7c19bd509daf4903bff614bc26d118f03e461469c72c12d3a2bb4f78e4d342ce8
+487723649a01ed2b9eb11c662134502c098d55dfcd361939d8370873422c3da75a515a75
+9ffedfe7df44fb3c20f81650801a30d43b5c90b98b3eee'::bytea);
+ERROR: Public key too big
diff --git a/contrib/pgcrypto/meson.build b/contrib/pgcrypto/meson.build
index c9c48f16f90..4f255c8cb05 100644
--- a/contrib/pgcrypto/meson.build
+++ b/contrib/pgcrypto/meson.build
@@ -52,6 +52,7 @@ pgcrypto_regress = [
'pgp-encrypt-md5',
'pgp-pubkey-decrypt',
'pgp-pubkey-encrypt',
+ 'pgp-pubkey-session',
'pgp-info',
'crypt-shacrypt'
]
diff --git a/contrib/pgcrypto/pgp-pgsql.c b/contrib/pgcrypto/pgp-pgsql.c
index 3e47b9364ab..d3e7895b0d9 100644
--- a/contrib/pgcrypto/pgp-pgsql.c
+++ b/contrib/pgcrypto/pgp-pgsql.c
@@ -631,6 +631,7 @@ pgp_sym_decrypt_text(PG_FUNCTION_ARGS)
arg = PG_GETARG_TEXT_PP(2);
res = decrypt_internal(0, 1, data, key, NULL, arg);
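+ /* Verify that the decrypted result is valid in the database encoding */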
+ pg_verifymbstr(VARDATA_ANY(res), VARSIZE_ANY_EXHDR(res), false);
PG_FREE_IF_COPY(data, 0);
PG_FREE_IF_COPY(key, 1);
@@ -732,6 +733,7 @@ pgp_pub_decrypt_text(PG_FUNCTION_ARGS)
arg = PG_GETARG_TEXT_PP(3);
res = decrypt_internal(1, 1, data, key, psw, arg);
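+ /* Verify that the decrypted result is valid in the database encoding */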
+ pg_verifymbstr(VARDATA_ANY(res), VARSIZE_ANY_EXHDR(res), false);
PG_FREE_IF_COPY(data, 0);
PG_FREE_IF_COPY(key, 1);
diff --git a/contrib/pgcrypto/pgp-pubdec.c b/contrib/pgcrypto/pgp-pubdec.c
index a0a5738a40e..2a13aa3e6ad 100644
--- a/contrib/pgcrypto/pgp-pubdec.c
+++ b/contrib/pgcrypto/pgp-pubdec.c
@@ -157,6 +157,7 @@ pgp_parse_pubenc_sesskey(PGP_Context *ctx, PullFilter *pkt)
uint8 *msg;
int msglen;
PGP_MPI *m;
+ unsigned sess_key_len;
pk = ctx->pub_key;
if (pk == NULL)
@@ -220,11 +221,19 @@ pgp_parse_pubenc_sesskey(PGP_Context *ctx, PullFilter *pkt)
if (res < 0)
goto out;
+ sess_key_len = msglen - 3;
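+
+ /*
+ * ctx->sess_key is a fixed-size buffer of PGP_MAX_KEY bytes, so any
+ * longer session key must be rejected before the memcpy() below.
+ */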
+ if (sess_key_len > PGP_MAX_KEY)
+ {
+ px_debug("incorrect session key length=%u", sess_key_len);
+ res = PXE_PGP_KEY_TOO_BIG;
+ goto out;
+ }
+
/*
* got sesskey
*/
ctx->cipher_algo = *msg;
- ctx->sess_key_len = msglen - 3;
+ ctx->sess_key_len = sess_key_len;
memcpy(ctx->sess_key, msg + 1, ctx->sess_key_len);
out:
diff --git a/contrib/pgcrypto/px.c b/contrib/pgcrypto/px.c
index 4d668d4e496..d9bf1aae81e 100644
--- a/contrib/pgcrypto/px.c
+++ b/contrib/pgcrypto/px.c
@@ -65,6 +65,7 @@ static const struct error_desc px_err_list[] = {
{PXE_PGP_UNEXPECTED_PKT, "Unexpected packet in key data"},
{PXE_PGP_MATH_FAILED, "Math operation failed"},
{PXE_PGP_SHORT_ELGAMAL_KEY, "Elgamal keys must be at least 1024 bits long"},
+ {PXE_PGP_KEY_TOO_BIG, "Public key too big"},
{PXE_PGP_UNKNOWN_PUBALGO, "Unknown public-key encryption algorithm"},
{PXE_PGP_WRONG_KEY, "Wrong key"},
{PXE_PGP_MULTIPLE_KEYS,
diff --git a/contrib/pgcrypto/px.h b/contrib/pgcrypto/px.h
index 4b81fceab8e..a09533a3582 100644
--- a/contrib/pgcrypto/px.h
+++ b/contrib/pgcrypto/px.h
@@ -75,7 +75,7 @@
/* -108 is unused */
#define PXE_PGP_MATH_FAILED -109
#define PXE_PGP_SHORT_ELGAMAL_KEY -110
-/* -111 is unused */
+#define PXE_PGP_KEY_TOO_BIG -111
#define PXE_PGP_UNKNOWN_PUBALGO -112
#define PXE_PGP_WRONG_KEY -113
#define PXE_PGP_MULTIPLE_KEYS -114
diff --git a/contrib/pgcrypto/scripts/pgp_session_data.py b/contrib/pgcrypto/scripts/pgp_session_data.py
new file mode 100644
index 00000000000..999350bb2bc
--- /dev/null
+++ b/contrib/pgcrypto/scripts/pgp_session_data.py
@@ -0,0 +1,491 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# Generate PGP data to check the session key length of the input data provided
+# to pgp_pub_decrypt_bytea().
+#
+# First, the crafted data is generated from valid RSA data, freshly generated
+# by this script each time it is run, see generate_rsa_keypair().
+# Second, the crafted PGP data is built, see build_message_data() and
+# build_key_data(). Finally, the resulting SQL script is generated.
+#
+# This script writes to stdout the SQL file that is used in the regression
+# tests of pgcrypto. The following command can be used to regenerate the
+# file, which should never be edited by hand:
+# python3 scripts/pgp_session_data.py > sql/pgp-pubkey-session.sql
+
+import os
+import re
+import struct
+import secrets
+import sys
+import time
+
+# pwntools, for binary packing helpers (p32)
+from pwn import p32
+
+# Cryptographic libraries, to craft the PGP data.
+from Crypto.Cipher import AES
+from Crypto.PublicKey import RSA
+from Crypto.Util.number import inverse
+
+# AES key used for session key encryption (16 bytes for AES-128)
+AES_KEY = b'\x01' * 16
+
+def generate_rsa_keypair(key_size: int = 2048) -> dict:
+ """
+ Generate a fresh RSA key pair.
+
+ The generated key includes all components needed for PGP operations:
+ - n: public modulus (p * q)
+ - e: public exponent (typically 65537)
+ - d: private exponent (e^-1 mod phi(n))
+ - p, q: prime factors of n
+ - u: coefficient (p^-1 mod q) for CRT optimization
+
+ The caller can pass the desired key size; the default is 2048 bits.
+ This function returns the RSA key components, after performing some
+ validation on them.
+ """
+
+ # Generate RSA key
+ key = RSA.generate(key_size)
+
+ # Extract all key components
+ rsa_components = {
+ 'n': key.n, # Public modulus (p * q)
+ 'e': key.e, # Public exponent (typically 65537)
+ 'd': key.d, # Private exponent (e^-1 mod phi(n))
+ 'p': key.p, # First prime factor
+ 'q': key.q, # Second prime factor
+ 'u': inverse(key.p, key.q) # Coefficient for CRT: p^-1 mod q
+ }
+
+ # Validate key components for correctness
+ validate_rsa_key(rsa_components)
+
+ return rsa_components
+
+def validate_rsa_key(rsa: dict) -> None:
+ """
+ Validate a generated RSA key.
+
+ This function performs basic validation to ensure the RSA key is properly
+ constructed and all components are consistent, at least mathematically.
+
+ Validations performed:
+ 1. n = p * q (modulus is product of primes)
+ 2. gcd(e, phi(n)) = 1 (public exponent is coprime to phi(n))
+ 3. (d * e) mod phi(n) = 1 (private exponent is the multiplicative inverse)
+ 4. (u * p) mod q = 1 (coefficient is correct for CRT)
+ """
+
+ n, e, d, p, q, u = rsa['n'], rsa['e'], rsa['d'], rsa['p'], rsa['q'], rsa['u']
+
+ # Check that n = p * q
+ if n != p * q:
+ raise ValueError("RSA validation failed: n <> p * q")
+
+ # Check that p and q are different
+ if p == q:
+ raise ValueError("RSA validation failed: p = q (not allowed)")
+
+ # Calculate phi(n) = (p-1)(q-1)
+ phi_n = (p - 1) * (q - 1)
+
+ # Check that gcd(e, phi(n)) = 1
+ def gcd(a, b):
+ while b:
+ a, b = b, a % b
+ return a
+
+ if gcd(e, phi_n) != 1:
+ raise ValueError("RSA validation failed: gcd(e, phi(n)) <> 1")
+
+ # Check that (d * e) mod phi(n) = 1
+ if (d * e) % phi_n != 1:
+ raise ValueError("RSA validation failed: d * e <> 1 (mod phi(n))")
+
+ # Check that (u * p) mod q = 1
+ if (u * p) % q != 1:
+ raise ValueError("RSA validation failed: u * p <> 1 (mod q)")
+
+def mpi_encode(x: int) -> bytes:
+ """
+ Encode an integer as an OpenPGP Multi-Precision Integer (MPI).
+
+ Format (RFC 4880, Section 3.2):
+ - 2 bytes: bit length of the integer (big-endian)
+ - N bytes: the integer in big-endian format
+
+ This is used to encode RSA key components (n, e, d, p, q, u) in PGP
+ packets.
+
+ Takes the integer to encode and returns the MPI-encoded bytes.
+
+ For example:
+ mpi_encode(65537) -> b'\x00\x11\x01\x00\x01'
+ (17 bits, value 0x010001)
+ """
+ if x < 0:
+ raise ValueError("MPI cannot encode negative integers")
+
+ if x == 0:
+ # Special case: zero has 0 bits and empty magnitude
+ bits = 0
+ mag = b""
+ else:
+ # Calculate bit length and convert to bytes
+ bits = x.bit_length()
+ mag = x.to_bytes((bits + 7) // 8, 'big')
+
+ # Pack: 2-byte bit length + magnitude bytes
+ return struct.pack('>H', bits) + mag
+
+def new_packet(tag: int, payload: bytes) -> bytes:
+ """
+ Create a new OpenPGP packet with a proper header.
+
+ OpenPGP packet format (RFC 4880, Section 4.2):
+ - New packet format: 0xC0 | tag
+ - Length encoding depends on payload size:
+ * 0-191: single byte
+ * 192-8383: two bytes (192 + ((length - 192) >> 8), (length - 192) & 0xFF)
+ * 8384+: five bytes (0xFF + 4-byte big-endian length)
+
+ The packet is built from a "tag" (1-63) and some "payload" data; the
+ return value is a complete OpenPGP packet.
+
+ For example:
+ new_packet(1, b'data') -> b'\xC1\x04data'
+ (Tag 1, length 4, payload 'data')
+ """
+ # New packet format: set bit 7 and 6, clear bit 5, tag in bits 0-5
+ first = 0xC0 | (tag & 0x3F)
+ ln = len(payload)
+
+ # Encode length according to OpenPGP specification
+ if ln <= 191:
+ # Single byte length for small packets
+ llen = bytes([ln])
+ elif ln <= 8383:
+ # Two-byte length for medium packets
+ ln2 = ln - 192
+ llen = bytes([192 + (ln2 >> 8), ln2 & 0xFF])
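+ # e.g. ln = 200 -> ln2 = 8 -> length header bytes [0xC0, 0x08]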
+ else:
+ # Five-byte length for large packets
+ llen = bytes([255]) + struct.pack('>I', ln)
+
+ return bytes([first]) + llen + payload
+
+def build_key_data(rsa: dict) -> bytes:
+ """
+ Build the key data, containing an RSA private key.
+
+ The RSA contents should have been generated previously.
+
+ Format (see RFC 4880, Section 5.5.3):
+ - 1 byte: version (4)
+ - 4 bytes: creation time (current Unix timestamp)
+ - 1 byte: public key algorithm (2 = RSA encrypt)
+ - MPI: RSA public modulus n
+ - MPI: RSA public exponent e
+ - 1 byte: string-to-key usage (0 = no encryption)
+ - MPI: RSA private exponent d
+ - MPI: RSA prime p
+ - MPI: RSA prime q
+ - MPI: RSA coefficient u = p^-1 mod q
+ - 2 bytes: checksum of private key material
+
+ This function takes a set of RSA key components (n, e, d, p, q, u) as
+ input and returns a secret key packet.
+ """
+
+ # Public key portion
+ ver = bytes([4]) # Version 4 key
+ ctime = struct.pack('>I', int(time.time())) # Current Unix timestamp
+ algo = bytes([2]) # RSA encrypt algorithm
+ n_mpi = mpi_encode(rsa['n']) # Public modulus
+ e_mpi = mpi_encode(rsa['e']) # Public exponent
+ pub = ver + ctime + algo + n_mpi + e_mpi
+
+ # Private key portion
+ hide_type = bytes([0]) # No string-to-key encryption
+ d_mpi = mpi_encode(rsa['d']) # Private exponent
+ p_mpi = mpi_encode(rsa['p']) # Prime p
+ q_mpi = mpi_encode(rsa['q']) # Prime q
+ u_mpi = mpi_encode(rsa['u']) # Coefficient u = p^-1 mod q
+
+ # Calculate checksum of private key material (simple sum mod 65536)
+ private_data = d_mpi + p_mpi + q_mpi + u_mpi
+ cksum = sum(private_data) & 0xFFFF
+
+ secret = hide_type + private_data + struct.pack('>H', cksum)
+ payload = pub + secret
+
+ return new_packet(7, payload)
+
+def pgp_cfb_encrypt_resync(key, plaintext):
+ """
+ Implement OpenPGP CFB mode with resync.
+
+ OpenPGP CFB mode is a variant of standard CFB with a resync operation
+ after the first two blocks.
+
+ Algorithm (RFC 4880, Section 13.9):
+ 1. Block 1: FR=zeros, encrypt full block_size bytes
+ 2. Block 2: FR=block1, encrypt only 2 bytes
+ 3. Resync: FR = block1[2:] + block2
+ 4. Remaining blocks: standard CFB mode
+
+ This function uses the following arguments:
+ - key: AES encryption key (16 bytes for AES-128)
+ - plaintext: Data to encrypt
+ """
+ block_size = 16 # AES block size
+ cipher = AES.new(key[:16], AES.MODE_ECB) # Use ECB for manual CFB
+ ciphertext = b''
+
+ # Block 1: FR=zeros, encrypt full 16 bytes
+ FR = b'\x00' * block_size
+ FRE = cipher.encrypt(FR) # Encrypt the feedback register
+ block1 = bytes(a ^ b for a, b in zip(FRE, plaintext[0:16]))
+ ciphertext += block1
+
+ # Block 2: FR=block1, encrypt only 2 bytes
+ FR = block1
+ FRE = cipher.encrypt(FR)
+ block2 = bytes(a ^ b for a, b in zip(FRE[0:2], plaintext[16:18]))
+ ciphertext += block2
+
+ # Resync: FR = block1[2:16] + block2[0:2]
+ # This is the key difference from standard CFB mode
+ FR = block1[2:] + block2
+
+ # Block 3+: Continue with standard CFB mode
+ pos = 18
+ while pos < len(plaintext):
+ FRE = cipher.encrypt(FR)
+ chunk_len = min(block_size, len(plaintext) - pos)
+ chunk = plaintext[pos:pos+chunk_len]
+ enc_chunk = bytes(a ^ b for a, b in zip(FRE[:chunk_len], chunk))
+ ciphertext += enc_chunk
+
+ # Update feedback register for next iteration
+ if chunk_len == block_size:
+ FR = enc_chunk
+ else:
+ # Partial block: pad with old FR bytes
+ FR = enc_chunk + FR[chunk_len:]
+ pos += chunk_len
+
+ return ciphertext
+
+def build_literal_data_packet(data: bytes) -> bytes:
+ """
+ Build a literal data packet containing a message.
+
+ Format (RFC 4880, Section 5.9):
+ - 1 byte: data format ('b' = binary, 't' = text, 'u' = UTF-8 text)
+ - 1 byte: filename length (0 = no filename)
+ - N bytes: filename (empty in this case)
+ - 4 bytes: date (current Unix timestamp)
+ - M bytes: literal data
+
+ Takes the literal data as input and returns the generated packet.
+ """
+ body = bytes([
+ ord('b'), # Binary data format
+ 0, # Filename length (0 = no filename)
+ ]) + struct.pack('>I', int(time.time())) + data # Current timestamp + data
+
+ return new_packet(11, body)
+
+def build_symenc_data_packet(sess_key: bytes, cipher_algo: int, payload: bytes) -> bytes:
+ """
+ Build a symmetrically-encrypted data packet using AES-128-CFB.
+
+ This packet contains encrypted data using the session key. The format
+ includes a random prefix, for security (see RFC 4880, Section 5.7).
+
+ Packet structure:
+ - Random prefix (block_size bytes)
+ - Prefix repeat (last 2 bytes of prefix repeated)
+ - Encrypted literal data packet
+
+ This function uses the following set of arguments:
+ - sess_key: Session key for encryption
+ - cipher_algo: Cipher algorithm identifier (7 = AES-128)
+ - payload: Data to encrypt (wrapped in literal data packet)
+ """
+ block_size = 16 # AES-128 block size
+ key = sess_key[:16] # Use first 16 bytes for AES-128
+
+ # Create random prefix + repeat last 2 bytes (total 18 bytes)
+ # This is required by OpenPGP for integrity checking
+ prefix_random = secrets.token_bytes(block_size)
+ prefix = prefix_random + prefix_random[-2:] # 18 bytes total
+
+ # Wrap payload in literal data packet
+ literal_pkt = build_literal_data_packet(payload)
+
+ # Plaintext = prefix + literal data packet
+ plaintext = prefix + literal_pkt
+
+ # Encrypt using OpenPGP CFB mode with resync
+ ciphertext = pgp_cfb_encrypt_resync(key, plaintext)
+
+ return new_packet(9, ciphertext)
+
+def build_tag1_packet(rsa: dict, sess_key: bytes) -> bytes:
+ """
+ Build a public-key encrypted session key packet (Tag 1).
+
+ This is the key function of this script: it creates the packet that
+ triggers the overflow check, but it can also be used to create
+ "legit" packet data.
+
+ Format (RFC 4880, Section 5.1):
+ - 1 byte: version (3)
+ - 8 bytes: key ID (0 = any key accepted)
+ - 1 byte: public key algorithm (2 = RSA encrypt)
+ - MPI: RSA-encrypted session key
+
+ This takes as arguments the generated RSA key pair and the session key
+ to encrypt; the latter is manipulated to trigger the overflow.
+
+ This function returns a complete public-key encrypted session key
+ packet.
+ """
+
+ # Calculate RSA modulus size in bytes
+ n_bytes = (rsa['n'].bit_length() + 7) // 8
+
+ # Session key message format:
+ # - 1 byte: symmetric cipher algorithm (7 = AES-128)
+ # - N bytes: session key
+ # - 2 bytes: checksum (simple sum of session key bytes)
+ algo_byte = bytes([7]) # AES-128 algorithm identifier
+ cksum = sum(sess_key) & 0xFFFF # 16-bit checksum
+ M = algo_byte + sess_key + struct.pack('>H', cksum)
+
+ # PKCS#1 v1.5 padding construction
+ # Format: 0x02 || PS || 0x00 || M
+ # Total padded message must be exactly n_bytes long.
+ total_len = n_bytes # Total length must equal modulus size in bytes
+ ps_len = total_len - len(M) - 2 # Subtract 2 for 0x02 and 0x00 bytes
+
+ if ps_len < 8:
+ raise ValueError(f"Padding string too short ({ps_len} bytes); need at least 8 bytes. "
+ f"Message length: {len(M)}, Modulus size: {n_bytes} bytes")
+
+ # Create a padding string with *ALL* bytes being 0xFF (PS must not
+ # contain any zero byte)
+ PS = bytes([0xFF]) * ps_len
+
+ # Construct the complete padded message
+ # Normal PKCS#1 v1.5 padding: 0x02 || PS || 0x00 || M
+ padded = bytes([0x02]) + PS + bytes([0x00]) + M
+
+ # Verify padding construction
+ if len(padded) != n_bytes:
+ raise ValueError(f"Padded message length ({len(padded)}) doesn't match RSA modulus size ({n_bytes})")
+
+ # Convert padded message to integer and encrypt with RSA
+ m_int = int.from_bytes(padded, 'big')
+
+ # Ensure message is smaller than modulus (required for RSA)
+ if m_int >= rsa['n']:
+ raise ValueError("Padded message is larger than RSA modulus")
+
+ # RSA encryption: c = m^e mod n
+ c_int = pow(m_int, rsa['e'], rsa['n'])
+
+ # Encode encrypted result as MPI
+ c_mpi = mpi_encode(c_int)
+
+ # Build complete packet
+ ver = bytes([3]) # Version 3 packet
+ key_id = b"\x00" * 8 # Key ID (0 = any key accepted)
+ algo = bytes([2]) # RSA encrypt algorithm
+ payload = ver + key_id + algo + c_mpi
+
+ return new_packet(1, payload)
+
+def build_message_data(rsa: dict) -> bytes:
+ """
+ This function creates a crafted message with an overlong session key
+ length.
+
+ It takes as input the RSA key components generated previously and
+ returns a concatenated set of PGP packets crafted for the purpose of
+ this test.
+ """
+
+ # Base prefix for session key (AES key + padding + size).
+ # Note that the crafted size is the important part for this test.
+ prefix = AES_KEY + b"\x00" * 16 + p32(0x10)
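+ # (pwntools' p32() packs 0x10 as a 4-byte little-endian value)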
+
+ # Build encrypted data packet, legit.
+ sedata = build_symenc_data_packet(AES_KEY, cipher_algo=7, payload=b"\x0a\x00")
+
+ # Build multiple packets
+ packets = [
+ # First packet, legit.
+ build_tag1_packet(rsa, prefix),
+
+ # Encrypted data packet, legit.
+ sedata,
+
+ # Second packet: the crafted payload.
+ #
+ # This packet contains the crafted session key that triggers the
+ # overflow check in pgcrypto. This is the critical part of the
+ # generated data.
+ build_tag1_packet(rsa, prefix)
+ ]
+
+ return b"".join(packets)
+
+def main():
+ # Default key size, in bits.
+ # A larger size, such as 4096, would also work; it is just not
+ # needed here.
+ key_size = 2048
+
+ # Generate fresh RSA key pair
+ rsa = generate_rsa_keypair(key_size)
+
+ # Generate the message data.
+ print("### Building message data", file=sys.stderr)
+ message_data = build_message_data(rsa)
+
+ # Build the key containing the RSA private key
+ print("### Building key data", file=sys.stderr)
+ key_data = build_key_data(rsa)
+
+ # Convert to hexadecimal, for the bytea used in the SQL file.
+ message_data = message_data.hex()
+ key_data = key_data.hex()
+
+ # Split each value into lines of 72 characters, for readability.
+ message_data = re.sub("(.{72})", "\\1\n", message_data, 0, re.DOTALL)
+ key_data = re.sub("(.{72})", "\\1\n", key_data, 0, re.DOTALL)
+
+ # Get the script filename for documentation
+ file_basename = os.path.basename(__file__)
+
+ # Output the SQL test case
+ print(f'''-- Test for overflow with session key at decrypt.
+-- Data automatically generated by scripts/{file_basename}.
+-- See this file for details explaining how this data is generated.
+SELECT pgp_pub_decrypt_bytea(
+'\\x{message_data}'::bytea,
+'\\x{key_data}'::bytea);''',
+ file=sys.stdout)
+
+if __name__ == "__main__":
+ main()
diff --git a/contrib/pgcrypto/sql/pgp-decrypt.sql b/contrib/pgcrypto/sql/pgp-decrypt.sql
index 49a0267bbcb..b499bf757b0 100644
--- a/contrib/pgcrypto/sql/pgp-decrypt.sql
+++ b/contrib/pgcrypto/sql/pgp-decrypt.sql
@@ -228,7 +228,7 @@ SaV9L04ky1qECNDx3XjnoKLC+H7IOQ==
-----END PGP MESSAGE-----
'), '0123456789abcdefghij'), 'sha1');
-select digest(pgp_sym_decrypt(dearmor('
+select digest(pgp_sym_decrypt_bytea(dearmor('
-----BEGIN PGP MESSAGE-----
Comment: dat3.aes.sha1.mdc.s2k3.z0
@@ -282,6 +282,27 @@ VsxxqLSPzNLAeIspJk5G
-- Routine text/binary mismatch.
select pgp_sym_decrypt(pgp_sym_encrypt_bytea('P', 'key'), 'key', 'debug=1');
+-- NUL byte in text decrypt. Ciphertext source:
+-- printf 'a\x00\xc' | gpg --homedir /nonexistent \
+-- --personal-compress-preferences uncompressed --textmode \
+-- --personal-cipher-preferences aes --no-emit-version --batch \
+-- --symmetric --passphrase key --armor
+do $$
+begin
+ perform pgp_sym_decrypt(dearmor('
+-----BEGIN PGP MESSAGE-----
+
+jA0EBwMCXLc8pozB10Fg0jQBVUID59TLvWutJp0j6eh9ZgjqIRzdYaIymFB8y4XH
+vu0YlJP5D5BX7yqZ+Pry7TlDmiFO
+=rV7z
+-----END PGP MESSAGE-----
+'), 'key', 'debug=1');
+exception when others then
+ raise '%',
+ regexp_replace(sqlerrm, 'encoding "[^"]*"', 'encoding [REDACTED]');
+end
+$$;
+
-- Decryption with a certain incorrect key yields an apparent BZip2-compressed
-- plaintext. Ciphertext source: iterative pgp_sym_encrypt('secret', 'key')
-- until the random prefix gave rise to that property.
diff --git a/contrib/pgcrypto/sql/pgp-pubkey-session.sql b/contrib/pgcrypto/sql/pgp-pubkey-session.sql
new file mode 100644
index 00000000000..51792f1f4d8
--- /dev/null
+++ b/contrib/pgcrypto/sql/pgp-pubkey-session.sql
@@ -0,0 +1,46 @@
+-- Test for overflow with session key at decrypt.
+-- Data automatically generated by scripts/pgp_session_data.py.
+-- See this file for details explaining how this data is generated.
+SELECT pgp_pub_decrypt_bytea(
+'\xc1c04c030000000000000000020800a46f5b9b1905b49457a6485474f71ed9b46c2527e1
+da08e1f7871e12c3d38828f2076b984a595bf60f616599ca5729d547de06a258bfbbcd30
+94a321e4668cd43010f0ca8ecf931e5d39bda1152c50c367b11c723f270729245d3ebdbd
+0694d320c5a5aa6a405fb45182acb3d7973cbce398e0c5060af7603cfd9ed186ebadd616
+3b50ae42bea5f6d14dda24e6d4687b434c175084515d562e896742b0ba9a1c87d5642e10
+a5550379c71cc490a052ada483b5d96526c0a600fc51755052aa77fdf72f7b4989b920e7
+b90f4b30787a46482670d5caecc7a515a926055ad5509d135702ce51a0e4c1033f2d939d
+8f0075ec3428e17310da37d3d2d7ad1ce99adcc91cd446c366c402ae1ee38250343a7fcc
+0f8bc28020e603d7a4795ef0dcc1c04c030000000000000000020800a46f5b9b1905b494
+57a6485474f71ed9b46c2527e1da08e1f7871e12c3d38828f2076b984a595bf60f616599
+ca5729d547de06a258bfbbcd3094a321e4668cd43010f0ca8ecf931e5d39bda1152c50c3
+67b11c723f270729245d3ebdbd0694d320c5a5aa6a405fb45182acb3d7973cbce398e0c5
+060af7603cfd9ed186ebadd6163b50ae42bea5f6d14dda24e6d4687b434c175084515d56
+2e896742b0ba9a1c87d5642e10a5550379c71cc490a052ada483b5d96526c0a600fc5175
+5052aa77fdf72f7b4989b920e7b90f4b30787a46482670d5caecc7a515a926055ad5509d
+135702ce51a0e4c1033f2d939d8f0075ec3428e17310da37d3d2d7ad1ce99adc'::bytea,
+'\xc7c2d8046965d657020800eef8bf1515adb1a3ee7825f75c668ea8dd3e3f9d13e958f6ad
+9c55adc0c931a4bb00abe1d52cf7bb0c95d537949d277a5292ede375c6b2a67a3bf7d19f
+f975bb7e7be35c2d8300dacba360a0163567372f7dc24000cc7cb6170bedc8f3b1f98c12
+07a6cb4de870a4bc61319b139dcc0e20c368fd68f8fd346d2c0b69c5aed560504e2ec6f1
+23086fe3c5540dc4dd155c0c67257c4ada862f90fe172ace344089da8135e92aca5c2709
+f1c1bc521798bb8c0365841496e709bd184132d387e0c9d5f26dc00fd06c3a76ef66a75c
+138285038684707a847b7bd33cfbefbf1d336be954a8048946af97a66352adef8e8b5ae4
+c4748c6f2510265b7a8267bc370dbb00110100010007ff7e72d4f95d2d39901ac12ca5c5
+18e767e719e72340c3fab51c8c5ab1c40f31db8eaffe43533fa61e2dbca2c3f4396c0847
+e5434756acbb1f68128f4136bb135710c89137d74538908dac77967de9e821c559700dd9
+de5a2727eec1f5d12d5d74869dd1de45ed369d94a8814d23861dd163f8c27744b26b98f0
+239c2e6dd1e3493b8cc976fdc8f9a5e250f715aa4c3d7d5f237f8ee15d242e8fa941d1a0
+ed9550ab632d992a97518d142802cb0a97b251319bf5742db8d9d8cbaa06cdfba2d75bc9
+9d77a51ff20bd5ba7f15d7af6e85b904de2855d19af08d45f39deb85403033c69c767a8e
+74a343b1d6c8911d34ea441ac3850e57808ed3d885835cbe6c79d10400ef16256f3d5c4c
+3341516a2d2aa888df81b603f48a27f3666b40f992a857c1d11ff639cd764a9b42d5a1f8
+58b4aeee36b85508bb5e8b91ef88a7737770b330224479d9b44eae8c631bc43628b69549
+507c0a1af0be0dd7696015abea722b571eb35eefc4ab95595378ec12814727443f625fcd
+183bb9b3bccf53b54dd0e5e7a50400ffe08537b2d4e6074e4a1727b658cfccdec8962302
+25e300c05690de45f7065c3d40d86f544a64d51a3e94424f9851a16d1322ebdb41fa8a45
+3131f3e2dc94e858e6396722643df382680f815e53bcdcde5da622f50530a83b217f1103
+cdd6e5e9babe1e415bbff28d44bd18c95f43bbd04afeb2a2a99af38a571c7540de21df03
+ff62c0a33d9143dd3f639893f47732c11c5a12c6052d1935f4d507b7ae1f76ab0e9a69b8
+7305a7f7c19bd509daf4903bff614bc26d118f03e461469c72c12d3a2bb4f78e4d342ce8
+487723649a01ed2b9eb11c662134502c098d55dfcd361939d8370873422c3da75a515a75
+9ffedfe7df44fb3c20f81650801a30d43b5c90b98b3eee'::bytea);
diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out
index 6066510c7c0..7cad5e67d09 100644
--- a/contrib/postgres_fdw/expected/postgres_fdw.out
+++ b/contrib/postgres_fdw/expected/postgres_fdw.out
@@ -698,12 +698,12 @@ EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c1 = -c1; -- Op
Remote SQL: SELECT "C 1", c2, c3, c4, c5, c6, c7, c8 FROM "S 1"."T 1" WHERE (("C 1" = (- "C 1")))
(3 rows)
-EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE (c3 IS NOT NULL) IS DISTINCT FROM (c3 IS NOT NULL); -- DistinctExpr
- QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------------------
+EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c3 IS DISTINCT FROM c3; -- DistinctExpr
+ QUERY PLAN
+----------------------------------------------------------------------------------------------------------
Foreign Scan on public.ft1 t1
Output: c1, c2, c3, c4, c5, c6, c7, c8
- Remote SQL: SELECT "C 1", c2, c3, c4, c5, c6, c7, c8 FROM "S 1"."T 1" WHERE (((c3 IS NOT NULL) IS DISTINCT FROM (c3 IS NOT NULL)))
+ Remote SQL: SELECT "C 1", c2, c3, c4, c5, c6, c7, c8 FROM "S 1"."T 1" WHERE ((c3 IS DISTINCT FROM c3))
(3 rows)
EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c1 = ANY(ARRAY[c2, 1, c1 + 0]); -- ScalarArrayOpExpr
diff --git a/contrib/postgres_fdw/sql/postgres_fdw.sql b/contrib/postgres_fdw/sql/postgres_fdw.sql
index 4f7ab2ed0ac..eff25bd2baa 100644
--- a/contrib/postgres_fdw/sql/postgres_fdw.sql
+++ b/contrib/postgres_fdw/sql/postgres_fdw.sql
@@ -340,7 +340,7 @@ EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c3 IS NULL; -- Nu
EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c3 IS NOT NULL; -- NullTest
EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE round(abs(c1), 0) = 1; -- FuncExpr
EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c1 = -c1; -- OpExpr(l)
-EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE (c3 IS NOT NULL) IS DISTINCT FROM (c3 IS NOT NULL); -- DistinctExpr
+EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c3 IS DISTINCT FROM c3; -- DistinctExpr
EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c1 = ANY(ARRAY[c2, 1, c1 + 0]); -- ScalarArrayOpExpr
EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c1 = (ARRAY[c1,c2,3])[1]; -- SubscriptingRef
EXPLAIN (VERBOSE, COSTS OFF) SELECT * FROM ft1 t1 WHERE c6 = E'foo''s\\bar'; -- check special chars
diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c
index e25c8a5aa26..69b173e4498 100644
--- a/contrib/unaccent/unaccent.c
+++ b/contrib/unaccent/unaccent.c
@@ -156,7 +156,7 @@ initTrie(const char *filename)
state = 0;
for (ptr = line; *ptr; ptr += ptrlen)
{
- ptrlen = pg_mblen(ptr);
+ ptrlen = pg_mblen_cstr(ptr);
/* ignore whitespace, but end src or trg */
if (isspace((unsigned char) *ptr))
{
@@ -382,6 +382,7 @@ unaccent_lexize(PG_FUNCTION_ARGS)
char *srcchar = (char *) PG_GETARG_POINTER(1);
int32 len = PG_GETARG_INT32(2);
char *srcstart = srcchar;
+ const char *srcend = srcstart + len;
TSLexeme *res;
StringInfoData buf;
@@ -409,7 +410,7 @@ unaccent_lexize(PG_FUNCTION_ARGS)
}
else
{
- matchlen = pg_mblen(srcchar);
+ matchlen = pg_mblen_range(srcchar, srcend);
if (buf.data != NULL)
appendBinaryStringInfo(&buf, srcchar, matchlen);
}
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 5560b95ee60..37342986969 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2412,6 +2412,43 @@ include_dir 'conf.d'
+
+ file_extend_method (enum)
+
+ file_extend_method configuration parameter
+
+
+
+
+ Specifies the method used to extend data files during bulk operations
+ such as COPY. The default is the first of the
+ following options that is available on the operating system:
+
+
+
+ posix_fallocate (Unix) uses the standard POSIX
+ interface for allocating disk space, but is missing on some systems.
+ If it is present but the underlying file system doesn't support it,
+ this option silently falls back to write_zeros.
+ Current versions of BTRFS are known to disable compression when
+ this option is used.
+ This is the default on systems that have the function.
+
+
+
+
+ write_zeros extends files by writing out blocks
+ of zero bytes. This is the default on systems that don't have the
+ function posix_fallocate.
+
+
+
+ The write_zeros method is always used when data
+ files are extended by 8 blocks or fewer.
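+
+ For example, to force zero-filling on a file system where
+ posix_fallocate behaves undesirably, file_extend_method
+ could be set to write_zeros in
+ postgresql.conf.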
+
+
+
+
max_notify_queue_pages (integer)
@@ -4722,45 +4759,6 @@ restore_command = 'copy "C:\\server\\archivedir\\%f" "%p"' # Windows
-
- synchronized_standby_slots (string)
-
- synchronized_standby_slots configuration parameter
-
-
-
-
- A comma-separated list of streaming replication standby server slot names
- that logical WAL sender processes will wait for. Logical WAL sender processes
- will send decoded changes to plugins only after the specified replication
- slots confirm receiving WAL. This guarantees that logical replication
- failover slots do not consume changes until those changes are received
- and flushed to corresponding physical standbys. If a
- logical replication connection is meant to switch to a physical standby
- after the standby is promoted, the physical replication slot for the
- standby should be listed here. Note that logical replication will not
- proceed if the slots specified in the
- synchronized_standby_slots do not exist or are invalidated.
- Additionally, the replication management functions
-
- pg_replication_slot_advance,
-
- pg_logical_slot_get_changes, and
-
- pg_logical_slot_peek_changes,
- when used with logical failover slots, will block until all
- physical slots specified in synchronized_standby_slots have
- confirmed WAL receipt.
-
-
- The standbys corresponding to the physical replication slots in
- synchronized_standby_slots must configure
- sync_replication_slots = true so they can receive
- logical failover slot changes from the primary.
-
-
-
-
@@ -4909,6 +4907,45 @@ ANY num_sync (
+ synchronized_standby_slots (string)
+
+ synchronized_standby_slots configuration parameter
+
+
+
+
+ A comma-separated list of streaming replication standby server slot names
+ that logical WAL sender processes will wait for. Logical WAL sender processes
+ will send decoded changes to plugins only after the specified replication
+ slots confirm receiving WAL. This guarantees that logical replication
+ failover slots do not consume changes until those changes are received
+ and flushed to corresponding physical standbys. If a
+ logical replication connection is meant to switch to a physical standby
+ after the standby is promoted, the physical replication slot for the
+ standby should be listed here. Note that logical replication will not
+ proceed if the slots specified in the
+ synchronized_standby_slots do not exist or are invalidated.
+ Additionally, the replication management functions
+
+ pg_replication_slot_advance,
+
+ pg_logical_slot_get_changes, and
+
+ pg_logical_slot_peek_changes,
+ when used with logical failover slots, will block until all
+ physical slots specified in synchronized_standby_slots have
+ confirmed WAL receipt.
+
+
+ The standbys corresponding to the physical replication slots in
+ synchronized_standby_slots must configure
+ sync_replication_slots = true so they can receive
+ logical failover slot changes from the primary.
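+
+ For example, if two standbys stream from physical replication slots
+ named standby1_slot and standby2_slot (illustrative
+ names), one could set synchronized_standby_slots =
+ 'standby1_slot, standby2_slot'.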
+
+
@@ -7083,27 +7120,57 @@ local0.* /var/log/postgresql
- log_min_messages (enum)
+ log_min_messages (string)
log_min_messages configuration parameter
- Controls which message
- levels are written to the server log.
- Valid values are DEBUG5, DEBUG4,
- DEBUG3, DEBUG2, DEBUG1,
- INFO, NOTICE, WARNING,
- ERROR, LOG, FATAL, and
- PANIC. Each level includes all the levels that
- follow it. The later the level, the fewer messages are sent
- to the log. The default is WARNING. Note that
- LOG has a different rank here than in
+ Controls which
+ message levels
+ are written to the server log. The value is a comma-separated
+ list of zero or more
+ process type:level
+ entries and exactly one mandatory
+ level entry,
+ which becomes the default for process types not listed.
+ Valid process types are listed in the table below.
+
+ archiver
+ autovacuum
+ backend
+ bgworker
+ bgwriter
+ checkpointer
+ ioworker
+ postmaster
+ syslogger
+ slotsyncworker
+ startup
+ walreceiver
+ walsender
+ walsummarizer
+ walwriter
+
+ Valid level values are DEBUG5,
+ DEBUG4, DEBUG3, DEBUG2,
+ DEBUG1, INFO, NOTICE,
+ WARNING, ERROR, LOG,
+ FATAL, and PANIC. Each level includes
+ all the levels that follow it. The later the level, the fewer messages are sent
+ to the log. The default is WARNING, which
+ applies that level to all process types.
+ Note that LOG has a different rank here than in
.
Only superusers and users with the appropriate SET
privilege can change this setting.
+
+ Example: To log walsender and autovacuum
+ at level DEBUG1 and everything else at ERROR,
+ set log_min_messages to error, walsender:debug1, autovacuum:debug1.
+
diff --git a/doc/src/sgml/contrib.sgml b/doc/src/sgml/contrib.sgml
index 24b706b29ad..bdd4865f53f 100644
--- a/doc/src/sgml/contrib.sgml
+++ b/doc/src/sgml/contrib.sgml
@@ -156,6 +156,7 @@ CREATE EXTENSION extension_name;
&pgfreespacemap;
&pglogicalinspect;
&pgoverexplain;
+ &pgplanadvice;
&pgprewarm;
&pgrowlocks;
&pgstatstatements;
diff --git a/doc/src/sgml/filelist.sgml b/doc/src/sgml/filelist.sgml
index ac66fcbdb57..d90b4338d2a 100644
--- a/doc/src/sgml/filelist.sgml
+++ b/doc/src/sgml/filelist.sgml
@@ -149,6 +149,7 @@
+
diff --git a/doc/src/sgml/oid2name.sgml b/doc/src/sgml/oid2name.sgml
index 54cc9be2b82..9340d7376aa 100644
--- a/doc/src/sgml/oid2name.sgml
+++ b/doc/src/sgml/oid2name.sgml
@@ -118,7 +118,7 @@
display more information about each object shown: tablespace name,
- schema name, and OID.
+ schema name, OID, and path.
@@ -299,10 +299,10 @@ From database "alvherre":
$ # you can mix the options, and get more details with -x
$ oid2name -d alvherre -t accounts -f 1155291 -x
From database "alvherre":
- Filenode Table Name Oid Schema Tablespace
-------------------------------------------------------
- 155173 accounts 155173 public pg_default
- 1155291 accounts_pkey 1155291 public pg_default
+ Filenode Table Name Oid Schema Tablespace Path
+--------------------------------------------------------------------------
+ 155173 accounts 155173 public pg_default base/17228/155173
+ 1155291 accounts_pkey 1155291 public pg_default base/17228/1155291
$ # show disk space for every db object
$ du [0-9]* |
diff --git a/doc/src/sgml/pgplanadvice.sgml b/doc/src/sgml/pgplanadvice.sgml
new file mode 100644
index 00000000000..a5f605b3f19
--- /dev/null
+++ b/doc/src/sgml/pgplanadvice.sgml
@@ -0,0 +1,969 @@
+
+
+
+ pg_plan_advice — help the planner get the right plan
+
+
+ pg_plan_advice
+
+
+
+ The pg_plan_advice module allows key planner decisions to be
+ described, reproduced, and altered using a special-purpose "plan advice"
+ mini-language. It is intended to allow stabilization of plan choices that
+ the user believes to be good, as well as experimentation with plans that
+ the planner believes to be non-optimal.
+
+
+
+ Note that, since the planner often makes good decisions, overriding its
+ judgement can easily backfire. For example, if the distribution of the
+ underlying data changes, the planner normally has the option to adjust the
+ plan in an attempt to preserve good performance. If the plan advice prevents
+ this, a very poor plan may be chosen. It is important to use plan advice
+ only when the risks of constraining the planner's choices are outweighed by
+ the benefits.
+
+
+
+ Getting Started
+
+
+ In order to use this module, the pg_plan_advice module
+ must be loaded. You can do this on a system-wide basis by adding
+ pg_plan_advice to
+ shared_preload_libraries and restarting the
+ server, or by adding it to
+ session_preload_libraries and starting a new session,
+ or by loading it into an individual session using the
+ LOAD command. If you
+ wish to use the
+ collector interface,
+ you must also install the pg_plan_advice extension
+ in the database where you wish to use the collector. Use the command
+ CREATE EXTENSION pg_plan_advice to do this. If you do
+ not wish to use the collector interface, this step is not required.
+
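+ For example, to load the module into the current session and also
+ install the collector functions in the current database:
+
+LOAD 'pg_plan_advice';
+CREATE EXTENSION pg_plan_advice;
+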
+
+
+ Once the pg_plan_advice module is loaded,
+ EXPLAIN will support
+ a PLAN_ADVICE option. You can use this option to see
+ a plan advice string for the chosen plan. For example:
+
+
+
+EXPLAIN (COSTS OFF, PLAN_ADVICE)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+ QUERY PLAN
+------------------------------------
+ Hash Join
+ Hash Cond: (f.dim_id = d.id)
+ -> Seq Scan on join_fact f
+ -> Hash
+ -> Seq Scan on join_dim d
+ Generated Plan Advice:
+ JOIN_ORDER(f d)
+ HASH_JOIN(d)
+ SEQ_SCAN(f d)
+ NO_GATHER(f d)
+
+
+
+ In this example, the user has not specified any advice; instead, the
+ planner has been permitted to make whatever decisions it thinks best, and
+ those decisions are memorialized in the form of an advice string.
+ JOIN_ORDER(f d) means that f should
+ be the driving table, and the first table to which it should be joined is
+ d. HASH_JOIN(d) means that
+ d should appear on the inner side of a hash join.
+ SEQ_SCAN(f d) means that both f
+ and d should be accessed via a sequential scan.
+ NO_GATHER(f d) means that neither f
+ nor d should appear beneath a Gather
+ or Gather Merge node. For more details on the plan
+ advice mini-language, see the information on
+ advice targets and
+ advice tags, below.
+
+
+
+ If you want to see the advice strings for a large number of queries, or
+ an entire workload, running EXPLAIN (PLAN_ADVICE) for
+ each one may not be convenient. In such situations, it can be more
+ convenient to use an
+ advice collector.
+
+
+
+ Once you have an advice string for a query, you can use it to control how
+ that query is planned. You can do this by setting
+ pg_plan_advice.advice to the advice string you've
+ chosen. This can be an advice string that was generated by the system,
+ or one you've written yourself. One good way of creating your own advice
+ string is to take the string generated by the system and pick out just
+ those elements that you wish to enforce. In the example above,
+ pg_plan_advice emits advice for the join order, the
+ join method, the scan method, and the use of parallelism, but you might
+ only want to control the join order:
+
+
+
+SET pg_plan_advice.advice = 'JOIN_ORDER(f d)';
+EXPLAIN (COSTS OFF)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+ QUERY PLAN
+------------------------------------
+ Hash Join
+ Hash Cond: (f.dim_id = d.id)
+ -> Seq Scan on join_fact f
+ -> Hash
+ -> Seq Scan on join_dim d
+ Supplied Plan Advice:
+ JOIN_ORDER(f d) /* matched */
+
+
+
+ Since the PLAN_ADVICE option to
+ EXPLAIN was not specified, no advice string is generated
+ for the plan. However, the supplied plan advice is still shown so that
+ anyone looking at the EXPLAIN output knows that the
+ chosen plan was influenced by plan advice. If information about supplied
+ plan advice is not desired, it can be suppressed by configuring
+ pg_plan_advice.always_explain_supplied_advice = false.
+ For each piece of supplied advice, the output shows
+ advice feedback indicating
+ whether or not the advice was successfully applied to the query. In this
+ case, the feedback says /* matched */, which means that
+ f and d were found in the query and
+ that the resulting query plan conforms to the specified advice.
+
+
+
+
+
+ How It Works
+
+
+ Plan advice is written imperatively; that is, it specifies what should be
+ done. However, at an implementation level,
+ pg_plan_advice works by telling the core planner what
+ should not be done. In other words, it operates by constraining the
+ planner's choices, not by replacing it. Therefore, no matter what advice
+ you provide, you will only ever get a plan that the core planner would have
+ considered for the query in question. If you attempt to force what you
+ believe to be the correct plan by supplying an advice string, and the
+ planner still fails to produce the desired plan, this means that either
+ there is a bug in your advice string, or the plan in question was not
+ considered viable by the core planner. This commonly happens for one of two
+ reasons. First, it might be that the planner believes the plan you're trying
+ to force would be semantically incorrect - that is, it would produce the
+ wrong results - and for that reason it wasn't considered. Second, it might
+ be that the planner rejected the plan you were hoping to generate on some
+ grounds other than cost. For example, given a very simple query such as
+ SELECT * FROM some_table, the query planner will
+ decide that the use of an index is worthless here before it performs any
+ costing calculations. You cannot force it to use an index for this query
+ even if you set enable_seqscan = false, and you can't
+ force it to use an index using plan advice, either.
+
+
+
+ Specifying plan advice should never cause planner failure. However, if you
+ specify plan advice that asks for something impossible, you may get a plan
+ where some plan nodes are flagged as Disabled: true in
+ the EXPLAIN output. In some cases, such plans will be
+ basically the same plan you would have gotten with no supplied advice at
+ all, but in other cases, they may be much worse. For example:
+
+
+
+SET pg_plan_advice.advice = 'JOIN_ORDER(x f d)';
+EXPLAIN (COSTS OFF)
+ SELECT * FROM join_fact f JOIN join_dim d ON f.dim_id = d.id;
+ QUERY PLAN
+----------------------------------------------------
+ Nested Loop
+ Disabled: true
+ -> Seq Scan on join_fact f
+ -> Index Scan using join_dim_pkey on join_dim d
+ Index Cond: (id = f.dim_id)
+ Supplied Plan Advice:
+ JOIN_ORDER(x f d) /* partially matched */
+
+
+
+ Because neither f nor d is the
+ first table in the JOIN_ORDER() specification, the
+ planner disables all direct joins between the two of them, thinking that
+ the join to x should happen first. Since planning isn't
+ allowed to fail, a disabled plan between the two tables is eventually
+ selected anyway, but here it's a Nested Loop rather than
+ the Hash Join that was chosen in the above example where
+ no advice was specified. There are several different ways that this kind
+ of thing can happen; when it does, the resulting plan is generally worse
+ than if no advice had been specified at all. Therefore, it is a good idea
+ to verify that the advice you supply actually matches the query and that
+ the resulting plan is as expected.
+
+
+
+
+
+ Advice Targets
+
+
+ An advice target uniquely identifies a particular
+ instance of a particular table involved in a particular query. In simple
+ cases, such as the examples shown above, the advice target is simply the
+ relation alias. However, a more complex syntax is required when subqueries
+ are used, when tables are partitioned, or when the same relation alias is
+ mentioned more than once in the same subquery (e.g., (foo JOIN bar
+ ON foo.a = bar.a) x JOIN foo ON x.b = foo.b). Any combination of
+ these three things can occur simultaneously: a relation could be mentioned
+ more than once, be partitioned, and be used inside of a subquery.
+
+
+
+ Because of this, the general syntax for a relation identifier is:
+
+
+
+alias_name#occurrence_number/partition_schema.partition_name@plan_name
+
+
+
+ All components except for the alias_name are optional
+ and are included only when required. When a component is omitted, the
+ preceding punctuation must also be omitted. For the first occurrence of a
+ table within a given subquery, generated advice will omit the occurrence
+ number, but it is legal to write #1, if desired. The
+ partition schema and partition name are included only for children of
+ partitioned tables. In generated advice, pg_plan_advice
+ always includes both, but it is legal to omit the schema. The plan name is
+ omitted for the top-level plan, and must be included for any subplan.
+
+
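+ For illustration, each of the following is a syntactically valid
+ advice target under these rules (the alias, partition, and subplan
+ names are hypothetical):
+
+t
+t#2
+t/public.t_part1
+t#2/public.t_part1@subplan_1
+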
+
+ It is not always easy to determine the correct advice target by examining
+ the query. For instance, if the planner pulls up a subquery into the parent
+ query level, everything inside of it becomes part of the parent query level,
+ and uses the parent query's subplan name (or no subplan name, if pulled up
+ to the top level). Furthermore, the correct subquery name is sometimes not
+ obvious. For example, when two queries are joined using an operation such as
+ UNION or INTERSECT, no name for the
+ subqueries is present in the SQL syntax; instead, a system-generated name is
+ assigned to each branch. The easiest way to discover the proper advice
+ targets is to use EXPLAIN (PLAN_ADVICE) and examine the
+ generated advice.
+
+
+
+
+
+ Advice Tags
+
+
+ An advice tag specifies a particular behavior that
+ should be enforced for some portion of the query, such as a particular
+ join order or join method. All advice tags take
+ advice targets as arguments,
+ and many allow lists of advice targets, which in some cases can be nested
+ multiple levels deep. Several different classes of advice tags exist,
+ each controlling a different aspect of query planning.
+
+
+
+ Scan Method Advice
+
+SEQ_SCAN(target [ ... ])
+TID_SCAN(target [ ... ])
+INDEX_SCAN(target index_name [ ... ])
+INDEX_ONLY_SCAN(target index_name [ ... ])
+FOREIGN_SCAN((target [ ... ]) [ ... ])
+BITMAP_HEAP_SCAN(target [ ... ])
+
+
+ SEQ_SCAN specifies that each target table should be
+ scanned using a Seq Scan. TID_SCAN
+ specifies that each target table should be scanned using a
+ TID Scan or TID Range Scan.
+ BITMAP_HEAP_SCAN specifies that each target table
+ should be scanned using a Bitmap Heap Scan.
+
+
+
+ INDEX_SCAN specifies that each target table should
+ be scanned using an Index Scan on the given index
+ name. INDEX_ONLY_SCAN is similar, but specifies the
+ use of an Index Only Scan. In either case, the index
+ name can be, but does not have to be, schema-qualified.
+
+
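+ For example, reusing the tables from the earlier examples, one could
+ request a sequential scan of f and an index scan of
+ d (this assumes an index named
+ join_dim_pkey exists):
+
+SET pg_plan_advice.advice = 'SEQ_SCAN(f) INDEX_SCAN(d join_dim_pkey)';
+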
+
+ FOREIGN_SCAN specifies that a join between two or
+ more foreign tables should be pushed down to a remote server so
+ that it can be implemented as a single Foreign Scan.
+ Specifying FOREIGN_SCAN for a single foreign table is
+ neither necessary nor permissible: a Foreign Scan will
+ need to be used regardless. If you want to prevent a join from being
+ pushed down, consider using the JOIN_ORDER tag for
+ that purpose.
+
+
+
+ The planner supports many types of scans other than those listed here;
+ however, in most of those cases, there is no meaningful decision to be
+ made, and hence no need for advice. For example, the output of a
+ set-returning function that appears in the FROM clause
+ can only ever be scanned using a Function Scan, so
+ there is no opportunity for advice to change anything.
+
+
+
+
+
+ Join Order Advice
+
+JOIN_ORDER(join_order_item [ ... ])
+
+where join_order_item is:
+
+advice_target |
+( join_order_item [ ... ] ) |
+{ join_order_item [ ... ] }
+
+
+ When JOIN_ORDER is used without any sublists, it
+ specifies an outer-deep join with the first advice target as the driving
+ table, joined to each subsequent advice target in turn in the order
+ specified. For instance, JOIN_ORDER(a b c) means that
+ a should be the driving table, and that it should be
+ joined first to b and then to c.
+ If there are more tables in the query than a,
+ b, and c, the rest can be joined
+ afterwards in any manner.
+
+
+
+ If a JOIN_ORDER list contains a parenthesized sublist,
+ it specifies a non-outer-deep join. The tables in the sublist must first
+ be joined to each other much as if the sublist were a top-level
+ JOIN_ORDER list, and the resulting join product must
+ then appear on the inner side of a join at the appropriate point in the
+ join order. For example, JOIN_ORDER(a (b c) d) requires
+ a plan of this form:
+
+
+
+Join
+ -> Join
+ -> Scan on a
+ -> Join
+ -> Scan on b
+ -> Scan on c
+ -> Scan on d
+
+
+
+ If a JOIN_ORDER list contains a sublist surrounded by
+ curly braces, this also specifies a non-outer-deep join. However, the join
+ order within the sublist is not constrained. For example, specifying
+ JOIN_ORDER(a {b c} d) would allow the scans of
+ b and c to be swapped in the
+ previous example, which is not allowed when parentheses are used.
+
+
+
+ Parenthesized sublists can be arbitrarily nested, but sublists surrounded
+ by curly braces cannot themselves contain sublists.
+
+
+
+ Multiple instances of JOIN_ORDER() can sometimes be
+ needed in order to fully constrain the join order. This occurs when there
+ are multiple join problems that are optimized separately by the planner.
+ This can happen due to the presence of subqueries, or because there is a
+ partitionwise join. In the latter case, each branch of the partitionwise
+ join can have its own join order, independent of every other branch.
+
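+ As a hedged sketch, advice for a query with one separately-planned
+ subquery might therefore contain two entries, the second qualified
+ with a subplan name (all names here are hypothetical):
+
+SET pg_plan_advice.advice = 'JOIN_ORDER(a b c) JOIN_ORDER(x@subplan_1 y@subplan_1)';
+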
+
+
+
+
+ Join Method Advice
+
+join_method_name(join_method_item [ ... ])
+
+where join_method_name is:
+
+{ MERGE_JOIN_MATERIALIZE | MERGE_JOIN_PLAIN | NESTED_LOOP_MATERIALIZE | NESTED_LOOP_PLAIN | HASH_JOIN }
+
+and join_method_item is:
+
+{ advice_target |
+( advice_target [ ... ] ) }
+
+
+ Join method advice specifies the table, or set of tables, that should
+ appear on the inner side of a join using the named join method. For
+ example, HASH_JOIN(a b) means that each of
+ a and b should appear on the inner
+ side of a hash join; a conforming plan must contain at least two hash
+ joins, one of which has a and nothing else on the
+ inner side, and the other of which has b and nothing
+ else on the inner side. On the other hand,
+ HASH_JOIN((a b)) means that the join product of
+ a and b should appear together
+ on the inner side of a single hash join.
+
+
+
+ Note that join method advice implies a negative join order constraint.
+ Since the named table or tables must be on the inner side of a join using
+ the specified method, none of them can be the driving table for the entire
+ join problem. Moreover, no table inside the set should be joined to any
+ table outside the set until all tables within the set have been joined to
+ each other. For example, if the advice specifies
+ HASH_JOIN((a b)) and the system begins by joining either
+ of those tables to some third table c, the resulting
+ plan could never be compliant with the request to put exactly those two
+ tables on the inner side of a hash join. When using both join order advice
+ and join method advice for the same query, it is a good idea to make sure
+ that they do not mandate incompatible join orders.
+
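+ For example, the following pair of requests is self-consistent:
+ JOIN_ORDER(f d) places d on the
+ inner side of the first join, and HASH_JOIN(d) asks
+ for exactly that table alone on the inner side of a hash join:
+
+SET pg_plan_advice.advice = 'JOIN_ORDER(f d) HASH_JOIN(d)';
+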
+
+
+
+
+ Partitionwise Advice
+
+PARTITIONWISE(partitionwise_item [ ... ])
+
+where partitionwise_item is:
+
+{ advice_target |
+( advice_target [ ... ] ) }
+
+
+ When applied to a single target, PARTITIONWISE
+ specifies that the specified table should not be part of any partitionwise
+ join. When applied to a list of targets, PARTITIONWISE
+ specifies that exactly that set of tables should be joined in
+ partitionwise fashion. Note that, regardless of what advice is specified,
+ no partitionwise joins will be possible if
+ enable_partitionwise_join = off.
+
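+ A minimal sketch, assuming a and b
+ are compatibly partitioned tables:
+
+SET enable_partitionwise_join = on;
+SET pg_plan_advice.advice = 'PARTITIONWISE((a b))';
+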
+
+
+
+
+ Semijoin Uniqueness Advice
+
+SEMIJOIN_UNIQUE(sj_unique_item [ ... ])
+SEMIJOIN_NON_UNIQUE(sj_unique_item [ ... ])
+
+where sj_unique_item is:
+
+{ advice_target |
+( advice_target [ ... ] ) }
+
+
+ The planner sometimes has a choice between implementing a semijoin
+ directly and implementing a semijoin by making the nullable side unique
+ and then performing an inner join. SEMIJOIN_UNIQUE
+ specifies the latter strategy, while SEMIJOIN_NON_UNIQUE
+ specifies the former strategy. In either case, the argument is the single
+ table or list of tables that appear beneath the nullable side of the join.
+
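+ For example, if a semijoin arises from an IN subquery
+ whose single table has the hypothetical alias s, the
+ unique-then-join strategy could be requested with:
+
+SET pg_plan_advice.advice = 'SEMIJOIN_UNIQUE(s)';
+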
+
+
+
+
+ Parallel Query Advice
+
+GATHER(gather_item [ ... ])
+GATHER_MERGE(gather_item [ ... ])
+NO_GATHER(advice_target [ ... ])
+
+where gather_item is:
+
+{ advice_target |
+( advice_target [ ... ] ) }
+
+
+ GATHER or GATHER_MERGE specifies
+ that Gather or Gather Merge,
+ respectively, should be placed on top of the single table specified as
+ a target, or on top of the join between the list of tables specified as
+ a target. This means that GATHER(a b c) is a request
+ for three different Gather nodes, while
+ GATHER((a b c)) is a request for a single
+ Gather node on top of a 3-way join.
+
+
+
+ NO_GATHER specifies that none of the tables given
+ as arguments should appear beneath a Gather or
+ Gather Merge node.
+
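+ For example, reusing the tables from the first example, a single
+ Gather placed on top of the join could be requested
+ with:
+
+SET pg_plan_advice.advice = 'GATHER((f d))';
+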
+
+
+
+
+
+
+ Advice Feedback
+
+
+ EXPLAIN provides feedback on whether supplied advice was
+ successfully applied to the query in the form of a comment on each piece
+ of supplied advice. For example:
+
+
+
+SET pg_plan_advice.advice = 'hash_join(f g) join_order(f g) index_scan(f no_such_index)';
+SET
+rhaas=# EXPLAIN (COSTS OFF) SELECT * FROM jo_fact f
+ LEFT JOIN jo_dim1 d1 ON f.dim1_id = d1.id
+ LEFT JOIN jo_dim2 d2 ON f.dim2_id = d2.id
+ WHERE val1 = 1 AND val2 = 1;
+ QUERY PLAN
+-------------------------------------------------------------------
+ Hash Join
+ Hash Cond: ((d1.id = f.dim1_id) AND (d2.id = f.dim2_id))
+ -> Nested Loop
+ -> Seq Scan on jo_dim2 d2
+ Filter: (val2 = 1)
+ -> Materialize
+ -> Seq Scan on jo_dim1 d1
+ Filter: (val1 = 1)
+ -> Hash
+ -> Seq Scan on jo_fact f
+ Supplied Plan Advice:
+ INDEX_SCAN(f no_such_index) /* matched, inapplicable, failed */
+ HASH_JOIN(f) /* matched */
+ HASH_JOIN(g) /* not matched */
+ JOIN_ORDER(f g) /* partially matched */
+
+
+
+ For this query, f is a valid advice target, but
+ g is not. Therefore, the request to place
+ f on the inner side of a hash join is listed as
+ matched, but the request to place g
+ on the inner side of a hash join is listed as
+ not matched. The JOIN_ORDER advice
+ tag involves one valid target and one invalid target, and so is listed as
+ partially matched. Note that
+ HASH_JOIN(f g) is actually a request for two logically
+ separate behaviors, whereas JOIN_ORDER(f g) is a single
+ request. When providing advice feedback, EXPLAIN shows
+ each logical request separately, together with all the feedback applicable
+ to that request type.
+
+
+
+ Advice feedback can include any of the following:
+
+
+
+
+
+
+ matched means that all of the specified advice targets
+ were observed during query planning.
+
+
+
+
+
+ partially matched means that some but not all of the
+ specified advice targets were observed during query planning.
+
+
+
+
+
+ not matched means that none of the
+ specified advice targets were observed during query planning. This may
+ happen if the advice simply doesn't match the query, or it may
+ occur if the relevant portion of the query was not planned, perhaps
+ because it was gated by a condition that was simplified to constant false.
+
+
+
+
+
+ inapplicable means that the advice tag could not
+ be applied to the advice targets for some reason. For example, this will
+ happen if the use of a nonexistent index is requested, or if an attempt
+ is made to control semijoin uniqueness for a non-semijoin.
+
+
+
+
+
+ conflicting means that two or more pieces of advice
+ request incompatible behaviors. For example, if you advise a sequential
+ scan and an index scan for the same table, both requests will be flagged
+ as conflicting. This also commonly happens if join method advice or
+ semijoin uniqueness advice implies a join order incompatible with the
+ one explicitly specified; see
+ .
+
+
+
+
+
+ failed means that the query plan does not comply with
+ the advice. This only occurs for entries that are also shown as
+ matched. It frequently occurs for entries that are
+ also marked as conflicting or
+ inapplicable. However, it can also occur when the
+ advice is valid insofar as pg_plan_advice is able
+ to determine, but the planner is not able to construct a legal
+ plan that can comply with the advice. It is important to note that the
+ sanity checks performed by pg_plan_advice are fairly
+ superficial and focused mostly on looking for logical inconsistencies in
+ the advice string; only the planner knows what will actually work.
+
+
+
+
+
+
+ All advice should be marked as exactly one of matched,
+ partially matched, or not matched.
+
+
+
+
+
+ Advice Collectors
+
+
+ pg_plan_advice can be configured to automatically
+ generate advice every time a query is planned and store the query and
+ the generated advice string either in local or shared memory.
+
+
+
+ To enable a collector, you must first set a collection limit. When the
+ number of queries for which advice has been stored exceeds the collection
+ limit, the oldest queries and the corresponding advice will be discarded.
+ Then, you must adjust a separate setting to actually enable advice
+ collection. For the local collector, set the collection limit by configuring
+ pg_plan_advice.local_collection_limit to a value
+ greater than zero, and then enable advice collection by setting
+ pg_plan_advice.local_collector = true. For the shared
+ collector, the procedure is the same, except that the names of the settings
+ are pg_plan_advice.shared_collection_limit and
+ pg_plan_advice.shared_collector. Note that the local
+ collector stores query texts and advice strings in backend-local memory,
+ and the shared collector does the same in dynamic shared memory, so
+ configuring large limits may result in considerable memory consumption.
+
+
+
+ Once the collector is enabled, you can run any queries for which you wish
+ to see the generated plan advice. Then, you can examine what has been
+ collected using whichever of
+ SELECT * FROM pg_get_collected_local_advice() or
+ SELECT * FROM pg_get_collected_shared_advice()
+ corresponds to the collector you enabled. To discard the collected advice
+ and release memory, you can call
+ pg_clear_collected_local_advice()
+ or pg_clear_collected_shared_advice().
+
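+ A sketch of the full workflow for the local collector:
+
+SET pg_plan_advice.local_collection_limit = 100;
+SET pg_plan_advice.local_collector = true;
+-- run the queries of interest, then inspect the results:
+SELECT query, advice FROM pg_get_collected_local_advice();
+-- discard the collected advice and release memory:
+SELECT pg_clear_collected_local_advice();
+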
+
+
+ In addition to the query texts and advice strings, the advice collectors
+ will also store the OID of the role that caused the query to be planned,
+ the OID of the database in which the query was planned, the query ID,
+ and the time at which the collection occurred. This module does not
+ automatically enable query ID computation; therefore, if you want the
+ query ID value to be populated in collected advice, be sure to configure
+ compute_query_id = on. Otherwise, the query ID may
+ always show as 0.
+
+
+
+
+
+ Functions
+
+
+ Note that these functions will only be available if the
+ pg_plan_advice extension has been installed in the
+ current database, which is not mandatory, since much of the functionality
+ of this module can be used without installing the extension.
+
+
+
+
+
+
+ pg_clear_collected_local_advice() returns void
+
+ pg_clear_collected_local_advice
+
+
+
+
+
+ Removes all collected query texts and advice strings from backend-local
+ memory.
+
+
+
+
+
+
+ pg_get_collected_local_advice() returns setof (id bigint,
+ userid oid, dbid oid, queryid bigint, collection_time timestamptz,
+ query text, advice text)
+
+ pg_get_collected_local_advice
+
+
+
+
+
+ Returns all query texts and advice strings stored in the local
+ advice collector.
+
+
+
+
+
+
+ pg_clear_collected_shared_advice() returns void
+
+ pg_clear_collected_shared_advice
+
+
+
+
+
+ Removes all collected query texts and advice strings from shared
+ memory.
+
+
+
+
+
+
+ pg_get_collected_shared_advice() returns setof (id bigint,
+ userid oid, dbid oid, queryid bigint, collection_time timestamptz,
+ query text, advice text)
+
+ pg_get_collected_shared_advice
+
+
+
+
+
+ Returns all query texts and advice strings stored in the shared
+ advice collector.
+
+
+
+
+
+
+
+
+
+ Configuration Parameters
+
+
+
+
+
+ pg_plan_advice.advice (string)
+
+ pg_plan_advice.advice configuration parameter
+
+
+
+
+
+ pg_plan_advice.advice is an advice string to be
+ used during query planning.
+
+
+
+
+
+
+ pg_plan_advice.always_explain_supplied_advice (boolean)
+
+ pg_plan_advice.always_explain_supplied_advice configuration parameter
+
+
+
+
+
+ pg_plan_advice.always_explain_supplied_advice causes
+ EXPLAIN to always show any supplied advice and the
+ associated
+ advice feedback.
+ The default value is true. If set to
+ false, this information will be displayed only when
+ EXPLAIN (PLAN_ADVICE) is used.
+
+
+
+
+
+
+ pg_plan_advice.always_store_advice_details (boolean)
+
+ pg_plan_advice.always_store_advice_details configuration parameter
+
+
+
+
+
+ pg_plan_advice.always_store_advice_details allows
+ EXPLAIN to show details related to plan advice even
+ when prepared queries are used. The default value is
+ false. When planning a prepared query, it is not
+ possible to know whether EXPLAIN will later be used,
+ so by default, to reduce overhead, pg_plan_advice
+ will not generate plan advice or feedback on supplied advice. This means
+ that if EXPLAIN EXECUTE is used on the prepared query,
+ it will not be able to show this information. Changing this setting to
+ true avoids this problem, but adds additional
+ overhead. It is probably a good idea to enable this option only in
+ sessions where it is needed, rather than on a system-wide basis.
+
+
+
+
+
+
+ pg_plan_advice.feedback_warnings (boolean)
+
+ pg_plan_advice.feedback_warnings configuration parameter
+
+
+
+
+
+ When pg_plan_advice.feedback_warnings is set to
+ true, a warning is emitted whenever supplied plan
+ advice is not successfully enforced. The default value is false.
+
+
+
+
+
+
+ pg_plan_advice.local_collector (boolean)
+
+ pg_plan_advice.local_collector configuration parameter
+
+
+
+
+
+ pg_plan_advice.local_collector enables the
+ local advice collector.
+ The default value is false.
+
+
+
+
+
+
+ pg_plan_advice.local_collection_limit (integer)
+
+ pg_plan_advice.local_collection_limit configuration parameter
+
+
+
+
+
+ pg_plan_advice.local_collection_limit sets the
+ maximum number of query texts and advice strings retained by the
+ local advice collector.
+ The default value is 0.
+
+
+
+
+
+
+ pg_plan_advice.shared_collector (boolean)
+
+ pg_plan_advice.shared_collector configuration parameter
+
+
+
+
+
+ pg_plan_advice.shared_collector enables the
+ shared advice collector.
+ The default value is false. Only superusers and users
+ with the appropriate SET privilege can change this
+ setting.
+
+
+
+
+
+
+ pg_plan_advice.shared_collection_limit (integer)
+
+ pg_plan_advice.shared_collection_limit configuration parameter
+
+
+
+
+
+ pg_plan_advice.shared_collection_limit sets the
+ maximum number of query texts and advice strings retained by the
+ shared advice collector.
+ The default value is 0. Only superusers and users
+ with the appropriate SET privilege can change this
+ setting.
+
+
+
+
+
+
+ pg_plan_advice.trace_mask (boolean)
+
+ pg_plan_advice.trace_mask configuration parameter
+
+
+
+
+
+ When pg_plan_advice.trace_mask is
+ true, pg_plan_advice will print
+ messages during query planning each time that
+ pg_plan_advice alters the mask of allowable query
+ plan types in response to supplied plan advice. The default value is
+ false. The messages printed by this setting are not
+ expected to be useful except for purposes of debugging this module.
+
+
+
+
+
+
+
+
+
+ Author
+
+
+ Robert Haas rhaas@postgresql.org
+
+
+
+
diff --git a/doc/src/sgml/protocol.sgml b/doc/src/sgml/protocol.sgml
index a2b528c481e..89ac680efd5 100644
--- a/doc/src/sgml/protocol.sgml
+++ b/doc/src/sgml/protocol.sgml
@@ -223,10 +223,12 @@
shows the currently supported
protocol versions.
+
+ The next table documents protocol versions that are unsupported or otherwise reserved.
+ Other Protocol Versions
+
+
+
+
+ Version
+ Supported by
+ Description
+
+
+
+
+
+ 3.9999
+ -
+ Reserved for protocol greasing. libpq may use this version, which
+ is higher than any minor version the project ever expects to use, to
+ test that servers and middleware properly implement protocol version
+ negotiation. Servers must not add special-case
+ logic for this version; they should simply compare it to their latest
+ supported version (which will always be smaller) and downgrade via a
+ NegotiateProtocolVersion message.
+
+
+ 3.1
+ -
+ Reserved. Version 3.1 has not been used by any PostgreSQL
@@ -257,15 +292,89 @@
- 3.0
- PostgreSQL 7.4 and later
-
- 2.0
- up to PostgreSQL 13
- See previous releases of
+ Obsolete. See previous releases of
the PostgreSQL documentation for
- details
+ details.
+
+
+
+
+
+
+
+ Protocol Extensions
+
+
+ Servers and clients may additionally negotiate individual extensions to the
+ protocol version in use. These are offered by the client in the startup
+ message, as specially-named parameters with a _pq_.
+ prefix. Servers reject any unknown or unsupported extensions by sending a
+ NegotiateProtocolVersion message containing the list of rejected parameter
+ names, at which point the client may choose whether to continue with the
+ connection. The following two tables
+ document the supported
+ and reserved protocol extension parameters, respectively.
+
+
+
+ Supported Protocol Extensions
+
+
+
+
+
+
+ Parameter Name
+ Values
+ Supported by
+ Description
+
+
+
+
+
+
+ (No supported protocol extensions are currently defined.)
+
+
+
+
+
+
+
+ Reserved Protocol Extensions
+
+
+
+
+ Parameter Name
+ Description
+
+
+
+
+
+ _pq_.[name]
+ Any other parameter names beginning with _pq_.
+ that are not defined above are reserved for future protocol expansion.
+ Servers must reject any that are received from a
+ client by sending a NegotiateProtocolVersion message during the
+ startup flow, and should
+ otherwise continue the connection.
+
+
+
+
+ _pq_.test_protocol_negotiation
+ Reserved for protocol greasing. libpq may send this extension to
+ test that servers and middleware properly implement protocol extension
+ negotiation. Servers must not add special-case
+ logic for this parameter; they should simply send the list of all
+ unsupported options (including this one) via a NegotiateProtocolVersion
+ message.
+
@@ -295,8 +404,8 @@
To begin a session, a frontend opens a connection to the server and sends
a startup message. This message includes the names of the user and of the
database the user wants to connect to; it also identifies the particular
- protocol version to be used. (Optionally, the startup message can include
- additional settings for run-time parameters.)
+ protocol version to be used. (Optionally, the startup message can request
+ protocol extensions and include additional settings for run-time parameters.)
The server then uses this information and
the contents of its configuration files (such as
pg_hba.conf) to determine
@@ -6151,7 +6260,9 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;"
In addition to the above, other parameters may be listed.
Parameter names beginning with _pq_. are
- reserved for use as protocol extensions, while others are
+ reserved for use as
+ protocol extensions,
+ while others are
treated as run-time parameters to be set at backend start
time. Such settings will be applied during backend start
(after parsing the command-line arguments if any) and will
diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml
index 688e23c0e90..7f538e90194 100644
--- a/doc/src/sgml/ref/pg_dump.sgml
+++ b/doc/src/sgml/ref/pg_dump.sgml
@@ -263,12 +263,10 @@ PostgreSQL documentation
- When is specified,
- pg_dump makes no attempt to dump any other
- database objects that the selected extension(s) might depend upon.
- Therefore, there is no guarantee that the results of a
- specific-extension dump can be successfully restored by themselves
- into a clean database.
+ pg_dump does not dump the extension's
+ underlying installation files (such as shared libraries or control
+ files). These must be available on the destination system for the
+ restore to succeed.
@@ -445,16 +443,6 @@ PostgreSQL documentation
below.
-
-
- When is specified, pg_dump
- makes no attempt to dump any other database objects that the selected
- schema(s) might depend upon. Therefore, there is no guarantee
- that the results of a specific-schema dump can be successfully
- restored by themselves into a clean database.
-
-
-
Non-schema objects such as large objects are not dumped when is
@@ -596,16 +584,6 @@ PostgreSQL documentation
be dumped.
-
-
- When is specified, pg_dump
- makes no attempt to dump any other database objects that the selected
- table(s) might depend upon. Therefore, there is no guarantee
- that the results of a specific-table dump can be successfully
- restored by themselves into a clean database.
-
-
-
@@ -1689,6 +1667,17 @@ CREATE DATABASE foo WITH TEMPLATE template0;
+
+ When options --extension, --schema,
+ or --table are specified, pg_dump makes no attempt to dump
+ any other database objects that the selected object(s) might depend upon.
+ Therefore, there is no guarantee that the results of a dump so generated
+ can be successfully restored by themselves into a clean database.
+ For example, if a table whose definition includes a foreign key is
+ selected to be dumped, the table referenced by the foreign key is
+ not automatically dumped.
+
+
When a dump without schema is chosen and the option
is used, pg_dump emits commands
diff --git a/doc/src/sgml/ref/pg_restore.sgml b/doc/src/sgml/ref/pg_restore.sgml
index 2c295bbf8dc..420a308a7c7 100644
--- a/doc/src/sgml/ref/pg_restore.sgml
+++ b/doc/src/sgml/ref/pg_restore.sgml
@@ -452,16 +452,6 @@ PostgreSQL documentation
specify table(s) in a particular schema.
-
-
- When is specified, pg_restore
- makes no attempt to restore any other database objects that the
- selected table(s) might depend upon. Therefore, there is no
- guarantee that a specific-table restore into a clean database will
- succeed.
-
-
-
This flag does not behave identically to the
@@ -1089,6 +1079,16 @@ PostgreSQL documentation
Notes
+
+ When options --table or --schema are specified,
+ pg_restore makes no attempt to restore
+ any other database objects that the selected table(s) or schema(s)
+ might depend upon. Therefore, there is no guarantee that a specific-table
+ restore into a clean database will succeed. For example, if a table
+ whose definition includes a foreign key is specified to be restored, the
+ table referenced by the foreign key is not automatically restored.
+
+
If your installation has any local additions to the
template1 database, be careful to load the output of
diff --git a/doc/src/sgml/ref/psql-ref.sgml b/doc/src/sgml/ref/psql-ref.sgml
index e464e3b13de..8b1d948ba05 100644
--- a/doc/src/sgml/ref/psql-ref.sgml
+++ b/doc/src/sgml/ref/psql-ref.sgml
@@ -5075,6 +5075,23 @@ testdb=> INSERT INTO my_table VALUES (:'content');
+
+ %i
+
+
+ Indicates whether the connected server is running in hot standby mode.
+ The value is shown as standby if the server is
+ currently in hot standby and reports
+ in_hot_standby as on,
+ and primary otherwise. This is useful when
+ connecting to multiple servers to quickly determine the role of
+ each connection. A value of ? is shown
+ when connected to a server running
+ PostgreSQL 13 or older.
+
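+ For example, the role of each connection could be surfaced at the
+ start of the prompt with something like:
+
+\set PROMPT1 '%i %n@%/%R%# '
+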
+
+
+
%x
diff --git a/doc/src/sgml/system-views.sgml b/doc/src/sgml/system-views.sgml
index 8b4abef8c68..e5fe423fc61 100644
--- a/doc/src/sgml/system-views.sgml
+++ b/doc/src/sgml/system-views.sgml
@@ -5045,6 +5045,45 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
non-null elements. (Null for scalar types.)
+
+
+
+ range_length_histogramanyarray
+
+
+ A histogram of the lengths of non-empty and non-null range values of an
+ expression. (Null for non-range types.)
+
+
+ This histogram is calculated using the subtype_diff
+ range function regardless of whether range bounds are inclusive.
+
+
+
+
+
+ range_empty_fracfloat4
+
+
+ Fraction of expression entries whose values are empty ranges.
+ (Null for non-range types.)
+
+
+
+
+
+ range_bounds_histogramanyarray
+
+
+ A histogram of lower and upper bounds of non-empty and non-null range
+ values. (Null for non-range types.)
+
+
+ These two histograms are represented as a single array of ranges, whose
+ lower bounds represent the histogram of lower bounds, and upper bounds
+ represent the histogram of upper bounds.
+
+
diff --git a/meson.build b/meson.build
index df907b62da3..96b3869df86 100644
--- a/meson.build
+++ b/meson.build
@@ -2911,7 +2911,7 @@ gnugetopt_dep = cc.find_library('gnugetopt', required: false)
# (i.e., allow '-' as a flag character), so use our version on those platforms
# - We want to use system's getopt_long() only if the system provides struct
# option
-always_replace_getopt = host_system in ['windows', 'cygwin', 'openbsd', 'solaris']
+always_replace_getopt = host_system in ['windows', 'cygwin', 'openbsd', 'sunos']
always_replace_getopt_long = host_system in ['windows', 'cygwin'] or not cdata.has('HAVE_STRUCT_OPTION')
# Required on BSDs
diff --git a/src/backend/Makefile b/src/backend/Makefile
index baa9b05d021..05642dc02e3 100644
--- a/src/backend/Makefile
+++ b/src/backend/Makefile
@@ -136,9 +136,6 @@ parser/gram.h: parser/gram.y
storage/lmgr/lwlocknames.h: storage/lmgr/generate-lwlocknames.pl ../include/storage/lwlocklist.h utils/activity/wait_event_names.txt
$(MAKE) -C storage/lmgr lwlocknames.h
-utils/activity/wait_event_types.h: utils/activity/generate-wait_event_types.pl utils/activity/wait_event_names.txt
- $(MAKE) -C utils/activity wait_event_types.h pgstat_wait_event.c wait_event_funcs_data.c
-
# run this unconditionally to avoid needing to know its dependencies here:
submake-catalog-headers:
$(MAKE) -C ../include/catalog generated-headers
@@ -163,18 +160,13 @@ submake-utils-headers:
.PHONY: generated-headers
-generated-headers: $(top_builddir)/src/include/storage/lwlocknames.h $(top_builddir)/src/include/utils/wait_event_types.h submake-catalog-headers submake-nodes-headers submake-utils-headers parser/gram.h
+generated-headers: $(top_builddir)/src/include/storage/lwlocknames.h submake-catalog-headers submake-nodes-headers submake-utils-headers parser/gram.h
$(top_builddir)/src/include/storage/lwlocknames.h: storage/lmgr/lwlocknames.h
prereqdir=`cd '$(dir $<)' >/dev/null && pwd` && \
cd '$(dir $@)' && rm -f $(notdir $@) && \
$(LN_S) "$$prereqdir/$(notdir $<)" .
-$(top_builddir)/src/include/utils/wait_event_types.h: utils/activity/wait_event_types.h
- prereqdir=`cd '$(dir $<)' >/dev/null && pwd` && \
- cd '$(dir $@)' && rm -f $(notdir $@) && \
- $(LN_S) "$$prereqdir/$(notdir $<)" .
-
utils/probes.o: utils/probes.d $(SUBDIROBJS)
$(DTRACE) $(DTRACEFLAGS) -C -G -s $(call expand_subsys,$^) -o $@
diff --git a/src/backend/access/common/tupdesc.c b/src/backend/access/common/tupdesc.c
index 94b4f1f9975..b69d10f0a45 100644
--- a/src/backend/access/common/tupdesc.c
+++ b/src/backend/access/common/tupdesc.c
@@ -86,25 +86,8 @@ populate_compact_attribute_internal(Form_pg_attribute src,
IsCatalogRelationOid(src->attrelid) ? ATTNULLABLE_VALID :
ATTNULLABLE_UNKNOWN;
- switch (src->attalign)
- {
- case TYPALIGN_INT:
- dst->attalignby = ALIGNOF_INT;
- break;
- case TYPALIGN_CHAR:
- dst->attalignby = sizeof(char);
- break;
- case TYPALIGN_DOUBLE:
- dst->attalignby = ALIGNOF_DOUBLE;
- break;
- case TYPALIGN_SHORT:
- dst->attalignby = ALIGNOF_SHORT;
- break;
- default:
- dst->attalignby = 0;
- elog(ERROR, "invalid attalign value: %c", src->attalign);
- break;
- }
+ /* Compute numeric alignment requirement, too */
+ dst->attalignby = typalign_to_alignby(src->attalign);
}
/*
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index d5944205db2..dfffce3e396 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -291,7 +291,7 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
SplitPageLayout *dist = NULL,
*ptr;
BlockNumber oldrlink = InvalidBlockNumber;
- GistNSN oldnsn = 0;
+ GistNSN oldnsn = InvalidXLogRecPtr;
SplitPageLayout rootpg;
bool is_rootsplit;
int npage;
@@ -654,7 +654,7 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace,
/* Start from the root */
firststack.blkno = GIST_ROOT_BLKNO;
- firststack.lsn = 0;
+ firststack.lsn = InvalidXLogRecPtr;
firststack.retry_from_parent = false;
firststack.parent = NULL;
firststack.downlinkoffnum = InvalidOffsetNumber;
diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c
index 83bda209c42..036421fc664 100644
--- a/src/backend/access/hash/hashfunc.c
+++ b/src/backend/access/hash/hashfunc.c
@@ -27,6 +27,7 @@
#include "postgres.h"
#include "common/hashfn.h"
+#include "utils/builtins.h"
#include "utils/float.h"
#include "utils/fmgrprotos.h"
#include "utils/pg_locale.h"
@@ -233,6 +234,7 @@ hashoidvector(PG_FUNCTION_ARGS)
{
oidvector *key = (oidvector *) PG_GETARG_POINTER(0);
+ check_valid_oidvector(key);
return hash_any((unsigned char *) key->values, key->dim1 * sizeof(Oid));
}
@@ -241,6 +243,7 @@ hashoidvectorextended(PG_FUNCTION_ARGS)
{
oidvector *key = (oidvector *) PG_GETARG_POINTER(0);
+ check_valid_oidvector(key);
return hash_any_extended((unsigned char *) key->values,
key->dim1 * sizeof(Oid),
PG_GETARG_INT64(1));
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index f30a56ecf55..3004964ab7f 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -111,11 +111,11 @@ static HeapTuple ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool ke
/*
- * Each tuple lock mode has a corresponding heavyweight lock, and one or two
- * corresponding MultiXactStatuses (one to merely lock tuples, another one to
- * update them). This table (and the macros below) helps us determine the
- * heavyweight lock mode and MultiXactStatus values to use for any particular
- * tuple lock strength.
+ * This table lists the heavyweight lock mode that corresponds to each tuple
+ * lock mode, as well as one or two corresponding MultiXactStatus values:
+ * .lockstatus to merely lock tuples, and .updstatus to update them. The
+ * latter is set to -1 if the corresponding tuple lock mode does not allow
+ * updating tuples -- see get_mxact_status_for_lock().
*
* These interact with InplaceUpdateTupleLock, an alias for ExclusiveLock.
*
@@ -127,29 +127,30 @@ static const struct
LOCKMODE hwlock;
int lockstatus;
int updstatus;
-}
+} tupleLockExtraInfo[] =
- tupleLockExtraInfo[MaxLockTupleMode + 1] =
{
- { /* LockTupleKeyShare */
- AccessShareLock,
- MultiXactStatusForKeyShare,
- -1 /* KeyShare does not allow updating tuples */
+ [LockTupleKeyShare] = {
+ .hwlock = AccessShareLock,
+ .lockstatus = MultiXactStatusForKeyShare,
+ /* KeyShare does not allow updating tuples */
+ .updstatus = -1
},
- { /* LockTupleShare */
- RowShareLock,
- MultiXactStatusForShare,
- -1 /* Share does not allow updating tuples */
+ [LockTupleShare] = {
+ .hwlock = RowShareLock,
+ .lockstatus = MultiXactStatusForShare,
+ /* Share does not allow updating tuples */
+ .updstatus = -1
},
- { /* LockTupleNoKeyExclusive */
- ExclusiveLock,
- MultiXactStatusForNoKeyUpdate,
- MultiXactStatusNoKeyUpdate
+ [LockTupleNoKeyExclusive] = {
+ .hwlock = ExclusiveLock,
+ .lockstatus = MultiXactStatusForNoKeyUpdate,
+ .updstatus = MultiXactStatusNoKeyUpdate
},
- { /* LockTupleExclusive */
- AccessExclusiveLock,
- MultiXactStatusForUpdate,
- MultiXactStatusUpdate
+ [LockTupleExclusive] = {
+ .hwlock = AccessExclusiveLock,
+ .lockstatus = MultiXactStatusForUpdate,
+ .updstatus = MultiXactStatusUpdate
}
};
@@ -1421,16 +1422,6 @@ heap_getnext(TableScanDesc sscan, ScanDirection direction)
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg_internal("only heap AM is supported")));
- /*
- * We don't expect direct calls to heap_getnext with valid CheckXidAlive
- * for catalog or regular tables. See detailed comments in xact.c where
- * these variables are declared. Normally we have such a check at tableam
- * level API but this is called from many places so we need to ensure it
- * here.
- */
- if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
- elog(ERROR, "unexpected heap_getnext call during logical decoding");
-
/* Note: no locking manipulations needed */
if (scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE)
diff --git a/src/backend/access/heap/heaptoast.c b/src/backend/access/heap/heaptoast.c
index e28fe47a449..6ddf6c6cf9f 100644
--- a/src/backend/access/heap/heaptoast.c
+++ b/src/backend/access/heap/heaptoast.c
@@ -768,7 +768,7 @@ heap_fetch_toast_slice(Relation toastrel, Oid valueid, int32 attrsize,
chcpyend = (sliceoffset + slicelength - 1) % TOAST_MAX_CHUNK_SIZE;
memcpy(VARDATA(result) +
- (curchunk * TOAST_MAX_CHUNK_SIZE - sliceoffset) + chcpystrt,
+ curchunk * TOAST_MAX_CHUNK_SIZE - sliceoffset + chcpystrt,
chunkdata + chcpystrt,
(chcpyend - chcpystrt) + 1);
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c
index a29be6f467b..5e89b86a62c 100644
--- a/src/backend/access/index/genam.c
+++ b/src/backend/access/index/genam.c
@@ -420,6 +420,14 @@ systable_beginscan(Relation heapRelation,
sysscan->snapshot = NULL;
}
+ /*
+ * If CheckXidAlive is set then set a flag to indicate that system table
+ * scan is in-progress. See detailed comments in xact.c where these
+ * variables are declared.
+ */
+ if (TransactionIdIsValid(CheckXidAlive))
+ bsysscan = true;
+
if (irel)
{
int i;
@@ -468,14 +476,6 @@ systable_beginscan(Relation heapRelation,
sysscan->iscan = NULL;
}
- /*
- * If CheckXidAlive is set then set a flag to indicate that system table
- * scan is in-progress. See detailed comments in xact.c where these
- * variables are declared.
- */
- if (TransactionIdIsValid(CheckXidAlive))
- bsysscan = true;
-
return sysscan;
}
@@ -707,13 +707,6 @@ systable_beginscan_ordered(Relation heapRelation,
elog(ERROR, "column is not in index");
}
- sysscan->iscan = index_beginscan(heapRelation, indexRelation,
- snapshot, NULL, nkeys, 0);
- index_rescan(sysscan->iscan, idxkey, nkeys, NULL, 0);
- sysscan->scan = NULL;
-
- pfree(idxkey);
-
/*
* If CheckXidAlive is set then set a flag to indicate that system table
* scan is in-progress. See detailed comments in xact.c where these
@@ -722,6 +715,13 @@ systable_beginscan_ordered(Relation heapRelation,
if (TransactionIdIsValid(CheckXidAlive))
bsysscan = true;
+ sysscan->iscan = index_beginscan(heapRelation, indexRelation,
+ snapshot, NULL, nkeys, 0);
+ index_rescan(sysscan->iscan, idxkey, nkeys, NULL, 0);
+ sysscan->scan = NULL;
+
+ pfree(idxkey);
+
return sysscan;
}
diff --git a/src/backend/access/nbtree/nbtcompare.c b/src/backend/access/nbtree/nbtcompare.c
index 8425805a292..1d343377e98 100644
--- a/src/backend/access/nbtree/nbtcompare.c
+++ b/src/backend/access/nbtree/nbtcompare.c
@@ -57,6 +57,7 @@
#include
+#include "utils/builtins.h"
#include "utils/fmgrprotos.h"
#include "utils/skipsupport.h"
#include "utils/sortsupport.h"
@@ -587,6 +588,9 @@ btoidvectorcmp(PG_FUNCTION_ARGS)
oidvector *b = (oidvector *) PG_GETARG_POINTER(1);
int i;
+ check_valid_oidvector(a);
+ check_valid_oidvector(b);
+
/* We arbitrarily choose to sort first by vector length */
if (a->dim1 != b->dim1)
PG_RETURN_INT32(a->dim1 - b->dim1);
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 90ab4e91b56..3a45508f62e 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -69,8 +69,8 @@
/*
* DISABLE_LEADER_PARTICIPATION disables the leader's participation in
* parallel index builds. This may be useful as a debugging aid.
-#undef DISABLE_LEADER_PARTICIPATION
*/
+/* #define DISABLE_LEADER_PARTICIPATION */
/*
* Status record for spooling/sorting phase. (Note we may have two of
diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c
index 87491796523..dfda1af412e 100644
--- a/src/backend/access/table/tableam.c
+++ b/src/backend/access/table/tableam.c
@@ -117,8 +117,8 @@ table_beginscan_catalog(Relation relation, int nkeys, ScanKeyData *key)
Oid relid = RelationGetRelid(relation);
Snapshot snapshot = RegisterSnapshot(GetCatalogSnapshot(relid));
- return relation->rd_tableam->scan_begin(relation, snapshot, nkeys, key,
- NULL, flags);
+ return table_beginscan_common(relation, snapshot, nkeys, key,
+ NULL, flags);
}
@@ -184,8 +184,8 @@ table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan)
snapshot = SnapshotAny;
}
- return relation->rd_tableam->scan_begin(relation, snapshot, 0, NULL,
- pscan, flags);
+ return table_beginscan_common(relation, snapshot, 0, NULL,
+ pscan, flags);
}
TableScanDesc
@@ -214,8 +214,8 @@ table_beginscan_parallel_tidrange(Relation relation,
snapshot = SnapshotAny;
}
- sscan = relation->rd_tableam->scan_begin(relation, snapshot, 0, NULL,
- pscan, flags);
+ sscan = table_beginscan_common(relation, snapshot, 0, NULL,
+ pscan, flags);
return sscan;
}
@@ -269,14 +269,6 @@ table_tuple_get_latest_tid(TableScanDesc scan, ItemPointer tid)
Relation rel = scan->rs_rd;
const TableAmRoutine *tableam = rel->rd_tableam;
- /*
- * We don't expect direct calls to table_tuple_get_latest_tid with valid
- * CheckXidAlive for catalog or regular tables. See detailed comments in
- * xact.c where these variables are declared.
- */
- if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
- elog(ERROR, "unexpected table_tuple_get_latest_tid call during logical decoding");
-
/*
* Since this can be called with user-supplied TID, don't trust the input
* too much.
diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c
index 01a89104ef0..fe00488487d 100644
--- a/src/backend/access/transam/parallel.c
+++ b/src/backend/access/transam/parallel.c
@@ -357,7 +357,7 @@ InitializeParallelDSM(ParallelContext *pcxt)
fps->stmt_ts = GetCurrentStatementStartTimestamp();
fps->serializable_xact_handle = ShareSerializableXact();
SpinLockInit(&fps->mutex);
- fps->last_xlog_end = 0;
+ fps->last_xlog_end = InvalidXLogRecPtr;
shm_toc_insert(pcxt->toc, PARALLEL_KEY_FIXED, fps);
/* We can skip the rest of this if we're not budgeting for any workers. */
@@ -530,7 +530,7 @@ ReinitializeParallelDSM(ParallelContext *pcxt)
/* Reset a few bits of fixed parallel state to a clean state. */
fps = shm_toc_lookup(pcxt->toc, PARALLEL_KEY_FIXED, false);
- fps->last_xlog_end = 0;
+ fps->last_xlog_end = InvalidXLogRecPtr;
/* Recreate error queues (if they exist). */
if (pcxt->nworkers > 0)
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index 601ce3faa64..eabc4d48208 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -470,7 +470,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, FullTransactionId fxid,
proc->databaseId = databaseid;
proc->roleId = owner;
proc->tempNamespaceId = InvalidOid;
- proc->isRegularBackend = false;
+ proc->backendType = B_INVALID;
proc->lwWaiting = LW_WS_NOT_WAITING;
proc->lwWaitMode = 0;
proc->waitLock = NULL;
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 16614e152dd..13ec6225b85 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -2060,7 +2060,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
/* Have to write it ourselves */
TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
WriteRqst.Write = OldPageRqstPtr;
- WriteRqst.Flush = 0;
+ WriteRqst.Flush = InvalidXLogRecPtr;
XLogWrite(WriteRqst, tli, false);
LWLockRelease(WALWriteLock);
pgWalUsage.wal_buffers_full++;
@@ -3077,7 +3077,7 @@ XLogBackgroundFlush(void)
else
{
/* no flushing, this time round */
- WriteRqst.Flush = 0;
+ WriteRqst.Flush = InvalidXLogRecPtr;
}
#ifdef WAL_DEBUG
@@ -5207,7 +5207,7 @@ BootStrapXLOG(uint32 data_checksum_version)
/* Insert the initial checkpoint record */
recptr = ((char *) page + SizeOfXLogLongPHD);
record = (XLogRecord *) recptr;
- record->xl_prev = 0;
+ record->xl_prev = InvalidXLogRecPtr;
record->xl_xid = InvalidTransactionId;
record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint);
record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
diff --git a/src/backend/access/transam/xlogprefetcher.c b/src/backend/access/transam/xlogprefetcher.c
index 3c3f067aafb..24cfa96d737 100644
--- a/src/backend/access/transam/xlogprefetcher.c
+++ b/src/backend/access/transam/xlogprefetcher.c
@@ -967,7 +967,7 @@ XLogPrefetcherBeginRead(XLogPrefetcher *prefetcher, XLogRecPtr recPtr)
/* Book-keeping to avoid readahead on first read. */
prefetcher->begin_ptr = recPtr;
- prefetcher->no_readahead_until = 0;
+ prefetcher->no_readahead_until = InvalidXLogRecPtr;
/* This will forget about any queued up records in the decoder. */
XLogBeginRead(prefetcher->reader, recPtr);
diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index a81dcbb5d79..4fc37a031d9 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -261,7 +261,7 @@ static TimestampTz XLogReceiptTime = 0;
static XLogSource XLogReceiptSource = XLOG_FROM_ANY;
/* Local copy of WalRcv->flushedUpto */
-static XLogRecPtr flushedUpto = 0;
+static XLogRecPtr flushedUpto = InvalidXLogRecPtr;
static TimeLineID receiveTLI = 0;
/*
@@ -3918,7 +3918,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
PrimarySlotName,
wal_receiver_create_temp_slot);
- flushedUpto = 0;
+ flushedUpto = InvalidXLogRecPtr;
}
/*
@@ -4096,7 +4096,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
static int
emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
{
- static XLogRecPtr lastComplaint = 0;
+ static XLogRecPtr lastComplaint = InvalidXLogRecPtr;
if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
{
diff --git a/src/backend/backup/walsummary.c b/src/backend/backup/walsummary.c
index 21164faac7e..4cd1824fbc6 100644
--- a/src/backend/backup/walsummary.c
+++ b/src/backend/backup/walsummary.c
@@ -214,7 +214,7 @@ OpenWalSummaryFile(WalSummaryFile *ws, bool missing_ok)
LSN_FORMAT_ARGS(ws->end_lsn));
file = PathNameOpenFile(path, O_RDONLY);
- if (file < 0 && (errno != EEXIST || !missing_ok))
+ if (file < 0 && (errno != ENOENT || !missing_ok))
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not open file \"%s\": %m", path)));
@@ -251,7 +251,7 @@ RemoveWalSummaryIfOlderThan(WalSummaryFile *ws, time_t cutoff_time)
if (unlink(path) != 0)
ereport(ERROR,
(errcode_for_file_access(),
- errmsg("could not stat file \"%s\": %m", path)));
+ errmsg("could not remove file \"%s\": %m", path)));
ereport(DEBUG2,
(errmsg_internal("removing file \"%s\"", path)));
}
diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index dd57624b4f9..7d32cd0e159 100644
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -131,13 +131,13 @@ static const struct typinfo TypInfo[] = {
F_OIDVECTORIN, F_OIDVECTOROUT},
{"_int4", INT4ARRAYOID, INT4OID, -1, false, TYPALIGN_INT, TYPSTORAGE_EXTENDED, InvalidOid,
F_ARRAY_IN, F_ARRAY_OUT},
- {"_text", 1009, TEXTOID, -1, false, TYPALIGN_INT, TYPSTORAGE_EXTENDED, DEFAULT_COLLATION_OID,
+ {"_text", TEXTARRAYOID, TEXTOID, -1, false, TYPALIGN_INT, TYPSTORAGE_EXTENDED, DEFAULT_COLLATION_OID,
F_ARRAY_IN, F_ARRAY_OUT},
- {"_oid", 1028, OIDOID, -1, false, TYPALIGN_INT, TYPSTORAGE_EXTENDED, InvalidOid,
+ {"_oid", OIDARRAYOID, OIDOID, -1, false, TYPALIGN_INT, TYPSTORAGE_EXTENDED, InvalidOid,
F_ARRAY_IN, F_ARRAY_OUT},
- {"_char", 1002, CHAROID, -1, false, TYPALIGN_INT, TYPSTORAGE_EXTENDED, InvalidOid,
+ {"_char", CHARARRAYOID, CHAROID, -1, false, TYPALIGN_INT, TYPSTORAGE_EXTENDED, InvalidOid,
F_ARRAY_IN, F_ARRAY_OUT},
- {"_aclitem", 1034, ACLITEMOID, -1, false, TYPALIGN_INT, TYPSTORAGE_EXTENDED, InvalidOid,
+ {"_aclitem", ACLITEMARRAYOID, ACLITEMOID, -1, false, TYPALIGN_DOUBLE, TYPSTORAGE_EXTENDED, InvalidOid,
F_ARRAY_IN, F_ARRAY_OUT}
};
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index 606434823cf..a6ed9849e77 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -2635,6 +2635,7 @@ AddRelationNewConstraints(Relation rel,
* requested validity.
*/
if (AdjustNotNullInheritance(RelationGetRelid(rel), colnum,
+ cdef->conname,
is_local, cdef->is_no_inherit,
cdef->skip_validation))
continue;
diff --git a/src/backend/catalog/pg_constraint.c b/src/backend/catalog/pg_constraint.c
index cbbcf166e45..b12765ae691 100644
--- a/src/backend/catalog/pg_constraint.c
+++ b/src/backend/catalog/pg_constraint.c
@@ -731,14 +731,15 @@ extractNotNullColumn(HeapTuple constrTup)
* If a constraint exists but the connoinherit flag is not what the caller
* wants, throw an error about the incompatibility. If the desired
* constraint is valid but the existing constraint is not valid, also
- * throw an error about that (the opposite case is acceptable).
+ * throw an error about that (the opposite case is acceptable). If
+ * the proposed constraint has a different name, also throw an error.
*
* If everything checks out, we adjust conislocal/coninhcount and return
* true. If is_local is true we flip conislocal true, or do nothing if
* it's already true; otherwise we increment coninhcount by 1.
*/
bool
-AdjustNotNullInheritance(Oid relid, AttrNumber attnum,
+AdjustNotNullInheritance(Oid relid, AttrNumber attnum, const char *new_conname,
bool is_local, bool is_no_inherit, bool is_notvalid)
{
HeapTuple tup;
@@ -777,6 +778,22 @@ AdjustNotNullInheritance(Oid relid, AttrNumber attnum,
errhint("You might need to validate it using %s.",
"ALTER TABLE ... VALIDATE CONSTRAINT"));
+ /*
+	 * If a name was specified for a new constraint that is being defined
+	 * locally (i.e., not passed down via inheritance), verify that the
+	 * existing constraint has the same name, and throw an error if it
+	 * does not.  Names of inherited constraints are ignored because they
+	 * are not directly user-specified, so matching is not important.
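+	 *
+	 * For example (hypothetical names), the second command below fails
+	 * because the column already has a differently named not-null
+	 * constraint:
+	 *
+	 *		CREATE TABLE t (a int CONSTRAINT nn NOT NULL);
+	 *		ALTER TABLE t ADD CONSTRAINT nn2 NOT NULL a;	-- error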
+ */
+ if (is_local && new_conname &&
+ strcmp(new_conname, NameStr(conform->conname)) != 0)
+ ereport(ERROR,
+ errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("cannot create not-null constraint \"%s\" on column \"%s\" of table \"%s\"",
+ new_conname, get_attname(relid, attnum, false), get_rel_name(relid)),
+ errdetail("A not-null constraint named \"%s\" already exists for this column.",
+ NameStr(conform->conname)));
+
if (!is_local)
{
if (pg_add_s16_overflow(conform->coninhcount, 1,
diff --git a/src/backend/catalog/pg_depend.c b/src/backend/catalog/pg_depend.c
index 55309d16f15..07c2d41c189 100644
--- a/src/backend/catalog/pg_depend.c
+++ b/src/backend/catalog/pg_depend.c
@@ -23,12 +23,14 @@
#include "catalog/pg_constraint.h"
#include "catalog/pg_depend.h"
#include "catalog/pg_extension.h"
+#include "catalog/pg_type.h"
#include "catalog/partition.h"
#include "commands/extension.h"
#include "miscadmin.h"
#include "utils/fmgroids.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
+#include "utils/syscache.h"
static bool isObjectPinned(const ObjectAddress *object);
@@ -813,6 +815,77 @@ getAutoExtensionsOfObject(Oid classId, Oid objectId)
return result;
}
+/*
+ * Look up a type belonging to an extension.
+ *
+ * Returns the type's OID, or InvalidOid if not found.
+ *
+ * Notice that the type is specified by name only, without a schema.
+ * That's because this will typically be used by relocatable extensions
+ * which can't make a priori assumptions about which schema their objects
+ * are in. As long as the extension only defines one type of this name,
+ * the answer is unique anyway.
+ *
+ * We might later add the ability to look up functions, operators, etc.
+ */
+Oid
+getExtensionType(Oid extensionOid, const char *typname)
+{
+ Oid result = InvalidOid;
+ Relation depRel;
+ ScanKeyData key[3];
+ SysScanDesc scan;
+ HeapTuple tup;
+
+ depRel = table_open(DependRelationId, AccessShareLock);
+
+ ScanKeyInit(&key[0],
+ Anum_pg_depend_refclassid,
+ BTEqualStrategyNumber, F_OIDEQ,
+ ObjectIdGetDatum(ExtensionRelationId));
+ ScanKeyInit(&key[1],
+ Anum_pg_depend_refobjid,
+ BTEqualStrategyNumber, F_OIDEQ,
+ ObjectIdGetDatum(extensionOid));
+ ScanKeyInit(&key[2],
+ Anum_pg_depend_refobjsubid,
+ BTEqualStrategyNumber, F_INT4EQ,
+ Int32GetDatum(0));
+
+ scan = systable_beginscan(depRel, DependReferenceIndexId, true,
+ NULL, 3, key);
+
+ while (HeapTupleIsValid(tup = systable_getnext(scan)))
+ {
+ Form_pg_depend depform = (Form_pg_depend) GETSTRUCT(tup);
+
+ if (depform->classid == TypeRelationId &&
+ depform->deptype == DEPENDENCY_EXTENSION)
+ {
+ Oid typoid = depform->objid;
+ HeapTuple typtup;
+
+ typtup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(typoid));
+ if (!HeapTupleIsValid(typtup))
+ continue; /* should we throw an error? */
+ if (strcmp(NameStr(((Form_pg_type) GETSTRUCT(typtup))->typname),
+ typname) == 0)
+ {
+ result = typoid;
+ ReleaseSysCache(typtup);
+ break; /* no need to keep searching */
+ }
+ ReleaseSysCache(typtup);
+ }
+ }
+
+ systable_endscan(scan);
+
+ table_close(depRel, AccessShareLock);
+
+ return result;
+}
+
/*
* Detect whether a sequence is marked as "owned" by a column
*
diff --git a/src/backend/catalog/pg_proc.c b/src/backend/catalog/pg_proc.c
index acff7a0096d..5df4b3f7a91 100644
--- a/src/backend/catalog/pg_proc.c
+++ b/src/backend/catalog/pg_proc.c
@@ -1206,7 +1206,7 @@ match_prosrc_to_literal(const char *prosrc, const char *literal,
if (cursorpos > 0)
newcp++;
}
- chlen = pg_mblen(prosrc);
+ chlen = pg_mblen_cstr(prosrc);
if (strncmp(prosrc, literal, chlen) != 0)
goto fail;
prosrc += chlen;
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 7553f31fef0..1ea8f1faa9e 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -363,7 +363,28 @@ CREATE VIEW pg_stats_ext_exprs WITH (security_barrier) AS
WHEN (stat.a).stakind3 = 5 THEN (stat.a).stanumbers3
WHEN (stat.a).stakind4 = 5 THEN (stat.a).stanumbers4
WHEN (stat.a).stakind5 = 5 THEN (stat.a).stanumbers5
- END) AS elem_count_histogram
+ END) AS elem_count_histogram,
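+        -- Stakind codes 6 and 7 are the range-type statistics kinds
+        -- (STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM and
+        -- STATISTIC_KIND_BOUNDS_HISTOGRAM); for kind 6, stanumbers[1]
+        -- holds the fraction of empty ranges.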
+ (CASE
+ WHEN (stat.a).stakind1 = 6 THEN (stat.a).stavalues1
+ WHEN (stat.a).stakind2 = 6 THEN (stat.a).stavalues2
+ WHEN (stat.a).stakind3 = 6 THEN (stat.a).stavalues3
+ WHEN (stat.a).stakind4 = 6 THEN (stat.a).stavalues4
+ WHEN (stat.a).stakind5 = 6 THEN (stat.a).stavalues5
+ END) AS range_length_histogram,
+ (CASE
+ WHEN (stat.a).stakind1 = 6 THEN (stat.a).stanumbers1[1]
+ WHEN (stat.a).stakind2 = 6 THEN (stat.a).stanumbers2[1]
+ WHEN (stat.a).stakind3 = 6 THEN (stat.a).stanumbers3[1]
+ WHEN (stat.a).stakind4 = 6 THEN (stat.a).stanumbers4[1]
+ WHEN (stat.a).stakind5 = 6 THEN (stat.a).stanumbers5[1]
+ END) AS range_empty_frac,
+ (CASE
+ WHEN (stat.a).stakind1 = 7 THEN (stat.a).stavalues1
+ WHEN (stat.a).stakind2 = 7 THEN (stat.a).stavalues2
+ WHEN (stat.a).stakind3 = 7 THEN (stat.a).stavalues3
+ WHEN (stat.a).stakind4 = 7 THEN (stat.a).stavalues4
+ WHEN (stat.a).stakind5 = 7 THEN (stat.a).stavalues5
+ END) AS range_bounds_histogram
FROM pg_statistic_ext s JOIN pg_class c ON (c.oid = s.stxrelid)
LEFT JOIN pg_statistic_ext_data sd ON (s.oid = sd.stxoid)
LEFT JOIN pg_namespace cn ON (cn.oid = c.relnamespace)
diff --git a/src/backend/commands/comment.c b/src/backend/commands/comment.c
index caacb17e5d7..771aba2a69f 100644
--- a/src/backend/commands/comment.c
+++ b/src/backend/commands/comment.c
@@ -41,6 +41,7 @@ CommentObject(CommentStmt *stmt)
{
Relation relation;
ObjectAddress address = InvalidObjectAddress;
+ bool missing_ok;
/*
* When loading a dump, we may see a COMMENT ON DATABASE for the old name
@@ -63,6 +64,14 @@ CommentObject(CommentStmt *stmt)
}
}
+ /*
+ * During binary upgrade, allow nonexistent large objects so that we don't
+ * have to create them during schema restoration. pg_upgrade will
+ * transfer the contents of pg_largeobject_metadata via COPY or by
+ * copying/linking its files from the old cluster later on.
+ */
+ missing_ok = IsBinaryUpgrade && stmt->objtype == OBJECT_LARGEOBJECT;
+
/*
* Translate the parser representation that identifies this object into an
* ObjectAddress. get_object_address() will throw an error if the object
@@ -70,7 +79,8 @@ CommentObject(CommentStmt *stmt)
* against concurrent DROP operations.
*/
address = get_object_address(stmt->objtype, stmt->object,
- &relation, ShareUpdateExclusiveLock, false);
+ &relation, ShareUpdateExclusiveLock,
+ missing_ok);
/* Require ownership of the target object. */
check_object_ownership(GetUserId(), stmt->objtype, address,
diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c
index 5868a7fa11f..94d6f415a06 100644
--- a/src/backend/commands/copyfromparse.c
+++ b/src/backend/commands/copyfromparse.c
@@ -249,7 +249,9 @@ CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
switch (cstate->copy_src)
{
case COPY_FILE:
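+			/* report the file read as a wait event, visible in pg_stat_activity */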
+ pgstat_report_wait_start(WAIT_EVENT_COPY_FROM_READ);
bytesread = fread(databuf, 1, maxread, cstate->copy_file);
+ pgstat_report_wait_end();
if (ferror(cstate->copy_file))
ereport(ERROR,
(errcode_for_file_access(),
diff --git a/src/backend/commands/copyto.c b/src/backend/commands/copyto.c
index 4ab4a3893d5..9ceeff6d99e 100644
--- a/src/backend/commands/copyto.c
+++ b/src/backend/commands/copyto.c
@@ -454,6 +454,7 @@ CopySendEndOfRow(CopyToState cstate)
switch (cstate->copy_dest)
{
case COPY_FILE:
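+			/* report the file write as a wait event, visible in pg_stat_activity */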
+ pgstat_report_wait_start(WAIT_EVENT_COPY_TO_WRITE);
if (fwrite(fe_msgbuf->data, fe_msgbuf->len, 1,
cstate->copy_file) != 1 ||
ferror(cstate->copy_file))
@@ -486,6 +487,7 @@ CopySendEndOfRow(CopyToState cstate)
(errcode_for_file_access(),
errmsg("could not write to COPY file: %m")));
}
+ pgstat_report_wait_end();
break;
case COPY_FRONTEND:
/* Dump the accumulated row as one CopyData message */
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 87949054f26..33311760df7 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -60,6 +60,7 @@
#include "storage/lmgr.h"
#include "storage/md.h"
#include "storage/procarray.h"
+#include "storage/procsignal.h"
#include "storage/smgr.h"
#include "utils/acl.h"
#include "utils/builtins.h"
diff --git a/src/backend/commands/extension.c b/src/backend/commands/extension.c
index 596105ee078..81f24615d51 100644
--- a/src/backend/commands/extension.c
+++ b/src/backend/commands/extension.c
@@ -45,6 +45,7 @@
#include "catalog/pg_depend.h"
#include "catalog/pg_extension.h"
#include "catalog/pg_namespace.h"
+#include "catalog/pg_proc.h"
#include "catalog/pg_type.h"
#include "commands/alter.h"
#include "commands/comment.h"
@@ -62,6 +63,7 @@
#include "utils/builtins.h"
#include "utils/conffiles.h"
#include "utils/fmgroids.h"
+#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/rel.h"
@@ -141,7 +143,26 @@ typedef struct
char *loc;
} ExtensionLocation;
+/*
+ * Cache structure for get_function_sibling_type (and maybe later,
+ * allied lookup functions).
+ */
+typedef struct ExtensionSiblingCache
+{
+ struct ExtensionSiblingCache *next; /* list link */
+ /* lookup key: requesting function's OID and type name */
+ Oid reqfuncoid;
+ const char *typname;
+ bool valid; /* is entry currently valid? */
+ uint32 exthash; /* cache hash of owning extension's OID */
+ Oid typeoid; /* OID associated with typname */
+} ExtensionSiblingCache;
+
+/* Head of linked list of ExtensionSiblingCache structs */
+static ExtensionSiblingCache *ext_sibling_list = NULL;
+
/* Local functions */
+static void ext_sibling_callback(Datum arg, int cacheid, uint32 hashvalue);
static List *find_update_path(List *evi_list,
ExtensionVersionInfo *evi_start,
ExtensionVersionInfo *evi_target,
@@ -263,6 +284,114 @@ get_extension_schema(Oid ext_oid)
return result;
}
+/*
+ * get_function_sibling_type - find a type belonging to same extension as func
+ *
+ * Returns the type's OID, or InvalidOid if not found.
+ *
+ * This is useful in extensions, which won't have fixed object OIDs.
+ * We work from the calling function's own OID, which it can get from its
+ * FunctionCallInfo parameter, and look up the owning extension and thence
+ * a type belonging to the same extension.
+ *
+ * Notice that the type is specified by name only, without a schema.
+ * That's because this will typically be used by relocatable extensions
+ * which can't make a priori assumptions about which schema their objects
+ * are in. As long as the extension only defines one type of this name,
+ * the answer is unique anyway.
+ *
+ * We might later add the ability to look up functions, operators, etc.
+ *
+ * This code is simply a frontend for some pg_depend lookups. Those lookups
+ * are fairly expensive, so we provide a simple cache facility. We assume
+ * that the passed typname is actually a C constant, or at least permanently
+ * allocated, so that we need not copy that string.
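+ *
+ * For example (hypothetical type name), a C function can locate a type
+ * defined by its own extension using its own OID, available from its
+ * FunctionCallInfo:
+ *
+ *		typoid = get_function_sibling_type(fcinfo->flinfo->fn_oid,
+ *										   "my_type");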
+ */
+Oid
+get_function_sibling_type(Oid funcoid, const char *typname)
+{
+ ExtensionSiblingCache *cache_entry;
+ Oid extoid;
+ Oid typeoid;
+
+ /*
+ * See if we have the answer cached. Someday there may be enough callers
+ * to justify a hash table, but for now, a simple linked list is fine.
+ */
+ for (cache_entry = ext_sibling_list; cache_entry != NULL;
+ cache_entry = cache_entry->next)
+ {
+ if (funcoid == cache_entry->reqfuncoid &&
+ strcmp(typname, cache_entry->typname) == 0)
+ break;
+ }
+ if (cache_entry && cache_entry->valid)
+ return cache_entry->typeoid;
+
+ /*
+ * Nope, so do the expensive lookups. We do not expect failures, so we do
+ * not cache negative results.
+ */
+ extoid = getExtensionOfObject(ProcedureRelationId, funcoid);
+ if (!OidIsValid(extoid))
+ return InvalidOid;
+ typeoid = getExtensionType(extoid, typname);
+ if (!OidIsValid(typeoid))
+ return InvalidOid;
+
+ /*
+ * Build, or revalidate, cache entry.
+ */
+ if (cache_entry == NULL)
+ {
+ /* Register invalidation hook if this is first entry */
+ if (ext_sibling_list == NULL)
+ CacheRegisterSyscacheCallback(EXTENSIONOID,
+ ext_sibling_callback,
+ (Datum) 0);
+
+		/* Zero the space so that the valid flag starts out false */
+ cache_entry = (ExtensionSiblingCache *)
+ MemoryContextAllocZero(CacheMemoryContext,
+ sizeof(ExtensionSiblingCache));
+ cache_entry->next = ext_sibling_list;
+ ext_sibling_list = cache_entry;
+ }
+
+ cache_entry->reqfuncoid = funcoid;
+ cache_entry->typname = typname;
+ cache_entry->exthash = GetSysCacheHashValue1(EXTENSIONOID,
+ ObjectIdGetDatum(extoid));
+ cache_entry->typeoid = typeoid;
+ /* Mark it valid only once it's fully populated */
+ cache_entry->valid = true;
+
+ return typeoid;
+}
+
+/*
+ * ext_sibling_callback
+ * Syscache inval callback function for EXTENSIONOID cache
+ *
+ * It seems sufficient to invalidate ExtensionSiblingCache entries when
+ * the owning extension's pg_extension entry is modified or deleted.
+ * Neither a requesting function's OID, nor the OID of the object it's
+ * looking for, could change without an extension update or drop/recreate.
+ */
+static void
+ext_sibling_callback(Datum arg, int cacheid, uint32 hashvalue)
+{
+ ExtensionSiblingCache *cache_entry;
+
+ for (cache_entry = ext_sibling_list; cache_entry != NULL;
+ cache_entry = cache_entry->next)
+ {
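+		/* hashvalue == 0 means a whole-cache reset: invalidate everything */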
+ if (hashvalue == 0 ||
+ cache_entry->exthash == hashvalue)
+ cache_entry->valid = false;
+ }
+}
+
/*
* Utility functions to check validity of extension and version names
*/
@@ -1191,7 +1320,7 @@ execute_extension_script(Oid extensionOid, ExtensionControlFile *control,
(void) set_config_option("client_min_messages", "warning",
PGC_USERSET, PGC_S_SESSION,
GUC_ACTION_SAVE, true, 0, false);
- if (log_min_messages < WARNING)
+ if (log_min_messages[MyBackendType] < WARNING)
(void) set_config_option_ext("log_min_messages", "warning",
PGC_SUSET, PGC_S_SESSION,
BOOTSTRAP_SUPERUSERID,
@@ -2557,9 +2686,9 @@ extension_file_exists(const char *extensionName)
locations = get_extension_control_directories();
- foreach_ptr(char, location, locations)
+ foreach_ptr(ExtensionLocation, location, locations)
{
- dir = AllocateDir(location);
+ dir = AllocateDir(location->loc);
/*
* If the control directory doesn't exist, we want to silently return
@@ -2571,7 +2700,7 @@ extension_file_exists(const char *extensionName)
}
else
{
- while ((de = ReadDir(dir, location)) != NULL)
+ while ((de = ReadDir(dir, location->loc)) != NULL)
{
char *extname;
diff --git a/src/backend/commands/operatorcmds.c b/src/backend/commands/operatorcmds.c
index 9f7e0ed17ce..3e7b09b3494 100644
--- a/src/backend/commands/operatorcmds.c
+++ b/src/backend/commands/operatorcmds.c
@@ -276,7 +276,6 @@ ValidateRestrictionEstimator(List *restrictionName)
{
Oid typeId[4];
Oid restrictionOid;
- AclResult aclresult;
typeId[0] = INTERNALOID; /* PlannerInfo */
typeId[1] = OIDOID; /* operator OID */
@@ -292,11 +291,33 @@ ValidateRestrictionEstimator(List *restrictionName)
errmsg("restriction estimator function %s must return type %s",
NameListToString(restrictionName), "float8")));
- /* Require EXECUTE rights for the estimator */
- aclresult = object_aclcheck(ProcedureRelationId, restrictionOid, GetUserId(), ACL_EXECUTE);
- if (aclresult != ACLCHECK_OK)
- aclcheck_error(aclresult, OBJECT_FUNCTION,
- NameListToString(restrictionName));
+ /*
+ * If the estimator is not a built-in function, require superuser
+ * privilege to install it. This protects against using something that is
+ * not a restriction estimator or has hard-wired assumptions about what
+ * data types it is working with. (Built-in estimators are required to
+ * defend themselves adequately against unexpected data type choices, but
+ * it seems impractical to expect that of extensions' estimators.)
+ *
+ * If it is built-in, only require EXECUTE rights.
+ */
+ if (restrictionOid >= FirstGenbkiObjectId)
+ {
+ if (!superuser())
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("must be superuser to specify a non-built-in restriction estimator function")));
+ }
+ else
+ {
+ AclResult aclresult;
+
+ aclresult = object_aclcheck(ProcedureRelationId, restrictionOid,
+ GetUserId(), ACL_EXECUTE);
+ if (aclresult != ACLCHECK_OK)
+ aclcheck_error(aclresult, OBJECT_FUNCTION,
+ NameListToString(restrictionName));
+ }
return restrictionOid;
}
@@ -312,7 +333,6 @@ ValidateJoinEstimator(List *joinName)
Oid typeId[5];
Oid joinOid;
Oid joinOid2;
- AclResult aclresult;
typeId[0] = INTERNALOID; /* PlannerInfo */
typeId[1] = OIDOID; /* operator OID */
@@ -350,11 +370,24 @@ ValidateJoinEstimator(List *joinName)
errmsg("join estimator function %s must return type %s",
NameListToString(joinName), "float8")));
- /* Require EXECUTE rights for the estimator */
- aclresult = object_aclcheck(ProcedureRelationId, joinOid, GetUserId(), ACL_EXECUTE);
- if (aclresult != ACLCHECK_OK)
- aclcheck_error(aclresult, OBJECT_FUNCTION,
- NameListToString(joinName));
+ /* privilege checks are the same as in ValidateRestrictionEstimator */
+ if (joinOid >= FirstGenbkiObjectId)
+ {
+ if (!superuser())
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("must be superuser to specify a non-built-in join estimator function")));
+ }
+ else
+ {
+ AclResult aclresult;
+
+ aclresult = object_aclcheck(ProcedureRelationId, joinOid,
+ GetUserId(), ACL_EXECUTE);
+ if (aclresult != ACLCHECK_OK)
+ aclcheck_error(aclresult, OBJECT_FUNCTION,
+ NameListToString(joinName));
+ }
return joinOid;
}
diff --git a/src/backend/commands/seclabel.c b/src/backend/commands/seclabel.c
index 4160f5b6855..5b80396723c 100644
--- a/src/backend/commands/seclabel.c
+++ b/src/backend/commands/seclabel.c
@@ -118,6 +118,7 @@ ExecSecLabelStmt(SecLabelStmt *stmt)
ObjectAddress address;
Relation relation;
ListCell *lc;
+ bool missing_ok;
/*
* Find the named label provider, or if none specified, check whether
@@ -159,6 +160,14 @@ ExecSecLabelStmt(SecLabelStmt *stmt)
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("security labels are not supported for this type of object")));
+ /*
+ * During binary upgrade, allow nonexistent large objects so that we don't
+ * have to create them during schema restoration. pg_upgrade will
+ * transfer the contents of pg_largeobject_metadata via COPY or by
+ * copying/linking its files from the old cluster later on.
+ */
+ missing_ok = IsBinaryUpgrade && stmt->objtype == OBJECT_LARGEOBJECT;
+
/*
* Translate the parser representation which identifies this object into
* an ObjectAddress. get_object_address() will throw an error if the
@@ -166,7 +175,8 @@ ExecSecLabelStmt(SecLabelStmt *stmt)
* guard against concurrent modifications.
*/
address = get_object_address(stmt->objtype, stmt->object,
- &relation, ShareUpdateExclusiveLock, false);
+ &relation, ShareUpdateExclusiveLock,
+ missing_ok);
/* Require ownership of the target object. */
check_object_ownership(GetUserId(), stmt->objtype, address,
diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c
index 0b064891932..3511a4ec0fd 100644
--- a/src/backend/commands/tablespace.c
+++ b/src/backend/commands/tablespace.c
@@ -70,6 +70,7 @@
#include "miscadmin.h"
#include "postmaster/bgwriter.h"
#include "storage/fd.h"
+#include "storage/procsignal.h"
#include "storage/standby.h"
#include "utils/acl.h"
#include "utils/builtins.h"
diff --git a/src/backend/executor/execExprInterp.c b/src/backend/executor/execExprInterp.c
index a7a5ac1e83b..61ff5ddc74c 100644
--- a/src/backend/executor/execExprInterp.c
+++ b/src/backend/executor/execExprInterp.c
@@ -4032,6 +4032,7 @@ ExecEvalScalarArrayOp(ExprState *state, ExprEvalStep *op)
int16 typlen;
bool typbyval;
char typalign;
+ uint8 typalignby;
char *s;
bits8 *bitmap;
int bitmask;
@@ -4086,6 +4087,7 @@ ExecEvalScalarArrayOp(ExprState *state, ExprEvalStep *op)
typlen = op->d.scalararrayop.typlen;
typbyval = op->d.scalararrayop.typbyval;
typalign = op->d.scalararrayop.typalign;
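+	/* convert the alignment code to numeric form once, for the loop below */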
+ typalignby = typalign_to_alignby(typalign);
/* Initialize result appropriately depending on useOr */
result = BoolGetDatum(!useOr);
@@ -4111,7 +4113,7 @@ ExecEvalScalarArrayOp(ExprState *state, ExprEvalStep *op)
{
elt = fetch_att(s, typbyval, typlen);
s = att_addlength_pointer(s, typlen, s);
- s = (char *) att_align_nominal(s, typalign);
+ s = (char *) att_nominal_alignby(s, typalignby);
fcinfo->args[1].value = elt;
fcinfo->args[1].isnull = false;
}
@@ -4255,6 +4257,7 @@ ExecEvalHashedScalarArrayOp(ExprState *state, ExprEvalStep *op, ExprContext *eco
int16 typlen;
bool typbyval;
char typalign;
+ uint8 typalignby;
int nitems;
bool has_nulls = false;
char *s;
@@ -4272,6 +4275,7 @@ ExecEvalHashedScalarArrayOp(ExprState *state, ExprEvalStep *op, ExprContext *eco
&typlen,
&typbyval,
&typalign);
+ typalignby = typalign_to_alignby(typalign);
oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_query_memory);
@@ -4318,7 +4322,7 @@ ExecEvalHashedScalarArrayOp(ExprState *state, ExprEvalStep *op, ExprContext *eco
element = fetch_att(s, typbyval, typlen);
s = att_addlength_pointer(s, typlen, s);
- s = (char *) att_align_nominal(s, typalign);
+ s = (char *) att_nominal_alignby(s, typalignby);
saophash_insert(elements_tab->hashtab, element, &hashfound);
}
diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c
index 772e81f3154..f87978c137e 100644
--- a/src/backend/executor/execParallel.c
+++ b/src/backend/executor/execParallel.c
@@ -106,7 +106,7 @@ struct SharedExecutorInstrumentation
/* array of num_plan_nodes * num_workers Instrumentation objects follows */
};
#define GetInstrumentationArray(sei) \
- (AssertVariableIsOfTypeMacro(sei, SharedExecutorInstrumentation *), \
+ (StaticAssertVariableIsOfTypeMacro(sei, SharedExecutorInstrumentation *), \
(Instrumentation *) (((char *) sei) + sei->instrument_offset))
/* Context object for ExecParallelEstimate. */
diff --git a/src/backend/jit/llvm/llvmjit_types.c b/src/backend/jit/llvm/llvmjit_types.c
index 4636b90cd0f..c8a1f841293 100644
--- a/src/backend/jit/llvm/llvmjit_types.c
+++ b/src/backend/jit/llvm/llvmjit_types.c
@@ -81,7 +81,7 @@ extern Datum AttributeTemplate(PG_FUNCTION_ARGS);
Datum
AttributeTemplate(PG_FUNCTION_ARGS)
{
- AssertVariableIsOfType(&AttributeTemplate, PGFunction);
+ StaticAssertVariableIsOfType(&AttributeTemplate, PGFunction);
PG_RETURN_NULL();
}
@@ -99,8 +99,8 @@ ExecEvalSubroutineTemplate(ExprState *state,
struct ExprEvalStep *op,
ExprContext *econtext)
{
- AssertVariableIsOfType(&ExecEvalSubroutineTemplate,
- ExecEvalSubroutine);
+ StaticAssertVariableIsOfType(&ExecEvalSubroutineTemplate,
+ ExecEvalSubroutine);
}
extern bool ExecEvalBoolSubroutineTemplate(ExprState *state,
@@ -111,8 +111,8 @@ ExecEvalBoolSubroutineTemplate(ExprState *state,
struct ExprEvalStep *op,
ExprContext *econtext)
{
- AssertVariableIsOfType(&ExecEvalBoolSubroutineTemplate,
- ExecEvalBoolSubroutine);
+ StaticAssertVariableIsOfType(&ExecEvalBoolSubroutineTemplate,
+ ExecEvalBoolSubroutine);
return false;
}
diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c
index b4581e54d93..90275e25872 100644
--- a/src/backend/optimizer/path/allpaths.c
+++ b/src/backend/optimizer/path/allpaths.c
@@ -128,8 +128,10 @@ static Path *get_cheapest_parameterized_child_path(PlannerInfo *root,
Relids required_outer);
static void accumulate_append_subpath(Path *path,
List **subpaths,
- List **special_subpaths);
-static Path *get_singleton_append_subpath(Path *path);
+ List **special_subpaths,
+ List **child_append_relid_sets);
+static Path *get_singleton_append_subpath(Path *path,
+ List **child_append_relid_sets);
static void set_dummy_rel_pathlist(RelOptInfo *rel);
static void set_subquery_pathlist(PlannerInfo *root, RelOptInfo *rel,
Index rti, RangeTblEntry *rte);
@@ -1404,22 +1406,21 @@ void
add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel,
List *live_childrels)
{
- List *subpaths = NIL;
- bool subpaths_valid = true;
- List *startup_subpaths = NIL;
- bool startup_subpaths_valid = true;
- List *partial_subpaths = NIL;
- List *pa_partial_subpaths = NIL;
- List *pa_nonpartial_subpaths = NIL;
- bool partial_subpaths_valid = true;
- bool pa_subpaths_valid;
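+	/*
+	 * Each AppendPathInput below collects ordinary subpaths, partial
+	 * subpaths (for parallel append), and the relid sets of any child
+	 * Append/MergeAppend nodes flattened into those lists; see
+	 * accumulate_append_subpath().
+	 */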
+ AppendPathInput unparameterized = {0};
+ AppendPathInput startup = {0};
+ AppendPathInput partial_only = {0};
+ AppendPathInput parallel_append = {0};
+ bool unparameterized_valid = true;
+ bool startup_valid = true;
+ bool partial_only_valid = true;
+ bool parallel_append_valid = true;
List *all_child_pathkeys = NIL;
List *all_child_outers = NIL;
ListCell *l;
double partial_rows = -1;
/* If appropriate, consider parallel append */
- pa_subpaths_valid = enable_parallel_append && rel->consider_parallel;
+ parallel_append_valid = enable_parallel_append && rel->consider_parallel;
/*
* For every non-dummy child, remember the cheapest path. Also, identify
@@ -1443,9 +1444,9 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel,
if (childrel->pathlist != NIL &&
childrel->cheapest_total_path->param_info == NULL)
accumulate_append_subpath(childrel->cheapest_total_path,
- &subpaths, NULL);
+ &unparameterized.subpaths, NULL, &unparameterized.child_append_relid_sets);
else
- subpaths_valid = false;
+ unparameterized_valid = false;
/*
* When the planner is considering cheap startup plans, we'll also
@@ -1471,11 +1472,12 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel,
/* cheapest_startup_path must not be a parameterized path. */
Assert(cheapest_path->param_info == NULL);
accumulate_append_subpath(cheapest_path,
- &startup_subpaths,
- NULL);
+ &startup.subpaths,
+ NULL,
+ &startup.child_append_relid_sets);
}
else
- startup_subpaths_valid = false;
+ startup_valid = false;
/* Same idea, but for a partial plan. */
@@ -1483,16 +1485,17 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel,
{
cheapest_partial_path = linitial(childrel->partial_pathlist);
accumulate_append_subpath(cheapest_partial_path,
- &partial_subpaths, NULL);
+ &partial_only.partial_subpaths, NULL,
+ &partial_only.child_append_relid_sets);
}
else
- partial_subpaths_valid = false;
+ partial_only_valid = false;
/*
* Same idea, but for a parallel append mixing partial and non-partial
* paths.
*/
- if (pa_subpaths_valid)
+ if (parallel_append_valid)
{
Path *nppath = NULL;
@@ -1502,7 +1505,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel,
if (cheapest_partial_path == NULL && nppath == NULL)
{
/* Neither a partial nor a parallel-safe path? Forget it. */
- pa_subpaths_valid = false;
+ parallel_append_valid = false;
}
else if (nppath == NULL ||
(cheapest_partial_path != NULL &&
@@ -1511,8 +1514,9 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel,
/* Partial path is cheaper or the only option. */
Assert(cheapest_partial_path != NULL);
accumulate_append_subpath(cheapest_partial_path,
- &pa_partial_subpaths,
- &pa_nonpartial_subpaths);
+ &parallel_append.partial_subpaths,
+ &parallel_append.subpaths,
+ &parallel_append.child_append_relid_sets);
}
else
{
@@ -1530,8 +1534,9 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel,
* figure that out.
*/
accumulate_append_subpath(nppath,
- &pa_nonpartial_subpaths,
- NULL);
+ &parallel_append.subpaths,
+ NULL,
+ &parallel_append.child_append_relid_sets);
}
}
@@ -1605,28 +1610,28 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel,
* unparameterized Append path for the rel. (Note: this is correct even
* if we have zero or one live subpath due to constraint exclusion.)
*/
- if (subpaths_valid)
- add_path(rel, (Path *) create_append_path(root, rel, subpaths, NIL,
+ if (unparameterized_valid)
+ add_path(rel, (Path *) create_append_path(root, rel, unparameterized,
NIL, NULL, 0, false,
-1));
/* build an AppendPath for the cheap startup paths, if valid */
- if (startup_subpaths_valid)
- add_path(rel, (Path *) create_append_path(root, rel, startup_subpaths,
- NIL, NIL, NULL, 0, false, -1));
+ if (startup_valid)
+ add_path(rel, (Path *) create_append_path(root, rel, startup,
+ NIL, NULL, 0, false, -1));
/*
* Consider an append of unordered, unparameterized partial paths. Make
* it parallel-aware if possible.
*/
- if (partial_subpaths_valid && partial_subpaths != NIL)
+ if (partial_only_valid && partial_only.partial_subpaths != NIL)
{
AppendPath *appendpath;
ListCell *lc;
int parallel_workers = 0;
/* Find the highest number of workers requested for any subpath. */
- foreach(lc, partial_subpaths)
+ foreach(lc, partial_only.partial_subpaths)
{
Path *path = lfirst(lc);
@@ -1653,7 +1658,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel,
Assert(parallel_workers > 0);
/* Generate a partial append path. */
- appendpath = create_append_path(root, rel, NIL, partial_subpaths,
+ appendpath = create_append_path(root, rel, partial_only,
NIL, NULL, parallel_workers,
enable_parallel_append,
-1);
@@ -1674,7 +1679,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel,
* a non-partial path that is substantially cheaper than any partial path;
* otherwise, we should use the append path added in the previous step.)
*/
- if (pa_subpaths_valid && pa_nonpartial_subpaths != NIL)
+ if (parallel_append_valid && parallel_append.subpaths != NIL)
{
AppendPath *appendpath;
ListCell *lc;
@@ -1684,7 +1689,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel,
* Find the highest number of workers requested for any partial
* subpath.
*/
- foreach(lc, pa_partial_subpaths)
+ foreach(lc, parallel_append.partial_subpaths)
{
Path *path = lfirst(lc);
@@ -1702,8 +1707,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel,
max_parallel_workers_per_gather);
Assert(parallel_workers > 0);
- appendpath = create_append_path(root, rel, pa_nonpartial_subpaths,
- pa_partial_subpaths,
+ appendpath = create_append_path(root, rel, parallel_append,
NIL, NULL, parallel_workers, true,
partial_rows);
add_partial_path(rel, (Path *) appendpath);
@@ -1713,7 +1717,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel,
* Also build unparameterized ordered append paths based on the collected
* list of child pathkeys.
*/
- if (subpaths_valid)
+ if (unparameterized_valid)
generate_orderedappend_paths(root, rel, live_childrels,
all_child_pathkeys);
@@ -1734,10 +1738,10 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel,
{
Relids required_outer = (Relids) lfirst(l);
ListCell *lcr;
+ AppendPathInput parameterized = {0};
+ bool parameterized_valid = true;
/* Select the child paths for an Append with this parameterization */
- subpaths = NIL;
- subpaths_valid = true;
foreach(lcr, live_childrels)
{
RelOptInfo *childrel = (RelOptInfo *) lfirst(lcr);
@@ -1746,7 +1750,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel,
if (childrel->pathlist == NIL)
{
/* failed to make a suitable path for this child */
- subpaths_valid = false;
+ parameterized_valid = false;
break;
}
@@ -1756,15 +1760,16 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel,
if (subpath == NULL)
{
/* failed to make a suitable path for this child */
- subpaths_valid = false;
+ parameterized_valid = false;
break;
}
- accumulate_append_subpath(subpath, &subpaths, NULL);
+ accumulate_append_subpath(subpath, &parameterized.subpaths, NULL,
+ &parameterized.child_append_relid_sets);
}
- if (subpaths_valid)
+ if (parameterized_valid)
add_path(rel, (Path *)
- create_append_path(root, rel, subpaths, NIL,
+ create_append_path(root, rel, parameterized,
NIL, required_outer, 0, false,
-1));
}
@@ -1785,13 +1790,14 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel,
{
Path *path = (Path *) lfirst(l);
AppendPath *appendpath;
+ AppendPathInput append = {0};
/* skip paths with no pathkeys. */
if (path->pathkeys == NIL)
continue;
- appendpath = create_append_path(root, rel, NIL, list_make1(path),
- NIL, NULL,
+ append.partial_subpaths = list_make1(path);
+ appendpath = create_append_path(root, rel, append, NIL, NULL,
path->parallel_workers, true,
partial_rows);
add_partial_path(rel, (Path *) appendpath);
@@ -1873,9 +1879,9 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel,
foreach(lcp, all_child_pathkeys)
{
List *pathkeys = (List *) lfirst(lcp);
- List *startup_subpaths = NIL;
- List *total_subpaths = NIL;
- List *fractional_subpaths = NIL;
+ AppendPathInput startup = {0};
+ AppendPathInput total = {0};
+ AppendPathInput fractional = {0};
bool startup_neq_total = false;
bool fraction_neq_total = false;
bool match_partition_order;
@@ -2038,16 +2044,23 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel,
* just a single subpath (and hence aren't doing anything
* useful).
*/
- cheapest_startup = get_singleton_append_subpath(cheapest_startup);
- cheapest_total = get_singleton_append_subpath(cheapest_total);
+ cheapest_startup =
+ get_singleton_append_subpath(cheapest_startup,
+ &startup.child_append_relid_sets);
+ cheapest_total =
+ get_singleton_append_subpath(cheapest_total,
+ &total.child_append_relid_sets);
- startup_subpaths = lappend(startup_subpaths, cheapest_startup);
- total_subpaths = lappend(total_subpaths, cheapest_total);
+ startup.subpaths = lappend(startup.subpaths, cheapest_startup);
+ total.subpaths = lappend(total.subpaths, cheapest_total);
if (cheapest_fractional)
{
- cheapest_fractional = get_singleton_append_subpath(cheapest_fractional);
- fractional_subpaths = lappend(fractional_subpaths, cheapest_fractional);
+ cheapest_fractional =
+ get_singleton_append_subpath(cheapest_fractional,
+ &fractional.child_append_relid_sets);
+ fractional.subpaths =
+ lappend(fractional.subpaths, cheapest_fractional);
}
}
else
@@ -2057,13 +2070,16 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel,
* child paths for the MergeAppend.
*/
accumulate_append_subpath(cheapest_startup,
- &startup_subpaths, NULL);
+ &startup.subpaths, NULL,
+ &startup.child_append_relid_sets);
accumulate_append_subpath(cheapest_total,
- &total_subpaths, NULL);
+ &total.subpaths, NULL,
+ &total.child_append_relid_sets);
if (cheapest_fractional)
accumulate_append_subpath(cheapest_fractional,
- &fractional_subpaths, NULL);
+ &fractional.subpaths, NULL,
+ &fractional.child_append_relid_sets);
}
}
@@ -2073,8 +2089,7 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel,
/* We only need Append */
add_path(rel, (Path *) create_append_path(root,
rel,
- startup_subpaths,
- NIL,
+ startup,
pathkeys,
NULL,
0,
@@ -2083,19 +2098,17 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel,
if (startup_neq_total)
add_path(rel, (Path *) create_append_path(root,
rel,
- total_subpaths,
- NIL,
+ total,
pathkeys,
NULL,
0,
false,
-1));
- if (fractional_subpaths && fraction_neq_total)
+ if (fractional.subpaths && fraction_neq_total)
add_path(rel, (Path *) create_append_path(root,
rel,
- fractional_subpaths,
- NIL,
+ fractional,
pathkeys,
NULL,
0,
@@ -2107,20 +2120,23 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel,
/* We need MergeAppend */
add_path(rel, (Path *) create_merge_append_path(root,
rel,
- startup_subpaths,
+ startup.subpaths,
+ startup.child_append_relid_sets,
pathkeys,
NULL));
if (startup_neq_total)
add_path(rel, (Path *) create_merge_append_path(root,
rel,
- total_subpaths,
+ total.subpaths,
+ total.child_append_relid_sets,
pathkeys,
NULL));
- if (fractional_subpaths && fraction_neq_total)
+ if (fractional.subpaths && fraction_neq_total)
add_path(rel, (Path *) create_merge_append_path(root,
rel,
- fractional_subpaths,
+ fractional.subpaths,
+ fractional.child_append_relid_sets,
pathkeys,
NULL));
}
@@ -2223,7 +2239,8 @@ get_cheapest_parameterized_child_path(PlannerInfo *root, RelOptInfo *rel,
* paths).
*/
static void
-accumulate_append_subpath(Path *path, List **subpaths, List **special_subpaths)
+accumulate_append_subpath(Path *path, List **subpaths, List **special_subpaths,
+ List **child_append_relid_sets)
{
if (IsA(path, AppendPath))
{
@@ -2232,6 +2249,11 @@ accumulate_append_subpath(Path *path, List **subpaths, List **special_subpaths)
if (!apath->path.parallel_aware || apath->first_partial_path == 0)
{
*subpaths = list_concat(*subpaths, apath->subpaths);
+ *child_append_relid_sets =
+ lappend(*child_append_relid_sets, path->parent->relids);
+ *child_append_relid_sets =
+ list_concat(*child_append_relid_sets,
+ apath->child_append_relid_sets);
return;
}
else if (special_subpaths != NULL)
@@ -2246,6 +2268,11 @@ accumulate_append_subpath(Path *path, List **subpaths, List **special_subpaths)
apath->first_partial_path);
*special_subpaths = list_concat(*special_subpaths,
new_special_subpaths);
+ *child_append_relid_sets =
+ lappend(*child_append_relid_sets, path->parent->relids);
+ *child_append_relid_sets =
+ list_concat(*child_append_relid_sets,
+ apath->child_append_relid_sets);
return;
}
}
@@ -2254,6 +2281,11 @@ accumulate_append_subpath(Path *path, List **subpaths, List **special_subpaths)
MergeAppendPath *mpath = (MergeAppendPath *) path;
*subpaths = list_concat(*subpaths, mpath->subpaths);
+ *child_append_relid_sets =
+ lappend(*child_append_relid_sets, path->parent->relids);
+ *child_append_relid_sets =
+ list_concat(*child_append_relid_sets,
+ mpath->child_append_relid_sets);
return;
}
@@ -2265,10 +2297,15 @@ accumulate_append_subpath(Path *path, List **subpaths, List **special_subpaths)
* Returns the single subpath of an Append/MergeAppend, or just
* return 'path' if it's not a single sub-path Append/MergeAppend.
*
+ * As a side effect, whenever we return a single subpath rather than the
+ * original path, add the relid sets for the original path to
+ * child_append_relid_sets, so that those relids don't entirely disappear
+ * from the final plan.
+ *
* Note: 'path' must not be a parallel-aware path.
*/
static Path *
-get_singleton_append_subpath(Path *path)
+get_singleton_append_subpath(Path *path, List **child_append_relid_sets)
{
Assert(!path->parallel_aware);
@@ -2277,14 +2314,28 @@ get_singleton_append_subpath(Path *path)
AppendPath *apath = (AppendPath *) path;
if (list_length(apath->subpaths) == 1)
+ {
+ *child_append_relid_sets =
+ lappend(*child_append_relid_sets, path->parent->relids);
+ *child_append_relid_sets =
+ list_concat(*child_append_relid_sets,
+ apath->child_append_relid_sets);
return (Path *) linitial(apath->subpaths);
+ }
}
else if (IsA(path, MergeAppendPath))
{
MergeAppendPath *mpath = (MergeAppendPath *) path;
if (list_length(mpath->subpaths) == 1)
+ {
+ *child_append_relid_sets =
+ lappend(*child_append_relid_sets, path->parent->relids);
+ *child_append_relid_sets =
+ list_concat(*child_append_relid_sets,
+ mpath->child_append_relid_sets);
return (Path *) linitial(mpath->subpaths);
+ }
}
return path;
@@ -2304,6 +2355,8 @@ get_singleton_append_subpath(Path *path)
static void
set_dummy_rel_pathlist(RelOptInfo *rel)
{
+ AppendPathInput in = {0};
+
/* Set dummy size estimates --- we leave attr_widths[] as zeroes */
rel->rows = 0;
rel->reltarget->width = 0;
@@ -2313,7 +2366,7 @@ set_dummy_rel_pathlist(RelOptInfo *rel)
rel->partial_pathlist = NIL;
/* Set up the dummy path */
- add_path(rel, (Path *) create_append_path(NULL, rel, NIL, NIL,
+ add_path(rel, (Path *) create_append_path(NULL, rel, in,
NIL, rel->lateral_relids,
0, false, -1));
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 4da0b17f137..89ca4e08bf1 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -1461,7 +1461,6 @@ cost_tidrangescan(Path *path, PlannerInfo *root,
enable_mask |= PGS_CONSIDER_NONPARTIAL;
path->disabled_nodes =
(baserel->pgs_mask & enable_mask) != enable_mask ? 1 : 0;
- path->disabled_nodes = 0;
path->startup_cost = startup_cost;
path->total_cost = startup_cost + cpu_run_cost + disk_run_cost;
}
@@ -2590,11 +2589,6 @@ cost_material(Path *path,
double nbytes = relation_byte_size(tuples, width);
double work_mem_bytes = work_mem * (Size) 1024;
- if (path->parallel_workers == 0 &&
- path->parent != NULL &&
- (path->parent->pgs_mask & PGS_CONSIDER_NONPARTIAL) == 0)
- enabled = false;
-
path->rows = tuples;
/*
diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c
index 1e4246b49d5..044560da7bf 100644
--- a/src/backend/optimizer/path/joinpath.c
+++ b/src/backend/optimizer/path/joinpath.c
@@ -1048,6 +1048,7 @@ try_partial_nestloop_path(PlannerInfo *root,
initial_cost_nestloop(root, &workspace, jointype, nestloop_subtype,
outer_path, inner_path, extra);
if (!add_partial_path_precheck(joinrel, workspace.disabled_nodes,
+ workspace.startup_cost,
workspace.total_cost, pathkeys))
return;
@@ -1237,6 +1238,7 @@ try_partial_mergejoin_path(PlannerInfo *root,
extra);
if (!add_partial_path_precheck(joinrel, workspace.disabled_nodes,
+ workspace.startup_cost,
workspace.total_cost, pathkeys))
return;
@@ -1369,6 +1371,7 @@ try_partial_hashjoin_path(PlannerInfo *root,
initial_cost_hashjoin(root, &workspace, jointype, hashclauses,
outer_path, inner_path, extra, parallel_hash);
if (!add_partial_path_precheck(joinrel, workspace.disabled_nodes,
+ workspace.startup_cost,
workspace.total_cost, NIL))
return;
@@ -1895,8 +1898,17 @@ match_unsorted_outer(PlannerInfo *root,
/*
* Consider materializing the cheapest inner path, unless that is
* disabled or the path in question materializes its output anyway.
+ *
+ * At present, we only consider materialization for non-partial outer
+ * paths, so it's correct to test PGS_CONSIDER_NONPARTIAL here. If we
+ * ever want to consider materialization for partial paths, we'll need
+ * to create matpath whenever PGS_NESTLOOP_MATERIALIZE is set, use it
+ * for partial paths either way, and use it for non-partial paths only
+ * when PGS_CONSIDER_NONPARTIAL is also set.
*/
- if ((extra->pgs_mask & PGS_NESTLOOP_MATERIALIZE) != 0 &&
+ if ((extra->pgs_mask &
+ (PGS_NESTLOOP_MATERIALIZE | PGS_CONSIDER_NONPARTIAL)) ==
+ (PGS_NESTLOOP_MATERIALIZE | PGS_CONSIDER_NONPARTIAL) &&
inner_cheapest_total != NULL &&
!ExecMaterializesOutput(inner_cheapest_total->pathtype))
matpath = (Path *)
diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c
index 2615651c073..443e2dca7c0 100644
--- a/src/backend/optimizer/path/joinrels.c
+++ b/src/backend/optimizer/path/joinrels.c
@@ -1513,6 +1513,7 @@ void
mark_dummy_rel(RelOptInfo *rel)
{
MemoryContext oldcontext;
+ AppendPathInput in = {0};
/* Already marked? */
if (is_dummy_rel(rel))
@@ -1529,7 +1530,7 @@ mark_dummy_rel(RelOptInfo *rel)
rel->partial_pathlist = NIL;
/* Set up the dummy path */
- add_path(rel, (Path *) create_append_path(NULL, rel, NIL, NIL,
+ add_path(rel, (Path *) create_append_path(NULL, rel, in,
NIL, rel->lateral_relids,
0, false, -1));
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index c26e841f537..959df43c39e 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -1263,6 +1263,7 @@ create_append_plan(PlannerInfo *root, AppendPath *best_path, int flags)
plan->plan.lefttree = NULL;
plan->plan.righttree = NULL;
plan->apprelids = rel->relids;
+ plan->child_append_relid_sets = best_path->child_append_relid_sets;
if (pathkeys != NIL)
{
@@ -1475,6 +1476,7 @@ create_merge_append_plan(PlannerInfo *root, MergeAppendPath *best_path,
plan->lefttree = NULL;
plan->righttree = NULL;
node->apprelids = rel->relids;
+ node->child_append_relid_sets = best_path->child_append_relid_sets;
/*
* Compute sort column info, and adjust MergeAppend's tlist as needed.
@@ -6527,7 +6529,6 @@ materialize_finished_plan(Plan *subplan)
subplan->total_cost -= initplan_cost;
/* Set cost data */
- matpath.parent = NULL;
cost_material(&matpath,
enable_material,
subplan->disabled_nodes,
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 757bdc7b1de..006b3281969 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -511,7 +511,8 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions,
/* Allow plugins to take control after we've initialized "glob" */
if (planner_setup_hook)
- (*planner_setup_hook) (glob, parse, query_string, &tuple_fraction, es);
+ (*planner_setup_hook) (glob, parse, query_string, cursorOptions,
+ &tuple_fraction, es);
/* primary planning entry point (may recurse for subqueries) */
root = subquery_planner(glob, parse, NULL, NULL, false, tuple_fraction,
@@ -654,6 +655,7 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions,
result->unprunableRelids = bms_difference(glob->allRelids,
glob->prunableRelids);
result->permInfos = glob->finalrteperminfos;
+ result->subrtinfos = glob->subrtinfos;
result->resultRelations = glob->resultRelations;
result->appendRelations = glob->appendRelations;
result->subplans = glob->subplans;
@@ -664,6 +666,7 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions,
result->paramExecTypes = glob->paramExecTypes;
/* utilityStmt should be null, but we might as well copy it */
result->utilityStmt = parse->utilityStmt;
+ result->elidedNodes = glob->elidedNodes;
result->stmt_location = parse->stmt_location;
result->stmt_len = parse->stmt_len;
@@ -4060,7 +4063,7 @@ create_degenerate_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel,
* might get between 0 and N output rows. Offhand I think that's
* desired.)
*/
- List *paths = NIL;
+ AppendPathInput append = {0};
while (--nrows >= 0)
{
@@ -4068,13 +4071,12 @@ create_degenerate_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel,
create_group_result_path(root, grouped_rel,
grouped_rel->reltarget,
(List *) parse->havingQual);
- paths = lappend(paths, path);
+ append.subpaths = lappend(append.subpaths, path);
}
path = (Path *)
create_append_path(root,
grouped_rel,
- paths,
- NIL,
+ append,
NIL,
NULL,
0,
diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c
index 16d200cfb46..5ad6c13830b 100644
--- a/src/backend/optimizer/plan/setrefs.c
+++ b/src/backend/optimizer/plan/setrefs.c
@@ -211,6 +211,9 @@ static List *set_windowagg_runcondition_references(PlannerInfo *root,
List *runcondition,
Plan *plan);
+static void record_elided_node(PlannerGlobal *glob, int plan_node_id,
+ NodeTag elided_type, Bitmapset *relids);
+
/*****************************************************************************
*
@@ -399,6 +402,26 @@ add_rtes_to_flat_rtable(PlannerInfo *root, bool recursing)
Index rti;
ListCell *lc;
+ /*
+ * Record enough information to make it possible for code that looks at
+ * the final range table to understand how it was constructed. (If
+ * finalrtable is still NIL, then this is the very topmost PlannerInfo,
+ * which will always have plan_name == NULL and rtoffset == 0; we omit the
+ * degenerate list entry.)
+ */
+ if (root->glob->finalrtable != NIL)
+ {
+ SubPlanRTInfo *rtinfo = makeNode(SubPlanRTInfo);
+
+ rtinfo->plan_name = root->plan_name;
+ rtinfo->rtoffset = list_length(root->glob->finalrtable);
+
+ /* When recursing = true, it's an unplanned or dummy subquery. */
+ rtinfo->dummy = recursing;
+
+ root->glob->subrtinfos = lappend(root->glob->subrtinfos, rtinfo);
+ }
+
/*
* Add the query's own RTEs to the flattened rangetable.
*
@@ -1440,10 +1463,17 @@ set_subqueryscan_references(PlannerInfo *root,
if (trivial_subqueryscan(plan))
{
+ Index scanrelid;
+
/*
* We can omit the SubqueryScan node and just pull up the subplan.
*/
result = clean_up_removed_plan_level((Plan *) plan, plan->subplan);
+
+ /* Remember that we removed a SubqueryScan */
+ scanrelid = plan->scan.scanrelid + rtoffset;
+ record_elided_node(root->glob, plan->subplan->plan_node_id,
+ T_SubqueryScan, bms_make_singleton(scanrelid));
}
else
{
@@ -1871,7 +1901,17 @@ set_append_references(PlannerInfo *root,
Plan *p = (Plan *) linitial(aplan->appendplans);
if (p->parallel_aware == aplan->plan.parallel_aware)
- return clean_up_removed_plan_level((Plan *) aplan, p);
+ {
+ Plan *result;
+
+ result = clean_up_removed_plan_level((Plan *) aplan, p);
+
+ /* Remember that we removed an Append */
+ record_elided_node(root->glob, p->plan_node_id, T_Append,
+ offset_relid_set(aplan->apprelids, rtoffset));
+
+ return result;
+ }
}
/*
@@ -1939,7 +1979,17 @@ set_mergeappend_references(PlannerInfo *root,
Plan *p = (Plan *) linitial(mplan->mergeplans);
if (p->parallel_aware == mplan->plan.parallel_aware)
- return clean_up_removed_plan_level((Plan *) mplan, p);
+ {
+ Plan *result;
+
+ result = clean_up_removed_plan_level((Plan *) mplan, p);
+
+ /* Remember that we removed a MergeAppend */
+ record_elided_node(root->glob, p->plan_node_id, T_MergeAppend,
+ offset_relid_set(mplan->apprelids, rtoffset));
+
+ return result;
+ }
}
/*
@@ -3754,3 +3804,21 @@ extract_query_dependencies_walker(Node *node, PlannerInfo *context)
return expression_tree_walker(node, extract_query_dependencies_walker,
context);
}
+
+/*
+ * Record some details about a node removed from the plan during setrefs
+ * processing, for the benefit of code trying to reconstruct planner decisions
+ * from examination of the final plan tree.
+ */
+static void
+record_elided_node(PlannerGlobal *glob, int plan_node_id,
+ NodeTag elided_type, Bitmapset *relids)
+{
+ ElidedNode *n = makeNode(ElidedNode);
+
+ n->plan_node_id = plan_node_id;
+ n->elided_type = elided_type;
+ n->relids = relids;
+
+ glob->elidedNodes = lappend(glob->elidedNodes, n);
+}
diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c
index 78c95c36dd5..f50c296e3d9 100644
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@@ -696,9 +696,9 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
ListCell *lc;
ListCell *lc2;
ListCell *lc3;
- List *cheapest_pathlist = NIL;
- List *ordered_pathlist = NIL;
- List *partial_pathlist = NIL;
+ AppendPathInput cheapest = {0};
+ AppendPathInput ordered = {0};
+ AppendPathInput partial = {0};
bool partial_paths_valid = true;
bool consider_parallel = true;
List *rellist;
@@ -783,7 +783,7 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
if (is_dummy_rel(rel))
continue;
- cheapest_pathlist = lappend(cheapest_pathlist,
+ cheapest.subpaths = lappend(cheapest.subpaths,
rel->cheapest_total_path);
if (try_sorted)
@@ -795,7 +795,7 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
false);
if (ordered_path != NULL)
- ordered_pathlist = lappend(ordered_pathlist, ordered_path);
+ ordered.subpaths = lappend(ordered.subpaths, ordered_path);
else
{
/*
@@ -818,20 +818,20 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
else if (rel->partial_pathlist == NIL)
partial_paths_valid = false;
else
- partial_pathlist = lappend(partial_pathlist,
- linitial(rel->partial_pathlist));
+ partial.partial_subpaths = lappend(partial.partial_subpaths,
+ linitial(rel->partial_pathlist));
}
}
/* Build result relation. */
result_rel = fetch_upper_rel(root, UPPERREL_SETOP, relids);
result_rel->reltarget = create_setop_pathtarget(root, tlist,
- cheapest_pathlist);
+ cheapest.subpaths);
result_rel->consider_parallel = consider_parallel;
result_rel->consider_startup = (root->tuple_fraction > 0);
/* If all UNION children were dummy rels, make the resulting rel dummy */
- if (cheapest_pathlist == NIL)
+ if (cheapest.subpaths == NIL)
{
mark_dummy_rel(result_rel);
@@ -842,8 +842,8 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
* Append the child results together using the cheapest paths from each
* union child.
*/
- apath = (Path *) create_append_path(root, result_rel, cheapest_pathlist,
- NIL, NIL, NULL, 0, false, -1);
+ apath = (Path *) create_append_path(root, result_rel, cheapest,
+ NIL, NULL, 0, false, -1);
/*
* Estimate number of groups. For now we just assume the output is unique
@@ -862,7 +862,7 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
int parallel_workers = 0;
/* Find the highest number of workers requested for any subpath. */
- foreach(lc, partial_pathlist)
+ foreach(lc, partial.partial_subpaths)
{
Path *subpath = lfirst(lc);
@@ -881,14 +881,14 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
if (enable_parallel_append)
{
parallel_workers = Max(parallel_workers,
- pg_leftmost_one_pos32(list_length(partial_pathlist)) + 1);
+ pg_leftmost_one_pos32(list_length(partial.partial_subpaths)) + 1);
parallel_workers = Min(parallel_workers,
max_parallel_workers_per_gather);
}
Assert(parallel_workers > 0);
papath = (Path *)
- create_append_path(root, result_rel, NIL, partial_pathlist,
+ create_append_path(root, result_rel, partial,
NIL, NULL, parallel_workers,
enable_parallel_append, -1);
gpath = (Path *)
@@ -901,7 +901,7 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
double dNumGroups;
bool can_sort = grouping_is_sortable(groupList);
bool can_hash = grouping_is_hashable(groupList);
- Path *first_path = linitial(cheapest_pathlist);
+ Path *first_path = linitial(cheapest.subpaths);
/*
* Estimate the number of UNION output rows. In the case when only a
@@ -911,7 +911,7 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
* contain Vars with varno==0, which estimate_num_groups() wouldn't
* like.
*/
- if (list_length(cheapest_pathlist) == 1 &&
+ if (list_length(cheapest.subpaths) == 1 &&
first_path->parent->reloptkind != RELOPT_UPPER_REL)
{
dNumGroups = estimate_num_groups(root,
@@ -1017,7 +1017,8 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
path = (Path *) create_merge_append_path(root,
result_rel,
- ordered_pathlist,
+ ordered.subpaths,
+ NIL,
union_pathkeys,
NULL);
@@ -1216,6 +1217,9 @@ generate_nonunion_paths(SetOperationStmt *op, PlannerInfo *root,
if (op->all)
{
Path *apath;
+ AppendPathInput append = {0};
+
+ append.subpaths = list_make1(lpath);
/*
* EXCEPT ALL: If the right-hand input is dummy then we can
@@ -1224,8 +1228,9 @@ generate_nonunion_paths(SetOperationStmt *op, PlannerInfo *root,
* between the set op targetlist and the targetlist of the
* left input. The Append will be removed in setrefs.c.
*/
- apath = (Path *) create_append_path(root, result_rel, list_make1(lpath),
- NIL, NIL, NULL, 0, false, -1);
+ apath = (Path *) create_append_path(root, result_rel,
+ append, NIL, NULL, 0,
+ false, -1);
add_path(result_rel, apath);
diff --git a/src/backend/optimizer/util/clauses.c b/src/backend/optimizer/util/clauses.c
index 32204776c45..504a30d8836 100644
--- a/src/backend/optimizer/util/clauses.c
+++ b/src/backend/optimizer/util/clauses.c
@@ -2705,6 +2705,7 @@ eval_const_expressions_mutator(Node *node,
bool has_null_input = false;
bool all_null_input = true;
bool has_nonconst_input = false;
+ bool has_nullable_nonconst = false;
Expr *simple;
DistinctExpr *newexpr;
@@ -2721,7 +2722,8 @@ eval_const_expressions_mutator(Node *node,
/*
* We must do our own check for NULLs because DistinctExpr has
* different results for NULL input than the underlying
- * operator does.
+ * operator does. We also check if any non-constant input is
+ * potentially nullable.
*/
foreach(arg, args)
{
@@ -2731,12 +2733,24 @@ eval_const_expressions_mutator(Node *node,
all_null_input &= ((Const *) lfirst(arg))->constisnull;
}
else
+ {
has_nonconst_input = true;
+ all_null_input = false;
+
+ if (!has_nullable_nonconst &&
+ !expr_is_nonnullable(context->root,
+ (Expr *) lfirst(arg), false))
+ has_nullable_nonconst = true;
+ }
}
- /* all constants? then can optimize this out */
if (!has_nonconst_input)
{
+ /*
+ * All inputs are constants. We can optimize this out
+ * completely.
+ */
+
/* all nulls? then not distinct */
if (all_null_input)
return makeBoolConst(false, false);
@@ -2781,6 +2795,72 @@ eval_const_expressions_mutator(Node *node,
return (Node *) csimple;
}
}
+ else if (!has_nullable_nonconst)
+ {
+ /*
+ * There are non-constant inputs, but since all of them
+ * are proven non-nullable, "IS DISTINCT FROM" semantics
+ * are much simpler.
+ */
+
+ OpExpr *eqexpr;
+
+ /*
+ * If one input is an explicit NULL constant, and the
+ * other is a non-nullable expression, the result is
+ * always TRUE.
+ */
+ if (has_null_input)
+ return makeBoolConst(true, false);
+
+ /*
+ * Otherwise, both inputs are known non-nullable. In this
+ * case, "IS DISTINCT FROM" is equivalent to the standard
+ * inequality operator (usually "<>"). We convert this to
+ * an OpExpr, which is a more efficient representation for
+ * the planner. It can enable the use of partial indexes
+ * and constraint exclusion. Furthermore, if the clause
+ * is negated (ie, "IS NOT DISTINCT FROM"), the resulting
+ * "=" operator can allow the planner to use index scans,
+ * merge joins, hash joins, and EC-based qual deductions.
+ */
+ eqexpr = makeNode(OpExpr);
+ eqexpr->opno = expr->opno;
+ eqexpr->opfuncid = expr->opfuncid;
+ eqexpr->opresulttype = BOOLOID;
+ eqexpr->opretset = expr->opretset;
+ eqexpr->opcollid = expr->opcollid;
+ eqexpr->inputcollid = expr->inputcollid;
+ eqexpr->args = args;
+ eqexpr->location = expr->location;
+
+ return eval_const_expressions_mutator(negate_clause((Node *) eqexpr),
+ context);
+ }
+ else if (has_null_input)
+ {
+ /*
+ * One input is a nullable non-constant expression, and
+ * the other is an explicit NULL constant. We can
+ * transform this to a NullTest with !argisrow, which is
+ * much more amenable to optimization.
+ */
+
+ NullTest *nt = makeNode(NullTest);
+
+ nt->arg = (Expr *) (IsA(linitial(args), Const) ?
+ lsecond(args) : linitial(args));
+ nt->nulltesttype = IS_NOT_NULL;
+
+ /*
+ * argisrow = false is correct whether or not arg is
+ * composite
+ */
+ nt->argisrow = false;
+ nt->location = expr->location;
+
+ return eval_const_expressions_mutator((Node *) nt, context);
+ }
/*
* The expression cannot be simplified any further, so build
@@ -3630,6 +3710,9 @@ eval_const_expressions_mutator(Node *node,
context);
if (arg && IsA(arg, Const))
{
+ /*
+ * If arg is Const, simplify to constant.
+ */
Const *carg = (Const *) arg;
bool result;
@@ -3666,6 +3749,34 @@ eval_const_expressions_mutator(Node *node,
return makeBoolConst(result, false);
}
+ if (arg && expr_is_nonnullable(context->root, (Expr *) arg, false))
+ {
+ /*
+ * If arg is proven non-nullable, simplify to a boolean
+ * expression or constant.
+ */
+ switch (btest->booltesttype)
+ {
+ case IS_TRUE:
+ case IS_NOT_FALSE:
+ return arg;
+
+ case IS_FALSE:
+ case IS_NOT_TRUE:
+ return (Node *) make_notclause((Expr *) arg);
+
+ case IS_UNKNOWN:
+ return makeBoolConst(false, false);
+
+ case IS_NOT_UNKNOWN:
+ return makeBoolConst(true, false);
+
+ default:
+ elog(ERROR, "unrecognized booltesttype: %d",
+ (int) btest->booltesttype);
+ break;
+ }
+ }
newbtest = makeNode(BooleanTest);
newbtest->arg = (Expr *) arg;
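
Taken together, the clauses.c changes above amount to a small set of rewrite
rules that fire once an input is proven non-nullable. Below is a standalone
model of the BooleanTest rule (plain C, not backend code); the enum and
function names are illustrative stand-ins, and the real code rewrites
expression trees rather than evaluating booleans:

    #include <stdio.h>
    #include <stdbool.h>

    /* Models the BooleanTest simplification added above for an argument
     * proven non-nullable: the three-valued test collapses to two-valued
     * logic.  Illustrative only. */
    typedef enum { IS_TRUE, IS_NOT_FALSE, IS_FALSE, IS_NOT_TRUE,
                   IS_UNKNOWN, IS_NOT_UNKNOWN } BoolTestType;

    static bool
    simplified_booltest(BoolTestType t, bool arg)
    {
        switch (t)
        {
            case IS_TRUE:
            case IS_NOT_FALSE:
                return arg;         /* arg passes through unchanged */
            case IS_FALSE:
            case IS_NOT_TRUE:
                return !arg;        /* becomes NOT(arg) */
            case IS_UNKNOWN:
                return false;       /* a non-nullable arg is never NULL */
            case IS_NOT_UNKNOWN:
                return true;
        }
        return false;               /* unreachable */
    }

    int
    main(void)
    {
        /* With a non-nullable argument, x IS UNKNOWN is constant false. */
        printf("%d %d\n", simplified_booltest(IS_UNKNOWN, true),
               simplified_booltest(IS_NOT_TRUE, false));   /* prints 0 1 */
        return 0;
    }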
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index 7295438ad20..d61f328707f 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -777,10 +777,9 @@ add_path_precheck(RelOptInfo *parent_rel, int disabled_nodes,
*
* Because we don't consider parameterized paths here, we also don't
* need to consider the row counts as a measure of quality: every path will
- * produce the same number of rows. Neither do we need to consider startup
- * costs: parallelism is only used for plans that will be run to completion.
- * Therefore, this routine is much simpler than add_path: it needs to
- * consider only disabled nodes, pathkeys and total cost.
+ * produce the same number of rows. However, we do need to consider the
+ * startup costs: this partial path could be used beneath a Limit node,
+ * so a fast-start plan could be the better choice.
*
* As with add_path, we pfree paths that are found to be dominated by
* another partial path; this requires that there be no other references to
@@ -818,52 +817,36 @@ add_partial_path(RelOptInfo *parent_rel, Path *new_path)
/* Compare pathkeys. */
keyscmp = compare_pathkeys(new_path->pathkeys, old_path->pathkeys);
- /* Unless pathkeys are incompatible, keep just one of the two paths. */
+ /*
+ * Unless pathkeys are incompatible, see if one of the paths dominates
+ * the other (both in startup and total cost). It may happen that one
+ * path has a lower startup cost while the other has a lower total
+ * cost, in which case neither dominates the other.
+ */
if (keyscmp != PATHKEYS_DIFFERENT)
{
- if (unlikely(new_path->disabled_nodes != old_path->disabled_nodes))
+ PathCostComparison costcmp;
+
+ /*
+ * Do a fuzzy cost comparison with standard fuzziness limit.
+ */
+ costcmp = compare_path_costs_fuzzily(new_path, old_path,
+ STD_FUZZ_FACTOR);
+ if (costcmp == COSTS_BETTER1)
{
- if (new_path->disabled_nodes > old_path->disabled_nodes)
- accept_new = false;
- else
+ if (keyscmp == PATHKEYS_BETTER1)
remove_old = true;
}
- else if (new_path->total_cost > old_path->total_cost
- * STD_FUZZ_FACTOR)
+ else if (costcmp == COSTS_BETTER2)
{
- /* New path costs more; keep it only if pathkeys are better. */
- if (keyscmp != PATHKEYS_BETTER1)
+ if (keyscmp == PATHKEYS_BETTER2)
accept_new = false;
}
- else if (old_path->total_cost > new_path->total_cost
- * STD_FUZZ_FACTOR)
+ else if (costcmp == COSTS_EQUAL)
{
- /* Old path costs more; keep it only if pathkeys are better. */
- if (keyscmp != PATHKEYS_BETTER2)
+ if (keyscmp == PATHKEYS_BETTER1)
remove_old = true;
- }
- else if (keyscmp == PATHKEYS_BETTER1)
- {
- /* Costs are about the same, new path has better pathkeys. */
- remove_old = true;
- }
- else if (keyscmp == PATHKEYS_BETTER2)
- {
- /* Costs are about the same, old path has better pathkeys. */
- accept_new = false;
- }
- else if (old_path->total_cost > new_path->total_cost * 1.0000000001)
- {
- /* Pathkeys are the same, and the old path costs more. */
- remove_old = true;
- }
- else
- {
- /*
- * Pathkeys are the same, and new path isn't materially
- * cheaper.
- */
- accept_new = false;
+ else if (keyscmp == PATHKEYS_BETTER2)
+ accept_new = false;
}
}
@@ -878,8 +861,13 @@ add_partial_path(RelOptInfo *parent_rel, Path *new_path)
}
else
{
- /* new belongs after this old path if it has cost >= old's */
- if (new_path->total_cost >= old_path->total_cost)
+ /*
+ * new belongs after this old path if it has more disabled nodes,
+ * or the same number of disabled nodes but a greater total cost
+ */
+ if (new_path->disabled_nodes > old_path->disabled_nodes ||
+ (new_path->disabled_nodes == old_path->disabled_nodes &&
+ new_path->total_cost >= old_path->total_cost))
insert_at = foreach_current_index(p1) + 1;
}
@@ -909,16 +897,16 @@ add_partial_path(RelOptInfo *parent_rel, Path *new_path)
* add_partial_path_precheck
* Check whether a proposed new partial path could possibly get accepted.
*
- * Unlike add_path_precheck, we can ignore startup cost and parameterization,
- * since they don't matter for partial paths (see add_partial_path). But
- * we do want to make sure we don't add a partial path if there's already
- * a complete path that dominates it, since in that case the proposed path
- * is surely a loser.
+ * Unlike add_path_precheck, we can ignore parameterization, since it doesn't
+ * matter for partial paths (see add_partial_path). But we do want to make
+ * sure we don't add a partial path if there's already a complete path that
+ * dominates it, since in that case the proposed path is surely a loser.
*/
bool
add_partial_path_precheck(RelOptInfo *parent_rel, int disabled_nodes,
- Cost total_cost, List *pathkeys)
+ Cost startup_cost, Cost total_cost, List *pathkeys)
{
+ bool consider_startup = parent_rel->consider_startup;
ListCell *p1;
/*
@@ -928,25 +916,80 @@ add_partial_path_precheck(RelOptInfo *parent_rel, int disabled_nodes,
* is clearly superior to some existing partial path -- at least, modulo
* final cost computations. If so, we definitely want to consider it.
*
- * Unlike add_path(), we always compare pathkeys here. This is because we
- * expect partial_pathlist to be very short, and getting a definitive
+ * Unlike add_path(), we never try to exit this loop early. This is because
+ * we expect partial_pathlist to be very short, and getting a definitive
* answer at this stage avoids the need to call add_path_precheck.
*/
foreach(p1, parent_rel->partial_pathlist)
{
Path *old_path = (Path *) lfirst(p1);
+ PathCostComparison costcmp;
PathKeysComparison keyscmp;
- keyscmp = compare_pathkeys(pathkeys, old_path->pathkeys);
- if (keyscmp != PATHKEYS_DIFFERENT)
+ /*
+ * First, compare costs and disabled nodes. This logic should be
+ * identical to compare_path_costs_fuzzily, except that one of the
+ * paths hasn't been created yet, and the fuzz factor is always
+ * STD_FUZZ_FACTOR.
+ */
+ if (unlikely(old_path->disabled_nodes != disabled_nodes))
{
- if (total_cost > old_path->total_cost * STD_FUZZ_FACTOR &&
- keyscmp != PATHKEYS_BETTER1)
- return false;
- if (old_path->total_cost > total_cost * STD_FUZZ_FACTOR &&
- keyscmp != PATHKEYS_BETTER2)
- return true;
+ if (disabled_nodes < old_path->disabled_nodes)
+ costcmp = COSTS_BETTER1;
+ else
+ costcmp = COSTS_BETTER2;
+ }
+ else if (total_cost > old_path->total_cost * STD_FUZZ_FACTOR)
+ {
+ if (consider_startup &&
+ old_path->startup_cost > startup_cost * STD_FUZZ_FACTOR)
+ costcmp = COSTS_DIFFERENT;
+ else
+ costcmp = COSTS_BETTER2;
}
+ else if (old_path->total_cost > total_cost * STD_FUZZ_FACTOR)
+ {
+ if (consider_startup &&
+ startup_cost > old_path->startup_cost * STD_FUZZ_FACTOR)
+ costcmp = COSTS_DIFFERENT;
+ else
+ costcmp = COSTS_BETTER1;
+ }
+ else if (startup_cost > old_path->startup_cost * STD_FUZZ_FACTOR)
+ costcmp = COSTS_BETTER2;
+ else if (old_path->startup_cost > startup_cost * STD_FUZZ_FACTOR)
+ costcmp = COSTS_BETTER1;
+ else
+ costcmp = COSTS_EQUAL;
+
+ /*
+ * If one path wins on startup cost and the other on total cost, we
+ * can't say for sure which is better.
+ */
+ if (costcmp == COSTS_DIFFERENT)
+ continue;
+
+ /*
+ * If the two paths have different pathkeys, we can't say for sure
+ * which is better.
+ */
+ keyscmp = compare_pathkeys(pathkeys, old_path->pathkeys);
+ if (keyscmp == PATHKEYS_DIFFERENT)
+ continue;
+
+ /*
+ * If the existing path is cheaper and the pathkeys are equal or worse,
+ * the new path is not interesting.
+ */
+ if (costcmp == COSTS_BETTER2 && keyscmp != PATHKEYS_BETTER1)
+ return false;
+
+ /*
+ * If the new path is cheaper and the pathkeys are equal or better,
+ * it is definitely interesting.
+ */
+ if (costcmp == COSTS_BETTER1 && keyscmp != PATHKEYS_BETTER2)
+ return true;
}
/*
@@ -954,14 +997,9 @@ add_partial_path_precheck(RelOptInfo *parent_rel, int disabled_nodes,
* clearly good enough that it might replace one. Compare it to
* non-parallel plans. If it loses even before accounting for the cost of
* the Gather node, we should definitely reject it.
- *
- * Note that we pass the total_cost to add_path_precheck twice. This is
- * because it's never advantageous to consider the startup cost of a
- * partial path; the resulting plans, if run in parallel, will be run to
- * completion.
*/
- if (!add_path_precheck(parent_rel, disabled_nodes, total_cost, total_cost,
- pathkeys, NULL))
+ if (!add_path_precheck(parent_rel, disabled_nodes, startup_cost,
+ total_cost, pathkeys, NULL))
return false;
return true;
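
The precheck above now reproduces compare_path_costs_fuzzily() inline. Below
is a standalone model of that classification, runnable outside the backend;
names are illustrative, and STD_FUZZ_FACTOR is assumed to be 1.01 as in
current sources:

    #include <stdio.h>

    #define STD_FUZZ_FACTOR 1.01    /* assumed value, matching current sources */

    typedef enum { COSTS_EQUAL, COSTS_BETTER1, COSTS_BETTER2,
                   COSTS_DIFFERENT } CostCmp;

    static CostCmp
    classify(int disabled1, double startup1, double total1,
             int disabled2, double startup2, double total2,
             int consider_startup)
    {
        /* Disabled nodes dominate everything else. */
        if (disabled1 != disabled2)
            return (disabled1 < disabled2) ? COSTS_BETTER1 : COSTS_BETTER2;
        if (total1 > total2 * STD_FUZZ_FACTOR)
        {
            /* path 1 fuzzily worse on total; startup may still save it */
            if (consider_startup && startup2 > startup1 * STD_FUZZ_FACTOR)
                return COSTS_DIFFERENT;
            return COSTS_BETTER2;
        }
        if (total2 > total1 * STD_FUZZ_FACTOR)
        {
            if (consider_startup && startup1 > startup2 * STD_FUZZ_FACTOR)
                return COSTS_DIFFERENT;
            return COSTS_BETTER1;
        }
        /* Totals fuzzily equal; fall back to startup cost. */
        if (startup1 > startup2 * STD_FUZZ_FACTOR)
            return COSTS_BETTER2;
        if (startup2 > startup1 * STD_FUZZ_FACTOR)
            return COSTS_BETTER1;
        return COSTS_EQUAL;
    }

    int
    main(void)
    {
        /* Fast-start vs. cheap-total: neither dominates, so keep looking. */
        printf("%d\n", classify(0, 1.0, 100.0,
                                0, 50.0, 60.0, 1));  /* COSTS_DIFFERENT */
        return 0;
    }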
@@ -1077,6 +1115,14 @@ create_index_path(PlannerInfo *root,
cost_index(pathnode, root, loop_count, partial_path);
+ /*
+ * cost_index will set disabled_nodes to 1 if this rel is not allowed to
+ * use index scans in general, but it doesn't have the IndexOptInfo to
+ * know whether this specific index has been disabled.
+ */
+ if (index->disabled)
+ pathnode->path.disabled_nodes = 1;
+
return pathnode;
}
@@ -1298,7 +1344,7 @@ create_tidrangescan_path(PlannerInfo *root, RelOptInfo *rel,
AppendPath *
create_append_path(PlannerInfo *root,
RelOptInfo *rel,
- List *subpaths, List *partial_subpaths,
+ AppendPathInput input,
List *pathkeys, Relids required_outer,
int parallel_workers, bool parallel_aware,
double rows)
@@ -1308,6 +1354,7 @@ create_append_path(PlannerInfo *root,
Assert(!parallel_aware || parallel_workers > 0);
+ pathnode->child_append_relid_sets = input.child_append_relid_sets;
pathnode->path.pathtype = T_Append;
pathnode->path.parent = rel;
pathnode->path.pathtarget = rel->reltarget;
@@ -1323,7 +1370,7 @@ create_append_path(PlannerInfo *root,
* on the simpler get_appendrel_parampathinfo. There's no point in doing
* the more expensive thing for a dummy path, either.
*/
- if (rel->reloptkind == RELOPT_BASEREL && root && subpaths != NIL)
+ if (rel->reloptkind == RELOPT_BASEREL && root && input.subpaths != NIL)
pathnode->path.param_info = get_baserel_parampathinfo(root,
rel,
required_outer);
@@ -1354,11 +1401,11 @@ create_append_path(PlannerInfo *root,
*/
Assert(pathkeys == NIL);
- list_sort(subpaths, append_total_cost_compare);
- list_sort(partial_subpaths, append_startup_cost_compare);
+ list_sort(input.subpaths, append_total_cost_compare);
+ list_sort(input.partial_subpaths, append_startup_cost_compare);
}
- pathnode->first_partial_path = list_length(subpaths);
- pathnode->subpaths = list_concat(subpaths, partial_subpaths);
+ pathnode->first_partial_path = list_length(input.subpaths);
+ pathnode->subpaths = list_concat(input.subpaths, input.partial_subpaths);
/*
* Apply query-wide LIMIT if known and path is for sole base relation.
@@ -1470,6 +1517,7 @@ MergeAppendPath *
create_merge_append_path(PlannerInfo *root,
RelOptInfo *rel,
List *subpaths,
+ List *child_append_relid_sets,
List *pathkeys,
Relids required_outer)
{
@@ -1485,6 +1533,7 @@ create_merge_append_path(PlannerInfo *root,
*/
Assert(bms_is_empty(rel->lateral_relids) && bms_is_empty(required_outer));
+ pathnode->child_append_relid_sets = child_append_relid_sets;
pathnode->path.pathtype = T_MergeAppend;
pathnode->path.parent = rel;
pathnode->path.pathtarget = rel->reltarget;
@@ -3932,11 +3981,12 @@ reparameterize_path(PlannerInfo *root, Path *path,
case T_Append:
{
AppendPath *apath = (AppendPath *) path;
- List *childpaths = NIL;
- List *partialpaths = NIL;
+ AppendPathInput new_append = {0};
int i;
ListCell *lc;
+ new_append.child_append_relid_sets = apath->child_append_relid_sets;
+
/* Reparameterize the children */
i = 0;
foreach(lc, apath->subpaths)
@@ -3950,13 +4000,13 @@ reparameterize_path(PlannerInfo *root, Path *path,
return NULL;
/* We have to re-split the regular and partial paths */
if (i < apath->first_partial_path)
- childpaths = lappend(childpaths, spath);
+ new_append.subpaths = lappend(new_append.subpaths, spath);
else
- partialpaths = lappend(partialpaths, spath);
+ new_append.partial_subpaths = lappend(new_append.partial_subpaths, spath);
i++;
}
return (Path *)
- create_append_path(root, rel, childpaths, partialpaths,
+ create_append_path(root, rel, new_append,
apath->path.pathkeys, required_outer,
apath->path.parallel_workers,
apath->path.parallel_aware,
@@ -3971,10 +4021,10 @@ reparameterize_path(PlannerInfo *root, Path *path,
spath = reparameterize_path(root, spath,
required_outer,
loop_count);
- enabled =
- (mpath->path.disabled_nodes <= spath->disabled_nodes);
if (spath == NULL)
return NULL;
+ enabled =
+ (mpath->path.disabled_nodes <= spath->disabled_nodes);
return (Path *) create_material_path(rel, spath, enabled);
}
case T_Memoize:
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index 3cd3544fa2b..2e3886cf9fe 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -855,7 +855,7 @@ PGSharedMemoryCreate(Size size,
* Initialize space allocation status for segment.
*/
hdr->totalsize = size;
- hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));
+ hdr->content_offset = MAXALIGN(sizeof(PGShmemHeader));
*shim = hdr;
/* Save info for possible future use */
diff --git a/src/backend/port/win32_shmem.c b/src/backend/port/win32_shmem.c
index 7cb8b4c9b60..794e4fcb2ad 100644
--- a/src/backend/port/win32_shmem.c
+++ b/src/backend/port/win32_shmem.c
@@ -389,7 +389,7 @@ PGSharedMemoryCreate(Size size,
* Initialize space allocation status for segment.
*/
hdr->totalsize = size;
- hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));
+ hdr->content_offset = MAXALIGN(sizeof(PGShmemHeader));
hdr->dsm_control = 0;
/* Save info for possible future use */
diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c
index 22379de1e31..6fde740465f 100644
--- a/src/backend/postmaster/autovacuum.c
+++ b/src/backend/postmaster/autovacuum.c
@@ -385,7 +385,6 @@ AutoVacLauncherMain(const void *startup_data, size_t startup_data_len)
PostmasterContext = NULL;
}
- MyBackendType = B_AUTOVAC_LAUNCHER;
init_ps_display(NULL);
ereport(DEBUG1,
@@ -1398,7 +1397,6 @@ AutoVacWorkerMain(const void *startup_data, size_t startup_data_len)
PostmasterContext = NULL;
}
- MyBackendType = B_AUTOVAC_WORKER;
init_ps_display(NULL);
Assert(GetProcessingMode() == InitProcessing);
diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c
index 65deabe91a7..261ccd3f59c 100644
--- a/src/backend/postmaster/bgworker.c
+++ b/src/backend/postmaster/bgworker.c
@@ -120,22 +120,28 @@ static const struct
{
{
- "ParallelWorkerMain", ParallelWorkerMain
+ .fn_name = "ParallelWorkerMain",
+ .fn_addr = ParallelWorkerMain
},
{
- "ApplyLauncherMain", ApplyLauncherMain
+ .fn_name = "ApplyLauncherMain",
+ .fn_addr = ApplyLauncherMain
},
{
- "ApplyWorkerMain", ApplyWorkerMain
+ .fn_name = "ApplyWorkerMain",
+ .fn_addr = ApplyWorkerMain
},
{
- "ParallelApplyWorkerMain", ParallelApplyWorkerMain
+ .fn_name = "ParallelApplyWorkerMain",
+ .fn_addr = ParallelApplyWorkerMain
},
{
- "TableSyncWorkerMain", TableSyncWorkerMain
+ .fn_name = "TableSyncWorkerMain",
+ .fn_addr = TableSyncWorkerMain
},
{
- "SequenceSyncWorkerMain", SequenceSyncWorkerMain
+ .fn_name = "SequenceSyncWorkerMain",
+ .fn_addr = SequenceSyncWorkerMain
}
};
@@ -753,7 +759,6 @@ BackgroundWorkerMain(const void *startup_data, size_t startup_data_len)
}
MyBgworkerEntry = worker;
- MyBackendType = B_BG_WORKER;
init_ps_display(worker->bgw_name);
Assert(GetProcessingMode() == InitProcessing);
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index 80e3088fc7e..0956bd39a85 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -94,7 +94,6 @@ BackgroundWriterMain(const void *startup_data, size_t startup_data_len)
Assert(startup_data_len == 0);
- MyBackendType = B_BG_WRITER;
AuxiliaryProcessMainCommon();
/*
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index 6482c21b8f9..e03c19123bc 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -199,7 +199,6 @@ CheckpointerMain(const void *startup_data, size_t startup_data_len)
Assert(startup_data_len == 0);
- MyBackendType = B_CHECKPOINTER;
AuxiliaryProcessMainCommon();
CheckpointerShmem->checkpointer_pid = MyProcPid;
diff --git a/src/backend/postmaster/launch_backend.c b/src/backend/postmaster/launch_backend.c
index cea229ad6a4..05b1feef3cf 100644
--- a/src/backend/postmaster/launch_backend.c
+++ b/src/backend/postmaster/launch_backend.c
@@ -96,7 +96,6 @@ typedef struct
HANDLE UsedShmemSegID;
#endif
void *UsedShmemSegAddr;
- slock_t *ShmemLock;
#ifdef USE_INJECTION_POINTS
struct InjectionPointsCtl *ActiveInjectionPoints;
#endif
@@ -179,7 +178,7 @@ typedef struct
} child_process_kind;
static child_process_kind child_process_kinds[] = {
-#define PG_PROCTYPE(bktype, description, main_func, shmem_attach) \
+#define PG_PROCTYPE(bktype, bkcategory, description, main_func, shmem_attach) \
[bktype] = {description, main_func, shmem_attach},
#include "postmaster/proctypelist.h"
#undef PG_PROCTYPE
@@ -224,6 +223,8 @@ postmaster_child_launch(BackendType child_type, int child_slot,
pid = fork_process();
if (pid == 0) /* child */
{
+ MyBackendType = child_type;
+
/* Capture and transfer timings that may be needed for logging */
if (IsExternalConnectionBackend(child_type))
{
@@ -608,6 +609,7 @@ SubPostmasterMain(int argc, char *argv[])
child_type = (BackendType) atoi(child_kind);
if (child_type <= B_INVALID || child_type > BACKEND_NUM_TYPES - 1)
elog(ERROR, "unknown child kind %s", child_kind);
+ MyBackendType = child_type;
/* Read in the variables file */
read_backend_variables(argv[2], &startup_data, &startup_data_len);
@@ -676,7 +678,7 @@ SubPostmasterMain(int argc, char *argv[])
/* Restore basic shared memory pointers */
if (UsedShmemSegAddr != NULL)
- InitShmemAccess(UsedShmemSegAddr);
+ InitShmemAllocator(UsedShmemSegAddr);
/*
* Run the appropriate Main function
@@ -724,8 +726,6 @@ save_backend_variables(BackendParameters *param,
param->UsedShmemSegID = UsedShmemSegID;
param->UsedShmemSegAddr = UsedShmemSegAddr;
- param->ShmemLock = ShmemLock;
-
#ifdef USE_INJECTION_POINTS
param->ActiveInjectionPoints = ActiveInjectionPoints;
#endif
@@ -986,8 +986,6 @@ restore_backend_variables(BackendParameters *param)
UsedShmemSegID = param->UsedShmemSegID;
UsedShmemSegAddr = param->UsedShmemSegAddr;
- ShmemLock = param->ShmemLock;
-
#ifdef USE_INJECTION_POINTS
ActiveInjectionPoints = param->ActiveInjectionPoints;
#endif
diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c
index 1a20387c4bd..82731e452fc 100644
--- a/src/backend/postmaster/pgarch.c
+++ b/src/backend/postmaster/pgarch.c
@@ -222,7 +222,6 @@ PgArchiverMain(const void *startup_data, size_t startup_data_len)
{
Assert(startup_data_len == 0);
- MyBackendType = B_ARCHIVER;
AuxiliaryProcessMainCommon();
/*
diff --git a/src/backend/postmaster/startup.c b/src/backend/postmaster/startup.c
index a1a4f65f9a9..cdbe53dd262 100644
--- a/src/backend/postmaster/startup.c
+++ b/src/backend/postmaster/startup.c
@@ -217,7 +217,6 @@ StartupProcessMain(const void *startup_data, size_t startup_data_len)
{
Assert(startup_data_len == 0);
- MyBackendType = B_STARTUP;
AuxiliaryProcessMainCommon();
/* Arrange to clean up at startup process exit */
diff --git a/src/backend/postmaster/syslogger.c b/src/backend/postmaster/syslogger.c
index 1c443b3d126..86c5e376b40 100644
--- a/src/backend/postmaster/syslogger.c
+++ b/src/backend/postmaster/syslogger.c
@@ -206,7 +206,6 @@ SysLoggerMain(const void *startup_data, size_t startup_data_len)
now = MyStartTime;
- MyBackendType = B_LOGGER;
init_ps_display(NULL);
/*
diff --git a/src/backend/postmaster/walsummarizer.c b/src/backend/postmaster/walsummarizer.c
index c3d56c866d3..2d8f57099fd 100644
--- a/src/backend/postmaster/walsummarizer.c
+++ b/src/backend/postmaster/walsummarizer.c
@@ -234,7 +234,6 @@ WalSummarizerMain(const void *startup_data, size_t startup_data_len)
Assert(startup_data_len == 0);
- MyBackendType = B_WAL_SUMMARIZER;
AuxiliaryProcessMainCommon();
ereport(DEBUG1,
diff --git a/src/backend/postmaster/walwriter.c b/src/backend/postmaster/walwriter.c
index 38ec8a4c8c7..23e79a32345 100644
--- a/src/backend/postmaster/walwriter.c
+++ b/src/backend/postmaster/walwriter.c
@@ -94,7 +94,6 @@ WalWriterMain(const void *startup_data, size_t startup_data_len)
Assert(startup_data_len == 0);
- MyBackendType = B_WAL_WRITER;
AuxiliaryProcessMainCommon();
/*
diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c
index 85060d19a49..603a2b94d05 100644
--- a/src/backend/replication/logical/logical.c
+++ b/src/backend/replication/logical/logical.c
@@ -1986,16 +1986,22 @@ UpdateDecodingStats(LogicalDecodingContext *ctx)
}
/*
- * Read up to the end of WAL starting from the decoding slot's restart_lsn.
- * Return true if any meaningful/decodable WAL records are encountered,
- * otherwise false.
+ * Read WAL from the decoding slot's restart_lsn up to end_of_wal, checking
+ * whether any meaningful/decodable WAL records are encountered.
+ * scan_cutoff_lsn is the LSN at or after which finding a decodable record
+ * lets us terminate the scan early.
+ *
+ * Returns the LSN of the last decodable WAL record found, otherwise
+ * returns InvalidXLogRecPtr.
*/
-bool
-LogicalReplicationSlotHasPendingWal(XLogRecPtr end_of_wal)
+XLogRecPtr
+LogicalReplicationSlotCheckPendingWal(XLogRecPtr end_of_wal,
+ XLogRecPtr scan_cutoff_lsn)
{
- bool has_pending_wal = false;
+ XLogRecPtr last_pending_wal = InvalidXLogRecPtr;
Assert(MyReplicationSlot);
+ Assert(end_of_wal >= scan_cutoff_lsn);
PG_TRY();
{
@@ -2023,8 +2029,7 @@ LogicalReplicationSlotHasPendingWal(XLogRecPtr end_of_wal)
/* Invalidate non-timetravel entries */
InvalidateSystemCaches();
- /* Loop until the end of WAL or some changes are processed */
- while (!has_pending_wal && ctx->reader->EndRecPtr < end_of_wal)
+ while (ctx->reader->EndRecPtr < end_of_wal)
{
XLogRecord *record;
char *errm = NULL;
@@ -2037,7 +2042,20 @@ LogicalReplicationSlotHasPendingWal(XLogRecPtr end_of_wal)
if (record != NULL)
LogicalDecodingProcessRecord(ctx, ctx->reader);
- has_pending_wal = ctx->processing_required;
+ if (ctx->processing_required)
+ {
+ last_pending_wal = ctx->reader->ReadRecPtr;
+
+ /*
+ * If we find a decodable WAL record at or after the
+ * scan_cutoff_lsn point, we can terminate the scan early.
+ */
+ if (last_pending_wal >= scan_cutoff_lsn)
+ break;
+
+ /* Reset the flag and continue checking */
+ ctx->processing_required = false;
+ }
CHECK_FOR_INTERRUPTS();
}
@@ -2055,7 +2073,7 @@ LogicalReplicationSlotHasPendingWal(XLogRecPtr end_of_wal)
}
PG_END_TRY();
- return has_pending_wal;
+ return last_pending_wal;
}
/*
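
A hedged caller sketch for the revised API above; everything except
LogicalReplicationSlotCheckPendingWal() and the LSN helpers already used in
this patch is an assumption for illustration. Passing InvalidXLogRecPtr as
the cutoff makes the scan stop at the first decodable record, reproducing
the old boolean behaviour:

    /* Illustrative caller only; not part of this patch. */
    XLogRecPtr  end_of_wal = GetFlushRecPtr(NULL);
    XLogRecPtr  last_decodable;

    /* cutoff of InvalidXLogRecPtr: any decodable record ends the scan */
    last_decodable = LogicalReplicationSlotCheckPendingWal(end_of_wal,
                                                           InvalidXLogRecPtr);
    if (XLogRecPtrIsValid(last_decodable))
    {
        /* decodable WAL remains between restart_lsn and end_of_wal */
    }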
diff --git a/src/backend/replication/logical/logicalctl.c b/src/backend/replication/logical/logicalctl.c
index 9f787f3dc51..4e292951201 100644
--- a/src/backend/replication/logical/logicalctl.c
+++ b/src/backend/replication/logical/logicalctl.c
@@ -71,6 +71,7 @@
#include "storage/lmgr.h"
#include "storage/proc.h"
#include "storage/procarray.h"
+#include "storage/procsignal.h"
#include "utils/injection_point.h"
/*
diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c
index d84fa120b9f..2d2a6d5e9e7 100644
--- a/src/backend/replication/logical/reorderbuffer.c
+++ b/src/backend/replication/logical/reorderbuffer.c
@@ -5361,7 +5361,7 @@ DisplayMapping(HTAB *tuplecid_data)
* transaction c) applied in LSN order.
*/
static void
-ApplyLogicalMappingFile(HTAB *tuplecid_data, Oid relid, const char *fname)
+ApplyLogicalMappingFile(HTAB *tuplecid_data, const char *fname)
{
char path[MAXPGPATH];
int fd;
@@ -5544,7 +5544,7 @@ UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, Snapshot snapshot)
elog(DEBUG1, "applying mapping: \"%s\" in %u", f->fname,
snapshot->subxip[0]);
- ApplyLogicalMappingFile(tuplecid_data, relid, f->fname);
+ ApplyLogicalMappingFile(tuplecid_data, f->fname);
pfree(f);
}
}
diff --git a/src/backend/replication/logical/slotsync.c b/src/backend/replication/logical/slotsync.c
index 1c343d03d21..d02d44d26a0 100644
--- a/src/backend/replication/logical/slotsync.c
+++ b/src/backend/replication/logical/slotsync.c
@@ -1541,8 +1541,6 @@ ReplSlotSyncWorkerMain(const void *startup_data, size_t startup_data_len)
Assert(startup_data_len == 0);
- MyBackendType = B_SLOTSYNC_WORKER;
-
init_ps_display(NULL);
Assert(GetProcessingMode() == InitProcessing);
@@ -1759,7 +1757,7 @@ update_synced_slots_inactive_since(void)
Assert(SlotIsLogical(s));
/* The slot must not be acquired by any process */
- Assert(s->active_pid == 0);
+ Assert(s->active_proc == INVALID_PROC_NUMBER);
/* Use the same inactive_since time for all the slots. */
if (now == 0)
diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c
index 4c47261c7f9..28c7019402b 100644
--- a/src/backend/replication/slot.c
+++ b/src/backend/replication/slot.c
@@ -226,6 +226,7 @@ ReplicationSlotsShmemInit(void)
ReplicationSlot *slot = &ReplicationSlotCtl->replication_slots[i];
/* everything else is zeroed by the memset above */
+ slot->active_proc = INVALID_PROC_NUMBER;
SpinLockInit(&slot->mutex);
LWLockInitialize(&slot->io_in_progress_lock,
LWTRANCHE_REPLICATION_SLOT_IO);
@@ -461,7 +462,7 @@ ReplicationSlotCreate(const char *name, bool db_specific,
* be doing that. So it's safe to initialize the slot.
*/
Assert(!slot->in_use);
- Assert(slot->active_pid == 0);
+ Assert(slot->active_proc == INVALID_PROC_NUMBER);
/* first initialize persistent data */
memset(&slot->data, 0, sizeof(ReplicationSlotPersistentData));
@@ -505,8 +506,8 @@ ReplicationSlotCreate(const char *name, bool db_specific,
/* We can now mark the slot active, and that makes it our slot. */
SpinLockAcquire(&slot->mutex);
- Assert(slot->active_pid == 0);
- slot->active_pid = MyProcPid;
+ Assert(slot->active_proc == INVALID_PROC_NUMBER);
+ slot->active_proc = MyProcNumber;
SpinLockRelease(&slot->mutex);
MyReplicationSlot = slot;
@@ -620,6 +621,7 @@ void
ReplicationSlotAcquire(const char *name, bool nowait, bool error_if_invalid)
{
ReplicationSlot *s;
+ ProcNumber active_proc;
int active_pid;
Assert(name != NULL);
@@ -672,17 +674,18 @@ ReplicationSlotAcquire(const char *name, bool nowait, bool error_if_invalid)
* to inactive_since in InvalidatePossiblyObsoleteSlot.
*/
SpinLockAcquire(&s->mutex);
- if (s->active_pid == 0)
- s->active_pid = MyProcPid;
- active_pid = s->active_pid;
+ if (s->active_proc == INVALID_PROC_NUMBER)
+ s->active_proc = MyProcNumber;
+ active_proc = s->active_proc;
ReplicationSlotSetInactiveSince(s, 0, false);
SpinLockRelease(&s->mutex);
}
else
{
- s->active_pid = active_pid = MyProcPid;
+ s->active_proc = active_proc = MyProcNumber;
ReplicationSlotSetInactiveSince(s, 0, true);
}
+ active_pid = GetPGProcByNumber(active_proc)->pid;
LWLockRelease(ReplicationSlotControlLock);
/*
@@ -690,7 +693,7 @@ ReplicationSlotAcquire(const char *name, bool nowait, bool error_if_invalid)
* wait until the owning process signals us that it's been released, or
* error out.
*/
- if (active_pid != MyProcPid)
+ if (active_proc != MyProcNumber)
{
if (!nowait)
{
@@ -762,7 +765,7 @@ ReplicationSlotRelease(void)
bool is_logical;
TimestampTz now = 0;
- Assert(slot != NULL && slot->active_pid != 0);
+ Assert(slot != NULL && slot->active_proc != INVALID_PROC_NUMBER);
is_logical = SlotIsLogical(slot);
@@ -815,7 +818,7 @@ ReplicationSlotRelease(void)
* disconnecting, but wake up others that may be waiting for it.
*/
SpinLockAcquire(&slot->mutex);
- slot->active_pid = 0;
+ slot->active_proc = INVALID_PROC_NUMBER;
ReplicationSlotSetInactiveSince(slot, now, false);
SpinLockRelease(&slot->mutex);
ConditionVariableBroadcast(&slot->active_cv);
@@ -877,7 +880,7 @@ ReplicationSlotCleanup(bool synced_only)
found_valid_logicalslot |=
(SlotIsLogical(s) && s->data.invalidated == RS_INVAL_NONE);
- if ((s->active_pid == MyProcPid &&
+ if ((s->active_proc == MyProcNumber &&
(!synced_only || s->data.synced)))
{
Assert(s->data.persistency == RS_TEMPORARY);
@@ -1088,7 +1091,7 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
bool fail_softly = slot->data.persistency != RS_PERSISTENT;
SpinLockAcquire(&slot->mutex);
- slot->active_pid = 0;
+ slot->active_proc = INVALID_PROC_NUMBER;
SpinLockRelease(&slot->mutex);
/* wake up anyone waiting on this slot */
@@ -1110,7 +1113,7 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
* Also wake up processes waiting for it.
*/
LWLockAcquire(ReplicationSlotControlLock, LW_EXCLUSIVE);
- slot->active_pid = 0;
+ slot->active_proc = INVALID_PROC_NUMBER;
slot->in_use = false;
LWLockRelease(ReplicationSlotControlLock);
ConditionVariableBroadcast(&slot->active_cv);
@@ -1476,7 +1479,7 @@ ReplicationSlotsCountDBSlots(Oid dboid, int *nslots, int *nactive)
/* count slots with spinlock held */
SpinLockAcquire(&s->mutex);
(*nslots)++;
- if (s->active_pid != 0)
+ if (s->active_proc != INVALID_PROC_NUMBER)
(*nactive)++;
SpinLockRelease(&s->mutex);
}
@@ -1520,7 +1523,7 @@ ReplicationSlotsDropDBSlots(Oid dboid)
{
ReplicationSlot *s;
char *slotname;
- int active_pid;
+ ProcNumber active_proc;
s = &ReplicationSlotCtl->replication_slots[i];
@@ -1550,11 +1553,11 @@ ReplicationSlotsDropDBSlots(Oid dboid)
SpinLockAcquire(&s->mutex);
/* can't change while ReplicationSlotControlLock is held */
slotname = NameStr(s->data.name);
- active_pid = s->active_pid;
- if (active_pid == 0)
+ active_proc = s->active_proc;
+ if (active_proc == INVALID_PROC_NUMBER)
{
MyReplicationSlot = s;
- s->active_pid = MyProcPid;
+ s->active_proc = MyProcNumber;
}
SpinLockRelease(&s->mutex);
@@ -1579,11 +1582,11 @@ ReplicationSlotsDropDBSlots(Oid dboid)
* XXX: We can consider shutting down the slot sync worker before
* trying to drop synced temporary slots here.
*/
- if (active_pid)
+ if (active_proc != INVALID_PROC_NUMBER)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_IN_USE),
errmsg("replication slot \"%s\" is active for PID %d",
- slotname, active_pid)));
+ slotname, GetPGProcByNumber(active_proc)->pid)));
/*
* To avoid duplicating ReplicationSlotDropAcquired() and to avoid
@@ -1974,6 +1977,7 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes,
{
XLogRecPtr restart_lsn;
NameData slotname;
+ ProcNumber active_proc;
int active_pid = 0;
ReplicationSlotInvalidationCause invalidation_cause = RS_INVAL_NONE;
TimestampTz now = 0;
@@ -2027,7 +2031,7 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes,
}
slotname = s->data.name;
- active_pid = s->active_pid;
+ active_proc = s->active_proc;
/*
* If the slot can be acquired, do so and mark it invalidated
@@ -2039,10 +2043,10 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes,
* is terminated. So, the inactive slot can only be invalidated
* immediately without being terminated.
*/
- if (active_pid == 0)
+ if (active_proc == INVALID_PROC_NUMBER)
{
MyReplicationSlot = s;
- s->active_pid = MyProcPid;
+ s->active_proc = MyProcNumber;
s->data.invalidated = invalidation_cause;
/*
@@ -2058,6 +2062,11 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes,
/* Let caller know */
invalidated = true;
}
+ else
+ {
+ active_pid = GetPGProcByNumber(active_proc)->pid;
+ Assert(active_pid != 0);
+ }
SpinLockRelease(&s->mutex);
@@ -2073,7 +2082,7 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes,
&slot_idle_usecs);
}
- if (active_pid != 0)
+ if (active_proc != INVALID_PROC_NUMBER)
{
/*
* Prepare the sleep on the slot's condition variable before
@@ -2105,9 +2114,9 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes,
slot_idle_secs);
if (MyBackendType == B_STARTUP)
- (void) SendProcSignal(active_pid,
- PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT,
- INVALID_PROC_NUMBER);
+ (void) SignalRecoveryConflict(GetPGProcByNumber(active_proc),
+ active_pid,
+ RECOVERY_CONFLICT_LOGICALSLOT);
else
(void) kill(active_pid, SIGTERM);
@@ -2875,7 +2884,7 @@ RestoreSlotFromDisk(const char *name)
slot->candidate_restart_valid = InvalidXLogRecPtr;
slot->in_use = true;
- slot->active_pid = 0;
+ slot->active_proc = INVALID_PROC_NUMBER;
/*
* Set the time since the slot has become inactive after loading the
@@ -3158,7 +3167,7 @@ StandbySlotsHaveCaughtup(XLogRecPtr wait_for_lsn, int elevel)
SpinLockAcquire(&slot->mutex);
restart_lsn = slot->data.restart_lsn;
invalidated = slot->data.invalidated != RS_INVAL_NONE;
- inactive = slot->active_pid == 0;
+ inactive = slot->active_proc == INVALID_PROC_NUMBER;
SpinLockRelease(&slot->mutex);
if (invalidated)
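
A small sketch of the pattern the slot.c hunks above rely on: the slot now
records a ProcNumber, and a PID is derived only where something user-visible
needs one. This is grounded in the patch's own GetPGProcByNumber() calls;
the elog() message is illustrative:

    /* Sketch: translate a slot's owner back to a PID for reporting. */
    ProcNumber  active_proc = slot->active_proc;

    if (active_proc != INVALID_PROC_NUMBER)
    {
        pid_t   active_pid = GetPGProcByNumber(active_proc)->pid;

        elog(LOG, "slot \"%s\" is held by PID %d",
             NameStr(slot->data.name), (int) active_pid);
    }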
diff --git a/src/backend/replication/slotfuncs.c b/src/backend/replication/slotfuncs.c
index 1ed2d80c2d2..9f5e4f998fe 100644
--- a/src/backend/replication/slotfuncs.c
+++ b/src/backend/replication/slotfuncs.c
@@ -20,6 +20,7 @@
#include "replication/logical.h"
#include "replication/slot.h"
#include "replication/slotsync.h"
+#include "storage/proc.h"
#include "utils/builtins.h"
#include "utils/guc.h"
#include "utils/pg_lsn.h"
@@ -309,10 +310,10 @@ pg_get_replication_slots(PG_FUNCTION_ARGS)
values[i++] = ObjectIdGetDatum(slot_contents.data.database);
values[i++] = BoolGetDatum(slot_contents.data.persistency == RS_TEMPORARY);
- values[i++] = BoolGetDatum(slot_contents.active_pid != 0);
+ values[i++] = BoolGetDatum(slot_contents.active_proc != INVALID_PROC_NUMBER);
- if (slot_contents.active_pid != 0)
- values[i++] = Int32GetDatum(slot_contents.active_pid);
+ if (slot_contents.active_proc != INVALID_PROC_NUMBER)
+ values[i++] = Int32GetDatum(GetPGProcByNumber(slot_contents.active_proc)->pid);
else
nulls[i++] = true;
@@ -377,13 +378,13 @@ pg_get_replication_slots(PG_FUNCTION_ARGS)
*/
if (XLogRecPtrIsValid(slot_contents.data.restart_lsn))
{
- int pid;
+ ProcNumber procno;
SpinLockAcquire(&slot->mutex);
- pid = slot->active_pid;
+ procno = slot->active_proc;
slot_contents.data.restart_lsn = slot->data.restart_lsn;
SpinLockRelease(&slot->mutex);
- if (pid != 0)
+ if (procno != INVALID_PROC_NUMBER)
{
values[i++] = CStringGetTextDatum("unreserved");
walstate = WALAVAIL_UNRESERVED;
diff --git a/src/backend/replication/syncrep.c b/src/backend/replication/syncrep.c
index e7bee777532..7ea6001e9ad 100644
--- a/src/backend/replication/syncrep.c
+++ b/src/backend/replication/syncrep.c
@@ -355,7 +355,7 @@ SyncRepWaitForLSN(XLogRecPtr lsn, bool commit)
pg_read_barrier();
Assert(dlist_node_is_detached(&MyProc->syncRepLinks));
MyProc->syncRepState = SYNC_REP_NOT_WAITING;
- MyProc->waitLSN = 0;
+ MyProc->waitLSN = InvalidXLogRecPtr;
/* reset ps display to remove the suffix */
if (update_process_title)
@@ -1027,7 +1027,7 @@ SyncRepQueueIsOrderedByLSN(int mode)
Assert(mode >= 0 && mode < NUM_SYNC_REP_WAIT_MODE);
- lastLSN = 0;
+ lastLSN = InvalidXLogRecPtr;
dlist_foreach(iter, &WalSndCtl->SyncRepQueue[mode])
{
diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c
index 6970af3f3ff..10e64a7d1f4 100644
--- a/src/backend/replication/walreceiver.c
+++ b/src/backend/replication/walreceiver.c
@@ -169,7 +169,6 @@ WalReceiverMain(const void *startup_data, size_t startup_data_len)
Assert(startup_data_len == 0);
- MyBackendType = B_WAL_RECEIVER;
AuxiliaryProcessMainCommon();
/*
@@ -1122,8 +1121,8 @@ XLogWalRcvClose(XLogRecPtr recptr, TimeLineID tli)
static void
XLogWalRcvSendReply(bool force, bool requestReply)
{
- static XLogRecPtr writePtr = 0;
- static XLogRecPtr flushPtr = 0;
+ static XLogRecPtr writePtr = InvalidXLogRecPtr;
+ static XLogRecPtr flushPtr = InvalidXLogRecPtr;
XLogRecPtr applyPtr;
TimestampTz now;
diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index a0e6a3d200c..2cde8ebc729 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -1611,6 +1611,32 @@ WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid,
ProcessPendingWrites();
}
+/*
+ * Handle configuration reload.
+ *
+ * Processes any pending configuration file reload and reinitializes
+ * synchronous replication settings. Also releases any waiters that may now
+ * be satisfied due to changes in synchronous replication requirements.
+ */
+static void
+WalSndHandleConfigReload(void)
+{
+ if (!ConfigReloadPending)
+ return;
+
+ ConfigReloadPending = false;
+ ProcessConfigFile(PGC_SIGHUP);
+ SyncRepInitConfig();
+
+ /*
+ * Recheck and release any now-satisfied waiters after config reload
+ * changes synchronous replication requirements (e.g., reducing the number
+ * of sync standbys or changing the standby names).
+ */
+ if (!am_cascading_walsender)
+ SyncRepReleaseWaiters();
+}
+
/*
* Wait until there is no pending write. Also process replies from the other
* side and check timeouts during that.
@@ -1646,12 +1672,7 @@ ProcessPendingWrites(void)
CHECK_FOR_INTERRUPTS();
/* Process any requests or signals received recently */
- if (ConfigReloadPending)
- {
- ConfigReloadPending = false;
- ProcessConfigFile(PGC_SIGHUP);
- SyncRepInitConfig();
- }
+ WalSndHandleConfigReload();
/* Try to flush pending output to the client */
if (pq_flush_if_writable() != 0)
@@ -1854,12 +1875,7 @@ WalSndWaitForWal(XLogRecPtr loc)
CHECK_FOR_INTERRUPTS();
/* Process any requests or signals received recently */
- if (ConfigReloadPending)
- {
- ConfigReloadPending = false;
- ProcessConfigFile(PGC_SIGHUP);
- SyncRepInitConfig();
- }
+ WalSndHandleConfigReload();
/* Check for input from the client */
ProcessRepliesIfAny();
@@ -2899,12 +2915,7 @@ WalSndLoop(WalSndSendDataCallback send_data)
CHECK_FOR_INTERRUPTS();
/* Process any requests or signals received recently */
- if (ConfigReloadPending)
- {
- ConfigReloadPending = false;
- ProcessConfigFile(PGC_SIGHUP);
- SyncRepInitConfig();
- }
+ WalSndHandleConfigReload();
/* Check for input from the client */
ProcessRepliesIfAny();
diff --git a/src/backend/statistics/extended_stats_funcs.c b/src/backend/statistics/extended_stats_funcs.c
index db107684607..479f74652be 100644
--- a/src/backend/statistics/extended_stats_funcs.c
+++ b/src/backend/statistics/extended_stats_funcs.c
@@ -347,9 +347,8 @@ extended_statistics_update(FunctionCallInfo fcinfo)
{
ereport(WARNING,
errcode(ERRCODE_UNDEFINED_OBJECT),
- errmsg("could not find extended statistics object \"%s\".\"%s\"",
- quote_identifier(nspname),
- quote_identifier(stxname)));
+ errmsg("could not find extended statistics object \"%s.%s\"",
+ nspname, stxname));
success = false;
goto cleanup;
}
@@ -364,11 +363,9 @@ extended_statistics_update(FunctionCallInfo fcinfo)
{
ereport(WARNING,
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("could not restore extended statistics object \"%s\".\"%s\": incorrect relation \"%s\".\"%s\" specified",
- quote_identifier(nspname),
- quote_identifier(stxname),
- quote_identifier(relnspname),
- quote_identifier(relname)));
+ errmsg("could not restore extended statistics object \"%s.%s\": incorrect relation \"%s.%s\" specified",
+ nspname, stxname,
+ relnspname, relname));
success = false;
goto cleanup;
@@ -420,9 +417,8 @@ extended_statistics_update(FunctionCallInfo fcinfo)
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("cannot specify parameter \"%s\"",
extarginfo[NDISTINCT_ARG].argname),
- errhint("Extended statistics object \"%s\".\"%s\" does not support statistics of this type.",
- quote_identifier(nspname),
- quote_identifier(stxname)));
+ errhint("Extended statistics object \"%s.%s\" does not support statistics of this type.",
+ nspname, stxname));
has.ndistinct = false;
success = false;
@@ -438,9 +434,8 @@ extended_statistics_update(FunctionCallInfo fcinfo)
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("cannot specify parameter \"%s\"",
extarginfo[DEPENDENCIES_ARG].argname),
- errhint("Extended statistics object \"%s\".\"%s\" does not support statistics of this type.",
- quote_identifier(nspname),
- quote_identifier(stxname)));
+ errhint("Extended statistics object \"%s.%s\" does not support statistics of this type.",
+ nspname, stxname));
has.dependencies = false;
success = false;
}
@@ -463,9 +458,8 @@ extended_statistics_update(FunctionCallInfo fcinfo)
extarginfo[MOST_COMMON_VALS_ARG].argname,
extarginfo[MOST_COMMON_FREQS_ARG].argname,
extarginfo[MOST_COMMON_BASE_FREQS_ARG].argname),
- errhint("Extended statistics object \"%s\".\"%s\" does not support statistics of this type.",
- quote_identifier(nspname),
- quote_identifier(stxname)));
+ errhint("Extended statistics object \"%s.%s\" does not support statistics of this type.",
+ nspname, stxname));
has.mcv = false;
success = false;
@@ -539,7 +533,7 @@ extended_statistics_update(FunctionCallInfo fcinfo)
/*
* After all the positive number attnums in stxkeys come the negative
* numbers (if any) which represent expressions in the order that they
- * appear in stxdexprs. Because the expressions are always
+ * appear in stxdexpr. Because the expressions are always
* monotonically decreasing from -1, there is no point in looking at
* the values in stxkeys, it's enough to know how many of them there
* are.
@@ -888,7 +882,7 @@ pg_clear_extended_stats(PG_FUNCTION_ARGS)
table_close(pg_stext, RowExclusiveLock);
ereport(WARNING,
errcode(ERRCODE_UNDEFINED_OBJECT),
- errmsg("could not find extended statistics object \"%s\".\"%s\"",
+ errmsg("could not find extended statistics object \"%s.%s\"",
nspname, stxname));
PG_RETURN_VOID();
}
@@ -904,7 +898,7 @@ pg_clear_extended_stats(PG_FUNCTION_ARGS)
table_close(pg_stext, RowExclusiveLock);
ereport(WARNING,
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("could not clear extended statistics object \"%s\".\"%s\": incorrect relation \"%s\".\"%s\" specified",
+ errmsg("could not clear extended statistics object \"%s.%s\": incorrect relation \"%s.%s\" specified",
get_namespace_name(nspoid), stxname,
relnspname, relname));
PG_RETURN_VOID();
diff --git a/src/backend/storage/aio/method_worker.c b/src/backend/storage/aio/method_worker.c
index d7c144cd8f7..d9617c20e76 100644
--- a/src/backend/storage/aio/method_worker.c
+++ b/src/backend/storage/aio/method_worker.c
@@ -390,7 +390,6 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len)
volatile int error_errno = 0;
char cmd[128];
- MyBackendType = B_IO_WORKER;
AuxiliaryProcessMainCommon();
pqsignal(SIGHUP, SignalHandlerForConfigReload);
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 6f935648ae9..d1babaff023 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -59,6 +59,7 @@
#include "storage/lmgr.h"
#include "storage/proc.h"
#include "storage/proclist.h"
+#include "storage/procsignal.h"
#include "storage/read_stream.h"
#include "storage/smgr.h"
#include "storage/standby.h"
@@ -5895,6 +5896,13 @@ BufferLockUnlock(Buffer buffer, BufferDesc *buf_hdr)
/*
* Acquire the content lock for the buffer, but only if we don't have to wait.
+ *
+ * It is allowed to try to conditionally acquire a lock on a buffer that this
+ * backend has already locked, but the lock acquisition will always fail, even
+ * if the new lock acquisition does not conflict with an already held lock
+ * (e.g. two share locks). This is because we currently do not have space to
+ * track multiple lock ownerships of the same buffer within one backend. That
+ * is ok for the current uses of BufferLockConditional().
*/
static bool
BufferLockConditional(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
@@ -5903,9 +5911,12 @@ BufferLockConditional(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
bool mustwait;
/*
- * We better not already hold a lock on the buffer.
+ * As described above, if we're trying to lock a buffer this backend
+ * already has locked, return false, independent of the existing and
+ * desired lock level.
*/
- Assert(entry->data.lockmode == BUFFER_LOCK_UNLOCK);
+ if (entry->data.lockmode != BUFFER_LOCK_UNLOCK)
+ return false;
/*
* Lock out cancel/die interrupts until we exit the code section protected
@@ -6560,7 +6571,7 @@ LockBufferForCleanup(Buffer buffer)
* deadlock_timeout for it.
*/
if (logged_recovery_conflict)
- LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
+ LogRecoveryConflict(RECOVERY_CONFLICT_BUFFERPIN,
waitStart, GetCurrentTimestamp(),
NULL, false);
@@ -6611,7 +6622,7 @@ LockBufferForCleanup(Buffer buffer)
if (TimestampDifferenceExceeds(waitStart, now,
DeadlockTimeout))
{
- LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
+ LogRecoveryConflict(RECOVERY_CONFLICT_BUFFERPIN,
waitStart, now, NULL, true);
logged_recovery_conflict = true;
}
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 0f8083651de..5d07b64a1ef 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -164,6 +164,9 @@ bool data_sync_retry = false;
/* How SyncDataDirectory() should do its job. */
int recovery_init_sync_method = DATA_DIR_SYNC_METHOD_FSYNC;
+/* How data files should be bulk-extended with zeros. */
+int file_extend_method = DEFAULT_FILE_EXTEND_METHOD;
+
/* Which kinds of files should be opened with PG_O_DIRECT. */
int io_direct_flags;
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 2a3dfedf7e9..1f7e933d500 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -212,12 +212,10 @@ CreateSharedMemoryAndSemaphores(void)
Assert(strcmp("unknown",
GetConfigOption("huge_pages_status", false, false)) != 0);
- InitShmemAccess(seghdr);
-
/*
* Set up shared memory allocation mechanism
*/
- InitShmemAllocation();
+ InitShmemAllocator(seghdr);
/* Initialize subsystems */
CreateOrAttachShmemStructs();
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 6be565155ab..40312df2cac 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -60,6 +60,7 @@
#include "port/pg_lfind.h"
#include "storage/proc.h"
#include "storage/procarray.h"
+#include "storage/procsignal.h"
#include "utils/acl.h"
#include "utils/builtins.h"
#include "utils/injection_point.h"
@@ -708,7 +709,7 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid)
/* be sure this is cleared in abort */
proc->delayChkptFlags = 0;
- proc->recoveryConflictPending = false;
+ pg_atomic_write_u32(&proc->pendingRecoveryConflicts, 0);
/* must be cleared with xid/xmin: */
/* avoid unnecessarily dirtying shared cachelines */
@@ -750,7 +751,7 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid)
/* be sure this is cleared in abort */
proc->delayChkptFlags = 0;
- proc->recoveryConflictPending = false;
+ pg_atomic_write_u32(&proc->pendingRecoveryConflicts, 0);
/* must be cleared with xid/xmin: */
/* avoid unnecessarily dirtying shared cachelines */
@@ -933,7 +934,7 @@ ProcArrayClearTransaction(PGPROC *proc)
proc->vxid.lxid = InvalidLocalTransactionId;
proc->xmin = InvalidTransactionId;
- proc->recoveryConflictPending = false;
+ pg_atomic_write_u32(&proc->pendingRecoveryConflicts, 0);
Assert(!(proc->statusFlags & PROC_VACUUM_STATE_MASK));
Assert(!proc->delayChkptFlags);
@@ -3445,19 +3446,46 @@ GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid)
}
/*
- * CancelVirtualTransaction - used in recovery conflict processing
+ * SignalRecoveryConflict -- signal that a process is blocking recovery
*
- * Returns pid of the process signaled, or 0 if not found.
+ * The 'pid' is redundant with 'proc', but it acts as a cross-check to
+ * detect the case that the process has exited and the PGPROC entry was
+ * reused for a different process.
+ *
+ * Returns true if the process was signaled, or false if not found.
*/
-pid_t
-CancelVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode)
+bool
+SignalRecoveryConflict(PGPROC *proc, pid_t pid, RecoveryConflictReason reason)
{
- return SignalVirtualTransaction(vxid, sigmode, true);
+ bool found = false;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ /*
+ * Kill the pid if it's still here. If not, that's what we wanted so
+ * ignore any errors.
+ */
+ if (proc->pid == pid)
+ {
+ (void) pg_atomic_fetch_or_u32(&proc->pendingRecoveryConflicts, (1 << reason));
+
+ /* wake up the process */
+ (void) SendProcSignal(pid, PROCSIG_RECOVERY_CONFLICT, GetNumberFromPGProc(proc));
+ found = true;
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ return found;
}
-pid_t
-SignalVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode,
- bool conflictPending)
+/*
+ * SignalRecoveryConflictWithVirtualXID -- signal that a VXID is blocking recovery
+ *
+ * Like SignalRecoveryConflict, but the target is identified by its VXID.
+ */
+bool
+SignalRecoveryConflictWithVirtualXID(VirtualTransactionId vxid, RecoveryConflictReason reason)
{
ProcArrayStruct *arrayP = procArray;
int index;
@@ -3476,15 +3504,16 @@ SignalVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode,
if (procvxid.procNumber == vxid.procNumber &&
procvxid.localTransactionId == vxid.localTransactionId)
{
- proc->recoveryConflictPending = conflictPending;
pid = proc->pid;
if (pid != 0)
{
+ (void) pg_atomic_fetch_or_u32(&proc->pendingRecoveryConflicts, (1 << reason));
+
/*
* Kill the pid if it's still here. If not, that's what we
* wanted so ignore any errors.
*/
- (void) SendProcSignal(pid, sigmode, vxid.procNumber);
+ (void) SendProcSignal(pid, PROCSIG_RECOVERY_CONFLICT, vxid.procNumber);
}
break;
}
@@ -3492,7 +3521,50 @@ SignalVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode,
LWLockRelease(ProcArrayLock);
- return pid;
+ return pid != 0;
+}
+
+/*
+ * SignalRecoveryConflictWithDatabase --- signal all backends using the specified database
+ *
+ * Like SignalRecoveryConflict, but signals all backends using the database.
+ */
+void
+SignalRecoveryConflictWithDatabase(Oid databaseid, RecoveryConflictReason reason)
+{
+ ProcArrayStruct *arrayP = procArray;
+ int index;
+
+ /* tell all backends to die */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+
+ if (databaseid == InvalidOid || proc->databaseId == databaseid)
+ {
+ VirtualTransactionId procvxid;
+ pid_t pid;
+
+ GET_VXID_FROM_PGPROC(procvxid, *proc);
+
+ pid = proc->pid;
+ if (pid != 0)
+ {
+ (void) pg_atomic_fetch_or_u32(&proc->pendingRecoveryConflicts, (1 << reason));
+
+ /*
+ * Kill the pid if it's still here. If not, that's what we
+ * wanted so ignore any errors.
+ */
+ (void) SendProcSignal(pid, PROCSIG_RECOVERY_CONFLICT, procvxid.procNumber);
+ }
+ }
+ }
+
+ LWLockRelease(ProcArrayLock);
}
/*
@@ -3602,7 +3674,7 @@ CountDBConnections(Oid databaseid)
if (proc->pid == 0)
continue; /* do not count prepared xacts */
- if (!proc->isRegularBackend)
+ if (proc->backendType != B_BACKEND)
continue; /* count only regular backend processes */
if (!OidIsValid(databaseid) ||
proc->databaseId == databaseid)
@@ -3614,46 +3686,6 @@ CountDBConnections(Oid databaseid)
return count;
}
-/*
- * CancelDBBackends --- cancel backends that are using specified database
- */
-void
-CancelDBBackends(Oid databaseid, ProcSignalReason sigmode, bool conflictPending)
-{
- ProcArrayStruct *arrayP = procArray;
- int index;
-
- /* tell all backends to die */
- LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
-
- for (index = 0; index < arrayP->numProcs; index++)
- {
- int pgprocno = arrayP->pgprocnos[index];
- PGPROC *proc = &allProcs[pgprocno];
-
- if (databaseid == InvalidOid || proc->databaseId == databaseid)
- {
- VirtualTransactionId procvxid;
- pid_t pid;
-
- GET_VXID_FROM_PGPROC(procvxid, *proc);
-
- proc->recoveryConflictPending = conflictPending;
- pid = proc->pid;
- if (pid != 0)
- {
- /*
- * Kill the pid if it's still here. If not, that's what we
- * wanted so ignore any errors.
- */
- (void) SendProcSignal(pid, sigmode, procvxid.procNumber);
- }
- }
- }
-
- LWLockRelease(ProcArrayLock);
-}
-
/*
* CountUserBackends --- count backends that are used by specified user
* (only regular backends, not any type of background worker)
@@ -3674,7 +3706,7 @@ CountUserBackends(Oid roleid)
if (proc->pid == 0)
continue; /* do not count prepared xacts */
- if (!proc->isRegularBackend)
+ if (proc->backendType != B_BACKEND)
continue; /* count only regular backend processes */
if (proc->roleId == roleid)
count++;
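
The recovery-conflict rework above replaces one ProcSignal reason per
conflict type with a single PROCSIG_RECOVERY_CONFLICT plus a per-process
atomic bitmask, set via pg_atomic_fetch_or_u32(..., 1 << reason). Below is a
standalone model of that set/consume protocol, with C11 atomics standing in
for pg_atomic_uint32; the reason values and the consuming loop are
illustrative, since the real handling lives in the interrupt machinery:

    #include <stdio.h>
    #include <stdatomic.h>

    /* Stand-alone model of pendingRecoveryConflicts: the sender ORs in one
     * bit per conflict reason, the recipient swaps the mask to zero and
     * handles every bit it saw.  Types and values here are illustrative. */
    typedef enum
    {
        RECOVERY_CONFLICT_DATABASE,
        RECOVERY_CONFLICT_LOCK,
        RECOVERY_CONFLICT_SNAPSHOT,
        RECOVERY_CONFLICT_LOGICALSLOT,
        RECOVERY_CONFLICT_BUFFERPIN
    } RecoveryConflictReason;

    static _Atomic unsigned int pendingRecoveryConflicts;

    static void
    signal_conflict(RecoveryConflictReason reason)
    {
        /* sender side: mirrors pg_atomic_fetch_or_u32(..., 1 << reason) */
        atomic_fetch_or(&pendingRecoveryConflicts, 1u << reason);
    }

    static void
    handle_conflicts(void)
    {
        /* recipient side: consume all pending reasons at once */
        unsigned int mask = atomic_exchange(&pendingRecoveryConflicts, 0);

        for (int reason = 0; mask != 0; reason++, mask >>= 1)
            if (mask & 1)
                printf("handling recovery conflict reason %d\n", reason);
    }

    int
    main(void)
    {
        signal_conflict(RECOVERY_CONFLICT_SNAPSHOT);
        signal_conflict(RECOVERY_CONFLICT_BUFFERPIN);
        handle_conflicts();
        return 0;
    }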
diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c
index 8e56922dcea..5d33559926a 100644
--- a/src/backend/storage/ipc/procsignal.c
+++ b/src/backend/storage/ipc/procsignal.c
@@ -697,26 +697,8 @@ procsignal_sigusr1_handler(SIGNAL_ARGS)
if (CheckProcSignal(PROCSIG_PARALLEL_APPLY_MESSAGE))
HandleParallelApplyMessageInterrupt();
- if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_DATABASE))
- HandleRecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_DATABASE);
-
- if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_TABLESPACE))
- HandleRecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_TABLESPACE);
-
- if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_LOCK))
- HandleRecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_LOCK);
-
- if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_SNAPSHOT))
- HandleRecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_SNAPSHOT);
-
- if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT))
- HandleRecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT);
-
- if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK))
- HandleRecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
-
- if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN))
- HandleRecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
+ if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT))
+ HandleRecoveryConflictInterrupt();
SetLatch(MyLatch);
}
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index 1b536363152..9f362ce8641 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -76,20 +76,33 @@
#include "storage/spin.h"
#include "utils/builtins.h"
+/*
+ * This is the first data structure stored in the shared memory segment, at
+ * the offset that PGShmemHeader->content_offset points to. Allocations by
+ * ShmemAlloc() are carved out of the space after this.
+ *
+ * For the base pointer and the total size of the shmem segment, we rely on
+ * the PGShmemHeader.
+ */
+typedef struct ShmemAllocatorData
+{
+ Size free_offset; /* offset to first free space from ShmemBase */
+ HTAB *index; /* copy of ShmemIndex */
+
+ /* protects shared memory and LWLock allocation */
+ slock_t shmem_lock;
+} ShmemAllocatorData;
+
static void *ShmemAllocRaw(Size size, Size *allocated_size);
-static void *ShmemAllocUnlocked(Size size);
/* shared memory global variables */
static PGShmemHeader *ShmemSegHdr; /* shared mem segment header */
-
static void *ShmemBase; /* start address of shared memory */
-
static void *ShmemEnd; /* end+1 address of shared memory */
-slock_t *ShmemLock; /* spinlock for shared memory and LWLock
- * allocation */
-
+static ShmemAllocatorData *ShmemAllocator;
+slock_t *ShmemLock; /* points to ShmemAllocator->shmem_lock */
static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */
/* To get reliable results for NUMA inquiry we need to "touch pages" once */
@@ -98,49 +111,64 @@ static bool firstNumaTouch = true;
Datum pg_numa_available(PG_FUNCTION_ARGS);
/*
- * InitShmemAccess() --- set up basic pointers to shared memory.
+ * InitShmemAllocator() --- set up basic pointers to shared memory.
+ *
+ * Called at postmaster or stand-alone backend startup, to initialize the
+ * allocator's data structure in the shared memory segment. In EXEC_BACKEND,
+ * this is also called at backend startup, to set up pointers to the shared
+ * memory areas.
*/
void
-InitShmemAccess(PGShmemHeader *seghdr)
+InitShmemAllocator(PGShmemHeader *seghdr)
{
+ Assert(seghdr != NULL);
+
+ /*
+ * We assume the pointer and offset are MAXALIGN'd. Not a hard requirement,
+ * but it's true today and keeps the math below simpler.
+ */
+ Assert(seghdr == (void *) MAXALIGN(seghdr));
+ Assert(seghdr->content_offset == MAXALIGN(seghdr->content_offset));
+
ShmemSegHdr = seghdr;
ShmemBase = seghdr;
ShmemEnd = (char *) ShmemBase + seghdr->totalsize;
-}
-/*
- * InitShmemAllocation() --- set up shared-memory space allocation.
- *
- * This should be called only in the postmaster or a standalone backend.
- */
-void
-InitShmemAllocation(void)
-{
- PGShmemHeader *shmhdr = ShmemSegHdr;
- char *aligned;
+#ifndef EXEC_BACKEND
+ Assert(!IsUnderPostmaster);
+#endif
+ if (IsUnderPostmaster)
+ {
+ PGShmemHeader *shmhdr = ShmemSegHdr;
- Assert(shmhdr != NULL);
+ ShmemAllocator = (ShmemAllocatorData *) ((char *) shmhdr + shmhdr->content_offset);
+ ShmemLock = &ShmemAllocator->shmem_lock;
+ }
+ else
+ {
+ Size offset;
- /*
- * Initialize the spinlock used by ShmemAlloc. We must use
- * ShmemAllocUnlocked, since obviously ShmemAlloc can't be called yet.
- */
- ShmemLock = (slock_t *) ShmemAllocUnlocked(sizeof(slock_t));
+ /*
+ * Allocations after this point should go through ShmemAlloc, which
+ * expects to allocate everything on cache line boundaries. Make sure
+ * the first allocation begins on a cache line boundary.
+ */
+ offset = CACHELINEALIGN(seghdr->content_offset + sizeof(ShmemAllocatorData));
+ if (offset > seghdr->totalsize)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory (%zu bytes requested)",
+ offset)));
- SpinLockInit(ShmemLock);
+ ShmemAllocator = (ShmemAllocatorData *) ((char *) seghdr + seghdr->content_offset);
- /*
- * Allocations after this point should go through ShmemAlloc, which
- * expects to allocate everything on cache line boundaries. Make sure the
- * first allocation begins on a cache line boundary.
- */
- aligned = (char *)
- (CACHELINEALIGN((((char *) shmhdr) + shmhdr->freeoffset)));
- shmhdr->freeoffset = aligned - (char *) shmhdr;
-
- /* ShmemIndex can't be set up yet (need LWLocks first) */
- shmhdr->index = NULL;
- ShmemIndex = (HTAB *) NULL;
+ SpinLockInit(&ShmemAllocator->shmem_lock);
+ ShmemLock = &ShmemAllocator->shmem_lock;
+ ShmemAllocator->free_offset = offset;
+ /* ShmemIndex can't be set up yet (need LWLocks first) */
+ ShmemAllocator->index = NULL;
+ ShmemIndex = (HTAB *) NULL;
+ }
}
/*
@@ -209,13 +237,13 @@ ShmemAllocRaw(Size size, Size *allocated_size)
SpinLockAcquire(ShmemLock);
- newStart = ShmemSegHdr->freeoffset;
+ newStart = ShmemAllocator->free_offset;
newFree = newStart + size;
if (newFree <= ShmemSegHdr->totalsize)
{
newSpace = (char *) ShmemBase + newStart;
- ShmemSegHdr->freeoffset = newFree;
+ ShmemAllocator->free_offset = newFree;
}
else
newSpace = NULL;
@@ -228,45 +256,6 @@ ShmemAllocRaw(Size size, Size *allocated_size)
return newSpace;
}
-/*
- * ShmemAllocUnlocked -- allocate max-aligned chunk from shared memory
- *
- * Allocate space without locking ShmemLock. This should be used for,
- * and only for, allocations that must happen before ShmemLock is ready.
- *
- * We consider maxalign, rather than cachealign, sufficient here.
- */
-static void *
-ShmemAllocUnlocked(Size size)
-{
- Size newStart;
- Size newFree;
- void *newSpace;
-
- /*
- * Ensure allocated space is adequately aligned.
- */
- size = MAXALIGN(size);
-
- Assert(ShmemSegHdr != NULL);
-
- newStart = ShmemSegHdr->freeoffset;
-
- newFree = newStart + size;
- if (newFree > ShmemSegHdr->totalsize)
- ereport(ERROR,
- (errcode(ERRCODE_OUT_OF_MEMORY),
- errmsg("out of shared memory (%zu bytes requested)",
- size)));
- ShmemSegHdr->freeoffset = newFree;
-
- newSpace = (char *) ShmemBase + newStart;
-
- Assert(newSpace == (void *) MAXALIGN(newSpace));
-
- return newSpace;
-}
-
/*
* ShmemAddrIsValid -- test if an address refers to shared memory
*
@@ -395,16 +384,14 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr)
if (!ShmemIndex)
{
- PGShmemHeader *shmemseghdr = ShmemSegHdr;
-
/* Must be trying to create/attach to ShmemIndex itself */
Assert(strcmp(name, "ShmemIndex") == 0);
if (IsUnderPostmaster)
{
/* Must be initializing a (non-standalone) backend */
- Assert(shmemseghdr->index != NULL);
- structPtr = shmemseghdr->index;
+ Assert(ShmemAllocator->index != NULL);
+ structPtr = ShmemAllocator->index;
*foundPtr = true;
}
else
@@ -417,9 +404,9 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr)
* index has been initialized. This should be OK because no other
* process can be accessing shared memory yet.
*/
- Assert(shmemseghdr->index == NULL);
+ Assert(ShmemAllocator->index == NULL);
structPtr = ShmemAlloc(size);
- shmemseghdr->index = structPtr;
+ ShmemAllocator->index = structPtr;
*foundPtr = false;
}
LWLockRelease(ShmemIndexLock);
@@ -553,15 +540,15 @@ pg_get_shmem_allocations(PG_FUNCTION_ARGS)
/* output shared memory allocated but not counted via the shmem index */
values[0] = CStringGetTextDatum("");
nulls[1] = true;
- values[2] = Int64GetDatum(ShmemSegHdr->freeoffset - named_allocated);
+ values[2] = Int64GetDatum(ShmemAllocator->free_offset - named_allocated);
values[3] = values[2];
tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
/* output as-of-yet unused shared memory */
nulls[0] = true;
- values[1] = Int64GetDatum(ShmemSegHdr->freeoffset);
+ values[1] = Int64GetDatum(ShmemAllocator->free_offset);
nulls[1] = false;
- values[2] = Int64GetDatum(ShmemSegHdr->totalsize - ShmemSegHdr->freeoffset);
+ values[2] = Int64GetDatum(ShmemSegHdr->totalsize - ShmemAllocator->free_offset);
values[3] = values[2];
tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
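After this change the allocator's bookkeeping lives inside the segment itself rather than in PGShmemHeader fields, which is what lets EXEC_BACKEND children re-derive everything from content_offset alone. The resulting layout, as a sketch (offsets per the Asserts and the CACHELINEALIGN call above):

/*
 * ShmemBase (== PGShmemHeader *)
 *   +0                PGShmemHeader
 *   +content_offset   ShmemAllocatorData
 *                       .free_offset  (bump pointer, measured from ShmemBase)
 *                       .index        (ShmemIndex HTAB, set up later)
 *                       .shmem_lock   (spinlock; ShmemLock points here)
 *   +CACHELINEALIGN(content_offset + sizeof(ShmemAllocatorData))
 *                     first ShmemAlloc() chunk
 *   +totalsize        ShmemEnd
 */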
diff --git a/src/backend/storage/ipc/signalfuncs.c b/src/backend/storage/ipc/signalfuncs.c
index 6f7759cd720..d48b4fe3799 100644
--- a/src/backend/storage/ipc/signalfuncs.c
+++ b/src/backend/storage/ipc/signalfuncs.c
@@ -87,10 +87,7 @@ pg_signal_backend(int pid, int sig)
*/
if (!OidIsValid(proc->roleId) || superuser_arg(proc->roleId))
{
- ProcNumber procNumber = GetNumberFromPGProc(proc);
- BackendType backendType = pgstat_get_backend_type_by_proc_number(procNumber);
-
- if (backendType == B_AUTOVAC_WORKER)
+ if (proc->backendType == B_AUTOVAC_WORKER)
{
if (!has_privs_of_role(GetUserId(), ROLE_PG_SIGNAL_AUTOVACUUM_WORKER))
return SIGNAL_BACKEND_NOAUTOVAC;
diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c
index afffab77106..d83afbfb9d6 100644
--- a/src/backend/storage/ipc/standby.c
+++ b/src/backend/storage/ipc/standby.c
@@ -71,13 +71,13 @@ static volatile sig_atomic_t got_standby_delay_timeout = false;
static volatile sig_atomic_t got_standby_lock_timeout = false;
static void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
- ProcSignalReason reason,
+ RecoveryConflictReason reason,
uint32 wait_event_info,
bool report_waiting);
-static void SendRecoveryConflictWithBufferPin(ProcSignalReason reason);
+static void SendRecoveryConflictWithBufferPin(RecoveryConflictReason reason);
static XLogRecPtr LogCurrentRunningXacts(RunningTransactions CurrRunningXacts);
static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks);
-static const char *get_recovery_conflict_desc(ProcSignalReason reason);
+static const char *get_recovery_conflict_desc(RecoveryConflictReason reason);
/*
* InitRecoveryTransactionEnvironment
@@ -271,7 +271,7 @@ WaitExceedsMaxStandbyDelay(uint32 wait_event_info)
* to be resolved or not.
*/
void
-LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start,
+LogRecoveryConflict(RecoveryConflictReason reason, TimestampTz wait_start,
TimestampTz now, VirtualTransactionId *wait_list,
bool still_waiting)
{
@@ -358,7 +358,8 @@ LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start,
*/
static void
ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
- ProcSignalReason reason, uint32 wait_event_info,
+ RecoveryConflictReason reason,
+ uint32 wait_event_info,
bool report_waiting)
{
TimestampTz waitStart = 0;
@@ -384,19 +385,19 @@ ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
/* Is it time to kill it? */
if (WaitExceedsMaxStandbyDelay(wait_event_info))
{
- pid_t pid;
+ bool signaled;
/*
* Now find out who to throw out of the balloon.
*/
Assert(VirtualTransactionIdIsValid(*waitlist));
- pid = CancelVirtualTransaction(*waitlist, reason);
+ signaled = SignalRecoveryConflictWithVirtualXID(*waitlist, reason);
/*
* Wait a little bit for it to die so that we avoid flooding
* an unresponsive backend when system is heavily loaded.
*/
- if (pid != 0)
+ if (signaled)
pg_usleep(5000L);
}
@@ -489,7 +490,7 @@ ResolveRecoveryConflictWithSnapshot(TransactionId snapshotConflictHorizon,
backends = GetConflictingVirtualXIDs(snapshotConflictHorizon,
locator.dbOid);
ResolveRecoveryConflictWithVirtualXIDs(backends,
- PROCSIG_RECOVERY_CONFLICT_SNAPSHOT,
+ RECOVERY_CONFLICT_SNAPSHOT,
WAIT_EVENT_RECOVERY_CONFLICT_SNAPSHOT,
true);
@@ -560,7 +561,7 @@ ResolveRecoveryConflictWithTablespace(Oid tsid)
temp_file_users = GetConflictingVirtualXIDs(InvalidTransactionId,
InvalidOid);
ResolveRecoveryConflictWithVirtualXIDs(temp_file_users,
- PROCSIG_RECOVERY_CONFLICT_TABLESPACE,
+ RECOVERY_CONFLICT_TABLESPACE,
WAIT_EVENT_RECOVERY_CONFLICT_TABLESPACE,
true);
}
@@ -581,7 +582,7 @@ ResolveRecoveryConflictWithDatabase(Oid dbid)
*/
while (CountDBBackends(dbid) > 0)
{
- CancelDBBackends(dbid, PROCSIG_RECOVERY_CONFLICT_DATABASE, true);
+ SignalRecoveryConflictWithDatabase(dbid, RECOVERY_CONFLICT_DATABASE);
/*
* Wait awhile for them to die so that we avoid flooding an
@@ -665,7 +666,7 @@ ResolveRecoveryConflictWithLock(LOCKTAG locktag, bool logging_conflict)
* because the caller, WaitOnLock(), has already reported that.
*/
ResolveRecoveryConflictWithVirtualXIDs(backends,
- PROCSIG_RECOVERY_CONFLICT_LOCK,
+ RECOVERY_CONFLICT_LOCK,
PG_WAIT_LOCK | locktag.locktag_type,
false);
}
@@ -723,9 +724,8 @@ ResolveRecoveryConflictWithLock(LOCKTAG locktag, bool logging_conflict)
*/
while (VirtualTransactionIdIsValid(*backends))
{
- SignalVirtualTransaction(*backends,
- PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK,
- false);
+ (void) SignalRecoveryConflictWithVirtualXID(*backends,
+ RECOVERY_CONFLICT_STARTUP_DEADLOCK);
backends++;
}
@@ -803,7 +803,7 @@ ResolveRecoveryConflictWithBufferPin(void)
/*
* We're already behind, so clear a path as quickly as possible.
*/
- SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
+ SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_BUFFERPIN);
}
else
{
@@ -843,7 +843,7 @@ ResolveRecoveryConflictWithBufferPin(void)
ProcWaitForSignal(WAIT_EVENT_BUFFER_CLEANUP);
if (got_standby_delay_timeout)
- SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
+ SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_BUFFERPIN);
else if (got_standby_deadlock_timeout)
{
/*
@@ -859,7 +859,7 @@ ResolveRecoveryConflictWithBufferPin(void)
* not be so harmful because the period that the buffer is kept pinned
* is basically no so long. But we should fix this?
*/
- SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
+ SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK);
}
/*
@@ -874,18 +874,18 @@ ResolveRecoveryConflictWithBufferPin(void)
}
static void
-SendRecoveryConflictWithBufferPin(ProcSignalReason reason)
+SendRecoveryConflictWithBufferPin(RecoveryConflictReason reason)
{
- Assert(reason == PROCSIG_RECOVERY_CONFLICT_BUFFERPIN ||
- reason == PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
+ Assert(reason == RECOVERY_CONFLICT_BUFFERPIN ||
+ reason == RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK);
/*
* We send signal to all backends to ask them if they are holding the
- * buffer pin which is delaying the Startup process. We must not set the
- * conflict flag yet, since most backends will be innocent. Let the
- * SIGUSR1 handling in each backend decide their own fate.
+ * buffer pin which is delaying the Startup process. Most of them will be
+ * innocent, but we let the SIGUSR1 handling in each backend decide their
+ * own fate.
*/
- CancelDBBackends(InvalidOid, reason, false);
+ SignalRecoveryConflictWithDatabase(InvalidOid, reason);
}
/*
@@ -1490,35 +1490,36 @@ LogStandbyInvalidations(int nmsgs, SharedInvalidationMessage *msgs,
/* Return the description of recovery conflict */
static const char *
-get_recovery_conflict_desc(ProcSignalReason reason)
+get_recovery_conflict_desc(RecoveryConflictReason reason)
{
const char *reasonDesc = _("unknown reason");
switch (reason)
{
- case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN:
+ case RECOVERY_CONFLICT_BUFFERPIN:
reasonDesc = _("recovery conflict on buffer pin");
break;
- case PROCSIG_RECOVERY_CONFLICT_LOCK:
+ case RECOVERY_CONFLICT_LOCK:
reasonDesc = _("recovery conflict on lock");
break;
- case PROCSIG_RECOVERY_CONFLICT_TABLESPACE:
+ case RECOVERY_CONFLICT_TABLESPACE:
reasonDesc = _("recovery conflict on tablespace");
break;
- case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT:
+ case RECOVERY_CONFLICT_SNAPSHOT:
reasonDesc = _("recovery conflict on snapshot");
break;
- case PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT:
+ case RECOVERY_CONFLICT_LOGICALSLOT:
reasonDesc = _("recovery conflict on replication slot");
break;
- case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK:
+ case RECOVERY_CONFLICT_STARTUP_DEADLOCK:
+ reasonDesc = _("recovery conflict on deadlock");
+ break;
+ case RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK:
reasonDesc = _("recovery conflict on buffer deadlock");
break;
- case PROCSIG_RECOVERY_CONFLICT_DATABASE:
+ case RECOVERY_CONFLICT_DATABASE:
reasonDesc = _("recovery conflict on database");
break;
- default:
- break;
}
return reasonDesc;
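The RecoveryConflictReason enum is declared in a header outside this excerpt (presumably storage/standby.h, which postgres.c now includes). A plausible shape, inferred from the switch arms above — the member order here is a guess, and only the set of names is attested; the values must stay small enough to serve as bit positions in the uint32 pendingRecoveryConflicts mask:

typedef enum RecoveryConflictReason
{
	RECOVERY_CONFLICT_DATABASE,
	RECOVERY_CONFLICT_TABLESPACE,
	RECOVERY_CONFLICT_LOCK,
	RECOVERY_CONFLICT_SNAPSHOT,
	RECOVERY_CONFLICT_LOGICALSLOT,
	RECOVERY_CONFLICT_BUFFERPIN,
	RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK,
	RECOVERY_CONFLICT_STARTUP_DEADLOCK,
} RecoveryConflictReason;

#define NUM_RECOVERY_CONFLICT_REASONS (RECOVERY_CONFLICT_STARTUP_DEADLOCK + 1)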
diff --git a/src/backend/storage/lmgr/deadlock.c b/src/backend/storage/lmgr/deadlock.c
index 8334a887618..0a8dd5eb7c2 100644
--- a/src/backend/storage/lmgr/deadlock.c
+++ b/src/backend/storage/lmgr/deadlock.c
@@ -135,10 +135,9 @@ static PGPROC *blocking_autovacuum_proc = NULL;
* This does per-backend initialization of the deadlock checker; primarily,
* allocation of working memory for DeadLockCheck. We do this per-backend
* since there's no percentage in making the kernel do copy-on-write
- * inheritance of workspace from the postmaster. We want to allocate the
- * space at startup because (a) the deadlock checker might be invoked when
- * there's no free memory left, and (b) the checker is normally run inside a
- * signal handler, which is a very dangerous place to invoke palloc from.
+ * inheritance of workspace from the postmaster. We allocate the space at
+ * startup because the deadlock checker is run with all the partitions of the
+ * lock table locked, and we want to keep that section as short as possible.
*/
void
InitDeadLockChecking(void)
@@ -213,8 +212,7 @@ InitDeadLockChecking(void)
*
* On failure, deadlock details are recorded in deadlockDetails[] for
* subsequent printing by DeadLockReport(). That activity is separate
- * because (a) we don't want to do it while holding all those LWLocks,
- * and (b) we are typically invoked inside a signal handler.
+ * because we don't want to do it while holding all those LWLocks.
*/
DeadLockState
DeadLockCheck(PGPROC *proc)
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index 063826ae576..31ccdb1ef89 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -80,15 +80,13 @@ PROC_HDR *ProcGlobal = NULL;
NON_EXEC_STATIC PGPROC *AuxiliaryProcs = NULL;
PGPROC *PreparedXactProcs = NULL;
-static DeadLockState deadlock_state = DS_NOT_YET_CHECKED;
-
/* Is a deadlock check pending? */
static volatile sig_atomic_t got_deadlock_timeout;
static void RemoveProcFromArray(int code, Datum arg);
static void ProcKill(int code, Datum arg);
static void AuxiliaryProcKill(int code, Datum arg);
-static void CheckDeadLock(void);
+static DeadLockState CheckDeadLock(void);
/*
@@ -486,7 +484,7 @@ InitProcess(void)
MyProc->databaseId = InvalidOid;
MyProc->roleId = InvalidOid;
MyProc->tempNamespaceId = InvalidOid;
- MyProc->isRegularBackend = AmRegularBackendProcess();
+ MyProc->backendType = MyBackendType;
MyProc->delayChkptFlags = 0;
MyProc->statusFlags = 0;
/* NB -- autovac launcher intentionally does not set IS_AUTOVACUUM */
@@ -506,10 +504,10 @@ InitProcess(void)
Assert(dlist_is_empty(&(MyProc->myProcLocks[i])));
}
#endif
- MyProc->recoveryConflictPending = false;
+ pg_atomic_write_u32(&MyProc->pendingRecoveryConflicts, 0);
/* Initialize fields for sync rep */
- MyProc->waitLSN = 0;
+ MyProc->waitLSN = InvalidXLogRecPtr;
MyProc->syncRepState = SYNC_REP_NOT_WAITING;
dlist_node_init(&MyProc->syncRepLinks);
@@ -685,7 +683,7 @@ InitAuxiliaryProcess(void)
MyProc->databaseId = InvalidOid;
MyProc->roleId = InvalidOid;
MyProc->tempNamespaceId = InvalidOid;
- MyProc->isRegularBackend = false;
+ MyProc->backendType = MyBackendType;
MyProc->delayChkptFlags = 0;
MyProc->statusFlags = 0;
MyProc->lwWaiting = LW_WS_NOT_WAITING;
@@ -1322,6 +1320,7 @@ ProcSleep(LOCALLOCK *locallock)
bool allow_autovacuum_cancel = true;
bool logged_recovery_conflict = false;
ProcWaitStatus myWaitStatus;
+ DeadLockState deadlock_state;
/* The caller must've armed the on-error cleanup mechanism */
Assert(GetAwaitedLock() == locallock);
@@ -1447,7 +1446,7 @@ ProcSleep(LOCALLOCK *locallock)
* because the startup process here has already waited
* longer than deadlock_timeout.
*/
- LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_LOCK,
+ LogRecoveryConflict(RECOVERY_CONFLICT_LOCK,
standbyWaitStart, now,
cnt > 0 ? vxids : NULL, true);
logged_recovery_conflict = true;
@@ -1462,7 +1461,7 @@ ProcSleep(LOCALLOCK *locallock)
/* check for deadlocks first, as that's probably log-worthy */
if (got_deadlock_timeout)
{
- CheckDeadLock();
+ deadlock_state = CheckDeadLock();
got_deadlock_timeout = false;
}
CHECK_FOR_INTERRUPTS();
@@ -1688,7 +1687,7 @@ ProcSleep(LOCALLOCK *locallock)
* startup process waited longer than deadlock_timeout for it.
*/
if (InHotStandby && logged_recovery_conflict)
- LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_LOCK,
+ LogRecoveryConflict(RECOVERY_CONFLICT_LOCK,
standbyWaitStart, GetCurrentTimestamp(),
NULL, false);
@@ -1785,14 +1784,14 @@ ProcLockWakeup(LockMethod lockMethodTable, LOCK *lock)
*
* We only get to this routine, if DEADLOCK_TIMEOUT fired while waiting for a
* lock to be released by some other process. Check if there's a deadlock; if
- * not, just return. (But signal ProcSleep to log a message, if
- * log_lock_waits is true.) If we have a real deadlock, remove ourselves from
- * the lock's wait queue and signal an error to ProcSleep.
+ * not, just return. If we have a real deadlock, remove ourselves from the
+ * lock's wait queue.
*/
-static void
+static DeadLockState
CheckDeadLock(void)
{
int i;
+ DeadLockState result;
/*
* Acquire exclusive lock on the entire shared lock data structures. Must
@@ -1819,17 +1818,20 @@ CheckDeadLock(void)
*/
if (MyProc->links.prev == NULL ||
MyProc->links.next == NULL)
+ {
+ result = DS_NO_DEADLOCK;
goto check_done;
+ }
#ifdef LOCK_DEBUG
if (Debug_deadlocks)
DumpAllLocks();
#endif
- /* Run the deadlock check, and set deadlock_state for use by ProcSleep */
- deadlock_state = DeadLockCheck(MyProc);
+ /* Run the deadlock check */
+ result = DeadLockCheck(MyProc);
- if (deadlock_state == DS_HARD_DEADLOCK)
+ if (result == DS_HARD_DEADLOCK)
{
/*
* Oops. We have a deadlock.
@@ -1841,7 +1843,7 @@ CheckDeadLock(void)
*
* RemoveFromWaitQueue sets MyProc->waitStatus to
* PROC_WAIT_STATUS_ERROR, so ProcSleep will report an error after we
- * return from the signal handler.
+ * return.
*/
Assert(MyProc->waitLock != NULL);
RemoveFromWaitQueue(MyProc, LockTagHashCode(&(MyProc->waitLock->tag)));
@@ -1868,6 +1870,8 @@ CheckDeadLock(void)
check_done:
for (i = NUM_LOCK_PARTITIONS; --i >= 0;)
LWLockRelease(LockHashPartitionLockByIndex(i));
+
+ return result;
}
/*
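The matching PGPROC changes are in storage/proc.h, not shown here. From the assignments above, the struct presumably changes along these lines (a sketch, not the verbatim header):

/* in struct PGPROC, sketch: */
BackendType backendType;	/* replaces "bool isRegularBackend" */

pg_atomic_uint32 pendingRecoveryConflicts;	/* bitmask of
											 * (1 << RecoveryConflictReason);
											 * replaces "bool
											 * recoveryConflictPending" */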
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index a2625871185..443434e4ea8 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -602,13 +602,24 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum,
* that decision should be made though? For now just use a cutoff of
* 8, anything between 4 and 8 worked OK in some local testing.
*/
- if (numblocks > 8)
+ if (numblocks > 8 &&
+ file_extend_method != FILE_EXTEND_METHOD_WRITE_ZEROS)
{
- int ret;
+ int ret = 0;
- ret = FileFallocate(v->mdfd_vfd,
- seekpos, (pgoff_t) BLCKSZ * numblocks,
- WAIT_EVENT_DATA_FILE_EXTEND);
+#ifdef HAVE_POSIX_FALLOCATE
+ if (file_extend_method == FILE_EXTEND_METHOD_POSIX_FALLOCATE)
+ {
+ ret = FileFallocate(v->mdfd_vfd,
+ seekpos, (pgoff_t) BLCKSZ * numblocks,
+ WAIT_EVENT_DATA_FILE_EXTEND);
+ }
+ else
+#endif
+ {
+ elog(ERROR, "unsupported file_extend_method: %d",
+ (int) file_extend_method);
+ }
if (ret != 0)
{
ereport(ERROR,
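file_extend_method is evidently a new GUC selecting how relation files are grown; its definition is not part of this excerpt. From the constants used above, a plausible sketch (everything beyond the two constant names is a guess):

typedef enum FileExtendMethod
{
	FILE_EXTEND_METHOD_WRITE_ZEROS, /* extend by writing zero-filled blocks */
	FILE_EXTEND_METHOD_POSIX_FALLOCATE, /* extend via posix_fallocate() */
} FileExtendMethod;

extern int	file_extend_method; /* GUC, presumably enum-valued */

With FILE_EXTEND_METHOD_WRITE_ZEROS selected the outer condition is false, so mdzeroextend() presumably falls through to its pre-existing zero-writing path below this hunk.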
diff --git a/src/backend/tcop/backend_startup.c b/src/backend/tcop/backend_startup.c
index 94a7b839563..c517115927c 100644
--- a/src/backend/tcop/backend_startup.c
+++ b/src/backend/tcop/backend_startup.c
@@ -846,10 +846,9 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done)
if (strlen(port->user_name) >= NAMEDATALEN)
port->user_name[NAMEDATALEN - 1] = '\0';
+ Assert(MyBackendType == B_BACKEND || MyBackendType == B_DEAD_END_BACKEND);
if (am_walsender)
MyBackendType = B_WAL_SENDER;
- else
- MyBackendType = B_BACKEND;
/*
* Normal walsender backends, e.g. for streaming replication, are not
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index e54bf1e760f..21de158adbb 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -67,6 +67,7 @@
#include "storage/proc.h"
#include "storage/procsignal.h"
#include "storage/sinval.h"
+#include "storage/standby.h"
#include "tcop/backend_startup.h"
#include "tcop/fastpath.h"
#include "tcop/pquery.h"
@@ -155,10 +156,6 @@ static const char *userDoption = NULL; /* -D switch */
static bool EchoQuery = false; /* -E switch */
static bool UseSemiNewlineNewline = false; /* -j switch */
-/* whether or not, and why, we were canceled by conflict with recovery */
-static volatile sig_atomic_t RecoveryConflictPending = false;
-static volatile sig_atomic_t RecoveryConflictPendingReasons[NUM_PROCSIGNALS];
-
/* reused buffer to pass to SendRowDescriptionMessage() */
static MemoryContext row_description_context = NULL;
static StringInfoData row_description_buf;
@@ -175,7 +172,6 @@ static void forbidden_in_wal_sender(char firstchar);
static bool check_log_statement(List *stmt_list);
static int errdetail_execute(List *raw_parsetree_list);
static int errdetail_params(ParamListInfo params);
-static int errdetail_abort(void);
static void bind_param_error_callback(void *arg);
static void start_xact_command(void);
static void finish_xact_command(void);
@@ -183,6 +179,9 @@ static bool IsTransactionExitStmt(Node *parsetree);
static bool IsTransactionExitStmtList(List *pstmts);
static bool IsTransactionStmtList(List *pstmts);
static void drop_unnamed_stmt(void);
+static void ProcessRecoveryConflictInterrupts(void);
+static void ProcessRecoveryConflictInterrupt(RecoveryConflictReason reason);
+static void report_recovery_conflict(RecoveryConflictReason reason);
static void log_disconnections(int code, Datum arg);
static void enable_statement_timeout(void);
static void disable_statement_timeout(void);
@@ -1117,7 +1116,7 @@ exec_simple_query(const char *query_string)
/*
* Get the command name for use in status display (it also becomes the
- * default completion tag, down inside PortalRun). Set ps_status and
+ * default completion tag in PortalDefineQuery). Set ps_status and
* do any special start-of-SQL-command processing needed by the
* destination.
*/
@@ -1141,8 +1140,7 @@ exec_simple_query(const char *query_string)
ereport(ERROR,
(errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION),
errmsg("current transaction is aborted, "
- "commands ignored until end of transaction block"),
- errdetail_abort()));
+ "commands ignored until end of transaction block")));
/* Make sure we are in a transaction command */
start_xact_command();
@@ -1498,8 +1496,7 @@ exec_parse_message(const char *query_string, /* string to execute */
ereport(ERROR,
(errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION),
errmsg("current transaction is aborted, "
- "commands ignored until end of transaction block"),
- errdetail_abort()));
+ "commands ignored until end of transaction block")));
/*
* Create the CachedPlanSource before we do parse analysis, since it
@@ -1750,8 +1747,7 @@ exec_bind_message(StringInfo input_message)
ereport(ERROR,
(errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION),
errmsg("current transaction is aborted, "
- "commands ignored until end of transaction block"),
- errdetail_abort()));
+ "commands ignored until end of transaction block")));
/*
* Create the portal. Allow silent replacement of an existing portal only
@@ -2255,8 +2251,7 @@ exec_execute_message(const char *portal_name, long max_rows)
ereport(ERROR,
(errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION),
errmsg("current transaction is aborted, "
- "commands ignored until end of transaction block"),
- errdetail_abort()));
+ "commands ignored until end of transaction block")));
/* Check for cancel signal before we start execution */
CHECK_FOR_INTERRUPTS();
@@ -2536,54 +2531,40 @@ errdetail_params(ParamListInfo params)
return 0;
}
-/*
- * errdetail_abort
- *
- * Add an errdetail() line showing abort reason, if any.
- */
-static int
-errdetail_abort(void)
-{
- if (MyProc->recoveryConflictPending)
- errdetail("Abort reason: recovery conflict");
-
- return 0;
-}
-
/*
* errdetail_recovery_conflict
*
* Add an errdetail() line showing conflict source.
*/
static int
-errdetail_recovery_conflict(ProcSignalReason reason)
+errdetail_recovery_conflict(RecoveryConflictReason reason)
{
switch (reason)
{
- case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN:
+ case RECOVERY_CONFLICT_BUFFERPIN:
errdetail("User was holding shared buffer pin for too long.");
break;
- case PROCSIG_RECOVERY_CONFLICT_LOCK:
+ case RECOVERY_CONFLICT_LOCK:
errdetail("User was holding a relation lock for too long.");
break;
- case PROCSIG_RECOVERY_CONFLICT_TABLESPACE:
+ case RECOVERY_CONFLICT_TABLESPACE:
errdetail("User was or might have been using tablespace that must be dropped.");
break;
- case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT:
+ case RECOVERY_CONFLICT_SNAPSHOT:
errdetail("User query might have needed to see row versions that must be removed.");
break;
- case PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT:
+ case RECOVERY_CONFLICT_LOGICALSLOT:
errdetail("User was using a logical replication slot that must be invalidated.");
break;
- case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK:
+ case RECOVERY_CONFLICT_STARTUP_DEADLOCK:
+ errdetail("User transaction caused deadlock with recovery.");
+ break;
+ case RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK:
errdetail("User transaction caused buffer deadlock with recovery.");
break;
- case PROCSIG_RECOVERY_CONFLICT_DATABASE:
+ case RECOVERY_CONFLICT_DATABASE:
errdetail("User was connected to a database that must be dropped.");
break;
- default:
- break;
- /* no errdetail */
}
return 0;
@@ -2692,8 +2673,7 @@ exec_describe_statement_message(const char *stmt_name)
ereport(ERROR,
(errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION),
errmsg("current transaction is aborted, "
- "commands ignored until end of transaction block"),
- errdetail_abort()));
+ "commands ignored until end of transaction block")));
if (whereToSendOutput != DestRemote)
return; /* can't actually do anything... */
@@ -2769,8 +2749,7 @@ exec_describe_portal_message(const char *portal_name)
ereport(ERROR,
(errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION),
errmsg("current transaction is aborted, "
- "commands ignored until end of transaction block"),
- errdetail_abort()));
+ "commands ignored until end of transaction block")));
if (whereToSendOutput != DestRemote)
return; /* can't actually do anything... */
@@ -3088,15 +3067,14 @@ FloatExceptionHandler(SIGNAL_ARGS)
}
/*
- * Tell the next CHECK_FOR_INTERRUPTS() to check for a particular type of
- * recovery conflict. Runs in a SIGUSR1 handler.
+ * Tell the next CHECK_FOR_INTERRUPTS() to process recovery conflicts. Runs
+ * in a SIGUSR1 handler.
*/
void
-HandleRecoveryConflictInterrupt(ProcSignalReason reason)
+HandleRecoveryConflictInterrupt(void)
{
- RecoveryConflictPendingReasons[reason] = true;
- RecoveryConflictPending = true;
- InterruptPending = true;
+ if (pg_atomic_read_u32(&MyProc->pendingRecoveryConflicts) != 0)
+ InterruptPending = true;
/* latch will be set by procsignal_sigusr1_handler */
}
@@ -3104,49 +3082,73 @@ HandleRecoveryConflictInterrupt(ProcSignalReason reason)
* Check one individual conflict reason.
*/
static void
-ProcessRecoveryConflictInterrupt(ProcSignalReason reason)
+ProcessRecoveryConflictInterrupt(RecoveryConflictReason reason)
{
switch (reason)
{
- case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK:
+ case RECOVERY_CONFLICT_STARTUP_DEADLOCK:
/*
+ * The startup process is waiting on a lock held by us, and has
+ * asked us to check whether this is a deadlock (i.e., its deadlock
+ * timeout has expired).
+ *
* If we aren't waiting for a lock we can never deadlock.
*/
if (GetAwaitedLock() == NULL)
return;
- /* Intentional fall through to check wait for pin */
- /* FALLTHROUGH */
+ /* Set the flag so that ProcSleep() will check for deadlocks. */
+ CheckDeadLockAlert();
+ return;
- case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN:
+ case RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK:
/*
- * If PROCSIG_RECOVERY_CONFLICT_BUFFERPIN is requested but we
- * aren't blocking the Startup process there is nothing more to
- * do.
+ * The startup process is waiting on a buffer pin, and has
+ * asked us to check whether there is a deadlock involving the pin.
*
- * When PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK is requested,
- * if we're waiting for locks and the startup process is not
- * waiting for buffer pin (i.e., also waiting for locks), we set
- * the flag so that ProcSleep() will check for deadlocks.
+ * If we're not waiting on a lock, there can be no deadlock.
+ */
+ if (GetAwaitedLock() == NULL)
+ return;
+
+ /*
+ * If we're not holding the buffer pin, also no deadlock. (The
+ * startup process doesn't know who's holding the pin, and sends
+ * this signal to *all* backends, so this is the common case.)
*/
if (!HoldingBufferPinThatDelaysRecovery())
- {
- if (reason == PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK &&
- GetStartupBufferPinWaitBufId() < 0)
- CheckDeadLockAlert();
return;
- }
- MyProc->recoveryConflictPending = true;
+ /*
+ * Otherwise, we probably have a deadlock. Unfortunately the
+ * normal deadlock detector doesn't know about buffer pins, so we
+ * cannot perform a comprehensive deadlock check. Instead, we
+ * just assume that it is a deadlock if the above two conditions
+ * are met. In principle this can produce false positives, but
+ * they are rare in practice because sessions in a hot standby
+ * server rarely hold locks that can block other backends.
+ */
+ report_recovery_conflict(reason);
+ return;
+
+ case RECOVERY_CONFLICT_BUFFERPIN:
- /* Intentional fall through to error handling */
- /* FALLTHROUGH */
+ /*
+ * Someone is holding a buffer pin that the startup process is
+ * waiting for, and it got tired of waiting. If that's us, error
+ * out to release the pin.
+ */
+ if (!HoldingBufferPinThatDelaysRecovery())
+ return;
- case PROCSIG_RECOVERY_CONFLICT_LOCK:
- case PROCSIG_RECOVERY_CONFLICT_TABLESPACE:
- case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT:
+ report_recovery_conflict(reason);
+ return;
+
+ case RECOVERY_CONFLICT_LOCK:
+ case RECOVERY_CONFLICT_TABLESPACE:
+ case RECOVERY_CONFLICT_SNAPSHOT:
/*
* If we aren't in a transaction any longer then ignore.
@@ -3154,108 +3156,128 @@ ProcessRecoveryConflictInterrupt(ProcSignalReason reason)
if (!IsTransactionOrTransactionBlock())
return;
- /* FALLTHROUGH */
+ report_recovery_conflict(reason);
+ return;
- case PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT:
+ case RECOVERY_CONFLICT_LOGICALSLOT:
+ report_recovery_conflict(reason);
+ return;
- /*
- * If we're not in a subtransaction then we are OK to throw an
- * ERROR to resolve the conflict. Otherwise drop through to the
- * FATAL case.
- *
- * PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT is a special case that
- * always throws an ERROR (ie never promotes to FATAL), though it
- * still has to respect QueryCancelHoldoffCount, so it shares this
- * code path. Logical decoding slots are only acquired while
- * performing logical decoding. During logical decoding no user
- * controlled code is run. During [sub]transaction abort, the
- * slot is released. Therefore user controlled code cannot
- * intercept an error before the replication slot is released.
- *
- * XXX other times that we can throw just an ERROR *may* be
- * PROCSIG_RECOVERY_CONFLICT_LOCK if no locks are held in parent
- * transactions
- *
- * PROCSIG_RECOVERY_CONFLICT_SNAPSHOT if no snapshots are held by
- * parent transactions and the transaction is not
- * transaction-snapshot mode
- *
- * PROCSIG_RECOVERY_CONFLICT_TABLESPACE if no temp files or
- * cursors open in parent transactions
- */
- if (reason == PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT ||
- !IsSubTransaction())
- {
- /*
- * If we already aborted then we no longer need to cancel. We
- * do this here since we do not wish to ignore aborted
- * subtransactions, which must cause FATAL, currently.
- */
- if (IsAbortedTransactionBlockState())
- return;
+ case RECOVERY_CONFLICT_DATABASE:
- /*
- * If a recovery conflict happens while we are waiting for
- * input from the client, the client is presumably just
- * sitting idle in a transaction, preventing recovery from
- * making progress. We'll drop through to the FATAL case
- * below to dislodge it, in that case.
- */
- if (!DoingCommandRead)
- {
- /* Avoid losing sync in the FE/BE protocol. */
- if (QueryCancelHoldoffCount != 0)
- {
- /*
- * Re-arm and defer this interrupt until later. See
- * similar code in ProcessInterrupts().
- */
- RecoveryConflictPendingReasons[reason] = true;
- RecoveryConflictPending = true;
- InterruptPending = true;
- return;
- }
+ /* The database is being dropped; terminate the session */
+ report_recovery_conflict(reason);
+ return;
+ }
+ elog(FATAL, "unrecognized conflict mode: %d", (int) reason);
+}
- /*
- * We are cleared to throw an ERROR. Either it's the
- * logical slot case, or we have a top-level transaction
- * that we can abort and a conflict that isn't inherently
- * non-retryable.
- */
- LockErrorCleanup();
- pgstat_report_recovery_conflict(reason);
- ereport(ERROR,
- (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
- errmsg("canceling statement due to conflict with recovery"),
- errdetail_recovery_conflict(reason)));
- break;
- }
- }
+/*
+ * This transaction or session is conflicting with recovery and needs to be
+ * killed. Roll back the transaction, if that's sufficient, or terminate the
+ * connection, or do nothing if we're already in an aborted state.
+ */
+static void
+report_recovery_conflict(RecoveryConflictReason reason)
+{
+ bool fatal;
- /* Intentional fall through to session cancel */
- /* FALLTHROUGH */
+ if (reason == RECOVERY_CONFLICT_DATABASE)
+ {
+ /* note: no hint about reconnecting, and different errcode */
+ pgstat_report_recovery_conflict(reason);
+ ereport(FATAL,
+ (errcode(ERRCODE_DATABASE_DROPPED),
+ errmsg("terminating connection due to conflict with recovery"),
+ errdetail_recovery_conflict(reason)));
+ }
+ if (reason == RECOVERY_CONFLICT_LOGICALSLOT)
+ {
+ /*
+ * RECOVERY_CONFLICT_LOGICALSLOT is a special case that always throws
+ * an ERROR (ie never promotes to FATAL), though it still has to
+ * respect QueryCancelHoldoffCount, so it shares this code path.
+ * Logical decoding slots are only acquired while performing logical
+ * decoding. During logical decoding no user controlled code is run.
+ * During [sub]transaction abort, the slot is released. Therefore
+ * user controlled code cannot intercept an error before the
+ * replication slot is released.
+ */
+ fatal = false;
+ }
+ else
+ {
+ fatal = IsSubTransaction();
+ }
- case PROCSIG_RECOVERY_CONFLICT_DATABASE:
+ /*
+ * If we're not in a subtransaction then we are OK to throw an ERROR to
+ * resolve the conflict.
+ *
+ * XXX other times that we can throw just an ERROR *may* be
+ * RECOVERY_CONFLICT_LOCK if no locks are held in parent transactions
+ *
+ * RECOVERY_CONFLICT_SNAPSHOT if no snapshots are held by parent
+ * transactions and the transaction is not transaction-snapshot mode
+ *
+ * RECOVERY_CONFLICT_TABLESPACE if no temp files or cursors open in parent
+ * transactions
+ */
+ if (!fatal)
+ {
+ /*
+ * If we already aborted then we no longer need to cancel. We do this
+ * here since we do not wish to ignore aborted subtransactions, which
+ * must cause FATAL, currently.
+ */
+ if (IsAbortedTransactionBlockState())
+ return;
+
+ /*
+ * If a recovery conflict happens while we are waiting for input from
+ * the client, the client is presumably just sitting idle in a
+ * transaction, preventing recovery from making progress. We'll drop
+ * through to the FATAL case below to dislodge it, in that case.
+ */
+ if (!DoingCommandRead)
+ {
+ /* Avoid losing sync in the FE/BE protocol. */
+ if (QueryCancelHoldoffCount != 0)
+ {
+ /*
+ * Re-arm and defer this interrupt until later. See similar
+ * code in ProcessInterrupts().
+ */
+ (void) pg_atomic_fetch_or_u32(&MyProc->pendingRecoveryConflicts, (1 << reason));
+ InterruptPending = true;
+ return;
+ }
/*
- * Retrying is not possible because the database is dropped, or we
- * decided above that we couldn't resolve the conflict with an
- * ERROR and fell through. Terminate the session.
+ * We are cleared to throw an ERROR. Either it's the logical slot
+ * case, or we have a top-level transaction that we can abort and
+ * a conflict that isn't inherently non-retryable.
*/
+ LockErrorCleanup();
pgstat_report_recovery_conflict(reason);
- ereport(FATAL,
- (errcode(reason == PROCSIG_RECOVERY_CONFLICT_DATABASE ?
- ERRCODE_DATABASE_DROPPED :
- ERRCODE_T_R_SERIALIZATION_FAILURE),
- errmsg("terminating connection due to conflict with recovery"),
- errdetail_recovery_conflict(reason),
- errhint("In a moment you should be able to reconnect to the"
- " database and repeat your command.")));
- break;
-
- default:
- elog(FATAL, "unrecognized conflict mode: %d", (int) reason);
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("canceling statement due to conflict with recovery"),
+ errdetail_recovery_conflict(reason)));
+ }
}
+
+ /*
+ * We couldn't resolve the conflict with ERROR, so terminate the whole
+ * session.
+ */
+ pgstat_report_recovery_conflict(reason);
+ ereport(FATAL,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("terminating connection due to conflict with recovery"),
+ errdetail_recovery_conflict(reason),
+ errhint("In a moment you should be able to reconnect to the"
+ " database and repeat your command.")));
}
/*
@@ -3264,6 +3286,8 @@ ProcessRecoveryConflictInterrupt(ProcSignalReason reason)
static void
ProcessRecoveryConflictInterrupts(void)
{
+ uint32 pending;
+
/*
* We don't need to worry about joggling the elbow of proc_exit, because
* proc_exit_prepare() holds interrupts, so ProcessInterrupts() won't call
@@ -3271,17 +3295,27 @@ ProcessRecoveryConflictInterrupts(void)
*/
Assert(!proc_exit_inprogress);
Assert(InterruptHoldoffCount == 0);
- Assert(RecoveryConflictPending);
- RecoveryConflictPending = false;
+ /* Are any recovery conflicts pending? */
+ pending = pg_atomic_read_membarrier_u32(&MyProc->pendingRecoveryConflicts);
+ if (pending == 0)
+ return;
- for (ProcSignalReason reason = PROCSIG_RECOVERY_CONFLICT_FIRST;
- reason <= PROCSIG_RECOVERY_CONFLICT_LAST;
+ /*
+ * Check the conflicts one by one, clearing each flag just before
+ * processing the particular conflict. This ensures that if an error
+ * is thrown while processing one conflict, we will come back here to
+ * process the remaining pending conflicts.
+ */
+ for (RecoveryConflictReason reason = 0;
+ reason < NUM_RECOVERY_CONFLICT_REASONS;
reason++)
{
- if (RecoveryConflictPendingReasons[reason])
+ if ((pending & (1 << reason)) != 0)
{
- RecoveryConflictPendingReasons[reason] = false;
+ /* clear the flag */
+ (void) pg_atomic_fetch_and_u32(&MyProc->pendingRecoveryConflicts, ~(1 << reason));
+
ProcessRecoveryConflictInterrupt(reason);
}
}
@@ -3472,7 +3506,7 @@ ProcessInterrupts(void)
}
}
- if (RecoveryConflictPending)
+ if (pg_atomic_read_u32(&MyProc->pendingRecoveryConflicts) != 0)
ProcessRecoveryConflictInterrupts();
if (IdleInTransactionSessionTimeoutPending)
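The clear-then-process loop plus the atomic bitmask replaces the old sig_atomic_t flag array. The handshake that makes this lossless, summarized as a sketch (the barrier semantics are those documented for the pg_atomic_* read/modify operations):

/*
 * sender:   fetch_or(mask, 1 << reason);  SendProcSignal(...);
 * receiver: pending = read_membarrier(mask);      -- sees the bit
 *           fetch_and(mask, ~(1 << reason));      -- clear before processing
 *           ProcessRecoveryConflictInterrupt(reason); -- may ereport(ERROR)
 *
 * Because each bit is cleared before its conflict is processed, an ERROR
 * thrown mid-loop leaves the other bits set; ProcessInterrupts() will find
 * the mask nonzero and re-enter this function. A conflict re-signaled
 * after the clear simply sets its bit again.
 */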
diff --git a/src/backend/tsearch/dict_synonym.c b/src/backend/tsearch/dict_synonym.c
index 6dee28ae525..3937f25bcc6 100644
--- a/src/backend/tsearch/dict_synonym.c
+++ b/src/backend/tsearch/dict_synonym.c
@@ -50,7 +50,7 @@ findwrd(char *in, char **end, uint16 *flags)
/* Skip leading spaces */
while (*in && isspace((unsigned char) *in))
- in += pg_mblen(in);
+ in += pg_mblen_cstr(in);
/* Return NULL on empty lines */
if (*in == '\0')
@@ -65,7 +65,7 @@ findwrd(char *in, char **end, uint16 *flags)
while (*in && !isspace((unsigned char) *in))
{
lastchar = in;
- in += pg_mblen(in);
+ in += pg_mblen_cstr(in);
}
if (in - lastchar == 1 && t_iseq(lastchar, '*') && flags)
diff --git a/src/backend/tsearch/dict_thesaurus.c b/src/backend/tsearch/dict_thesaurus.c
index 7253f64e5f7..0fd4cf3dfa8 100644
--- a/src/backend/tsearch/dict_thesaurus.c
+++ b/src/backend/tsearch/dict_thesaurus.c
@@ -191,7 +191,7 @@ thesaurusRead(const char *filename, DictThesaurus *d)
/* is it a comment? */
while (*ptr && isspace((unsigned char) *ptr))
- ptr += pg_mblen(ptr);
+ ptr += pg_mblen_cstr(ptr);
if (t_iseq(ptr, '#') || *ptr == '\0' ||
t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
@@ -237,13 +237,13 @@ thesaurusRead(const char *filename, DictThesaurus *d)
{
useasis = true;
state = TR_INSUBS;
- beginwrd = ptr + pg_mblen(ptr);
+ beginwrd = ptr + pg_mblen_cstr(ptr);
}
else if (t_iseq(ptr, '\\'))
{
useasis = false;
state = TR_INSUBS;
- beginwrd = ptr + pg_mblen(ptr);
+ beginwrd = ptr + pg_mblen_cstr(ptr);
}
else if (!isspace((unsigned char) *ptr))
{
@@ -267,7 +267,7 @@ thesaurusRead(const char *filename, DictThesaurus *d)
else
elog(ERROR, "unrecognized thesaurus state: %d", state);
- ptr += pg_mblen(ptr);
+ ptr += pg_mblen_cstr(ptr);
}
if (state == TR_INSUBS)
diff --git a/src/backend/tsearch/regis.c b/src/backend/tsearch/regis.c
index 1c7d5c361f1..51ba78fabbc 100644
--- a/src/backend/tsearch/regis.c
+++ b/src/backend/tsearch/regis.c
@@ -37,7 +37,7 @@ RS_isRegis(const char *str)
{
if (state == RS_IN_WAIT)
{
- if (t_isalpha(c))
+ if (t_isalpha_cstr(c))
/* okay */ ;
else if (t_iseq(c, '['))
state = RS_IN_ONEOF;
@@ -48,14 +48,14 @@ RS_isRegis(const char *str)
{
if (t_iseq(c, '^'))
state = RS_IN_NONEOF;
- else if (t_isalpha(c))
+ else if (t_isalpha_cstr(c))
state = RS_IN_ONEOF_IN;
else
return false;
}
else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF)
{
- if (t_isalpha(c))
+ if (t_isalpha_cstr(c))
/* okay */ ;
else if (t_iseq(c, ']'))
state = RS_IN_WAIT;
@@ -64,7 +64,7 @@ RS_isRegis(const char *str)
}
else
elog(ERROR, "internal error in RS_isRegis: state %d", state);
- c += pg_mblen(c);
+ c += pg_mblen_cstr(c);
}
return (state == RS_IN_WAIT);
@@ -96,15 +96,14 @@ RS_compile(Regis *r, bool issuffix, const char *str)
{
if (state == RS_IN_WAIT)
{
- if (t_isalpha(c))
+ if (t_isalpha_cstr(c))
{
if (ptr)
ptr = newRegisNode(ptr, len);
else
ptr = r->node = newRegisNode(NULL, len);
- COPYCHAR(ptr->data, c);
ptr->type = RSF_ONEOF;
- ptr->len = pg_mblen(c);
+ ptr->len = ts_copychar_cstr(ptr->data, c);
}
else if (t_iseq(c, '['))
{
@@ -125,10 +124,9 @@ RS_compile(Regis *r, bool issuffix, const char *str)
ptr->type = RSF_NONEOF;
state = RS_IN_NONEOF;
}
- else if (t_isalpha(c))
+ else if (t_isalpha_cstr(c))
{
- COPYCHAR(ptr->data, c);
- ptr->len = pg_mblen(c);
+ ptr->len = ts_copychar_cstr(ptr->data, c);
state = RS_IN_ONEOF_IN;
}
else /* shouldn't get here */
@@ -136,11 +134,8 @@ RS_compile(Regis *r, bool issuffix, const char *str)
}
else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF)
{
- if (t_isalpha(c))
- {
- COPYCHAR(ptr->data + ptr->len, c);
- ptr->len += pg_mblen(c);
- }
+ if (t_isalpha_cstr(c))
+ ptr->len += ts_copychar_cstr(ptr->data + ptr->len, c);
else if (t_iseq(c, ']'))
state = RS_IN_WAIT;
else /* shouldn't get here */
@@ -148,7 +143,7 @@ RS_compile(Regis *r, bool issuffix, const char *str)
}
else
elog(ERROR, "internal error in RS_compile: state %d", state);
- c += pg_mblen(c);
+ c += pg_mblen_cstr(c);
}
if (state != RS_IN_WAIT) /* shouldn't get here */
@@ -187,10 +182,10 @@ mb_strchr(char *str, char *c)
char *ptr = str;
bool res = false;
- clen = pg_mblen(c);
+ clen = pg_mblen_cstr(c);
while (*ptr && !res)
{
- plen = pg_mblen(ptr);
+ plen = pg_mblen_cstr(ptr);
if (plen == clen)
{
i = plen;
@@ -219,7 +214,7 @@ RS_execute(Regis *r, char *str)
while (*c)
{
len++;
- c += pg_mblen(c);
+ c += pg_mblen_cstr(c);
}
if (len < r->nchar)
@@ -230,7 +225,7 @@ RS_execute(Regis *r, char *str)
{
len -= r->nchar;
while (len-- > 0)
- c += pg_mblen(c);
+ c += pg_mblen_cstr(c);
}
@@ -250,7 +245,7 @@ RS_execute(Regis *r, char *str)
elog(ERROR, "unrecognized regis node type: %d", ptr->type);
}
ptr = ptr->next;
- c += pg_mblen(c);
+ c += pg_mblen_cstr(c);
}
return true;
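These tsearch loops all switch from pg_mblen() to pg_mblen_cstr(), whose name and usage imply it never reads past a NUL terminator even when the leading byte announces a longer multibyte sequence. The shared loop shape, as a self-contained sketch (skip_word is a hypothetical helper; <ctype.h> and the pg_mblen_cstr declaration are assumed available):

#include <ctype.h>

static char *
skip_word(char *in)
{
	/* skip leading whitespace, one whole (possibly multibyte) char at a time */
	while (*in && isspace((unsigned char) *in))
		in += pg_mblen_cstr(in);
	/* then skip the word itself */
	while (*in && !isspace((unsigned char) *in))
		in += pg_mblen_cstr(in);
	return in;
}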
diff --git a/src/backend/tsearch/spell.c b/src/backend/tsearch/spell.c
index ad0ceec37b0..a1bfd2a9f9b 100644
--- a/src/backend/tsearch/spell.c
+++ b/src/backend/tsearch/spell.c
@@ -233,7 +233,7 @@ findchar(char *str, int c)
{
if (t_iseq(str, c))
return str;
- str += pg_mblen(str);
+ str += pg_mblen_cstr(str);
}
return NULL;
@@ -246,7 +246,7 @@ findchar2(char *str, int c1, int c2)
{
if (t_iseq(str, c1) || t_iseq(str, c2))
return str;
- str += pg_mblen(str);
+ str += pg_mblen_cstr(str);
}
return NULL;
@@ -353,6 +353,7 @@ getNextFlagFromString(IspellDict *Conf, const char **sflagset, char *sflag)
char *next;
const char *sbuf = *sflagset;
int maxstep;
+ int clen;
bool stop = false;
bool met_comma = false;
@@ -364,11 +365,11 @@ getNextFlagFromString(IspellDict *Conf, const char **sflagset, char *sflag)
{
case FM_LONG:
case FM_CHAR:
- COPYCHAR(sflag, *sflagset);
- sflag += pg_mblen(*sflagset);
+ clen = ts_copychar_cstr(sflag, *sflagset);
+ sflag += clen;
/* Go to start of the next flag */
- *sflagset += pg_mblen(*sflagset);
+ *sflagset += clen;
/* Check if we get all characters of flag */
maxstep--;
@@ -418,7 +419,7 @@ getNextFlagFromString(IspellDict *Conf, const char **sflagset, char *sflag)
*sflagset)));
}
- *sflagset += pg_mblen(*sflagset);
+ *sflagset += pg_mblen_cstr(*sflagset);
}
stop = true;
break;
@@ -544,7 +545,7 @@ NIImportDictionary(IspellDict *Conf, const char *filename)
while (*s)
{
/* we allow only single encoded flags for faster works */
- if (pg_mblen(s) == 1 && isprint((unsigned char) *s) && !isspace((unsigned char) *s))
+ if (pg_mblen_cstr(s) == 1 && isprint((unsigned char) *s) && !isspace((unsigned char) *s))
s++;
else
{
@@ -565,7 +566,7 @@ NIImportDictionary(IspellDict *Conf, const char *filename)
*s = '\0';
break;
}
- s += pg_mblen(s);
+ s += pg_mblen_cstr(s);
}
pstr = lowerstr_ctx(Conf, line);
@@ -797,17 +798,17 @@ get_nextfield(char **str, char *next)
while (**str)
{
+ int clen = pg_mblen_cstr(*str);
+
if (state == PAE_WAIT_MASK)
{
if (t_iseq(*str, '#'))
return false;
else if (!isspace((unsigned char) **str))
{
- int clen = pg_mblen(*str);
-
if (clen < avail)
{
- COPYCHAR(next, *str);
+ ts_copychar_with_len(next, *str, clen);
next += clen;
avail -= clen;
}
@@ -823,17 +824,15 @@ get_nextfield(char **str, char *next)
}
else
{
- int clen = pg_mblen(*str);
-
if (clen < avail)
{
- COPYCHAR(next, *str);
+ ts_copychar_with_len(next, *str, clen);
next += clen;
avail -= clen;
}
}
}
- *str += pg_mblen(*str);
+ *str += clen;
}
*next = '\0';
@@ -923,14 +922,15 @@ parse_affentry(char *str, char *mask, char *find, char *repl)
while (*str)
{
+ int clen = pg_mblen_cstr(str);
+
if (state == PAE_WAIT_MASK)
{
if (t_iseq(str, '#'))
return false;
else if (!isspace((unsigned char) *str))
{
- COPYCHAR(pmask, str);
- pmask += pg_mblen(str);
+ pmask += ts_copychar_with_len(pmask, str, clen);
state = PAE_INMASK;
}
}
@@ -943,8 +943,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl)
}
else if (!isspace((unsigned char) *str))
{
- COPYCHAR(pmask, str);
- pmask += pg_mblen(str);
+ pmask += ts_copychar_with_len(pmask, str, clen);
}
}
else if (state == PAE_WAIT_FIND)
@@ -953,10 +952,9 @@ parse_affentry(char *str, char *mask, char *find, char *repl)
{
state = PAE_INFIND;
}
- else if (t_isalpha(str) || t_iseq(str, '\'') /* english 's */ )
+ else if (t_isalpha_cstr(str) || t_iseq(str, '\'') /* english 's */ )
{
- COPYCHAR(prepl, str);
- prepl += pg_mblen(str);
+ prepl += ts_copychar_with_len(prepl, str, clen);
state = PAE_INREPL;
}
else if (!isspace((unsigned char) *str))
@@ -971,10 +969,9 @@ parse_affentry(char *str, char *mask, char *find, char *repl)
*pfind = '\0';
state = PAE_WAIT_REPL;
}
- else if (t_isalpha(str))
+ else if (t_isalpha_cstr(str))
{
- COPYCHAR(pfind, str);
- pfind += pg_mblen(str);
+ pfind += ts_copychar_with_len(pfind, str, clen);
}
else if (!isspace((unsigned char) *str))
ereport(ERROR,
@@ -987,10 +984,9 @@ parse_affentry(char *str, char *mask, char *find, char *repl)
{
break; /* void repl */
}
- else if (t_isalpha(str))
+ else if (t_isalpha_cstr(str))
{
- COPYCHAR(prepl, str);
- prepl += pg_mblen(str);
+ prepl += ts_copychar_with_len(prepl, str, clen);
state = PAE_INREPL;
}
else if (!isspace((unsigned char) *str))
@@ -1005,10 +1001,9 @@ parse_affentry(char *str, char *mask, char *find, char *repl)
*prepl = '\0';
break;
}
- else if (t_isalpha(str))
+ else if (t_isalpha_cstr(str))
{
- COPYCHAR(prepl, str);
- prepl += pg_mblen(str);
+ prepl += ts_copychar_with_len(prepl, str, clen);
}
else if (!isspace((unsigned char) *str))
ereport(ERROR,
@@ -1018,7 +1013,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl)
else
elog(ERROR, "unrecognized state in parse_affentry: %d", state);
- str += pg_mblen(str);
+ str += clen;
}
*pmask = *pfind = *prepl = '\0';
@@ -1071,10 +1066,9 @@ addCompoundAffixFlagValue(IspellDict *Conf, char *s, uint32 val)
CompoundAffixFlag *newValue;
char sbuf[BUFSIZ];
char *sflag;
- int clen;
while (*s && isspace((unsigned char) *s))
- s += pg_mblen(s);
+ s += pg_mblen_cstr(s);
if (!*s)
ereport(ERROR,
@@ -1085,8 +1079,8 @@ addCompoundAffixFlagValue(IspellDict *Conf, char *s, uint32 val)
sflag = sbuf;
while (*s && !isspace((unsigned char) *s) && *s != '\n')
{
- clen = pg_mblen(s);
- COPYCHAR(sflag, s);
+ int clen = ts_copychar_cstr(sflag, s);
+
sflag += clen;
s += clen;
}
@@ -1267,7 +1261,7 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
char *s = recoded + strlen("FLAG");
while (*s && isspace((unsigned char) *s))
- s += pg_mblen(s);
+ s += pg_mblen_cstr(s);
if (*s)
{
@@ -1466,11 +1460,11 @@ NIImportAffixes(IspellDict *Conf, const char *filename)
if (s)
{
while (*s && !isspace((unsigned char) *s))
- s += pg_mblen(s);
+ s += pg_mblen_cstr(s);
while (*s && isspace((unsigned char) *s))
- s += pg_mblen(s);
+ s += pg_mblen_cstr(s);
- if (*s && pg_mblen(s) == 1)
+ if (*s && pg_mblen_cstr(s) == 1)
{
addCompoundAffixFlagValue(Conf, s, FF_COMPOUNDFLAG);
Conf->usecompound = true;
@@ -1499,7 +1493,7 @@ NIImportAffixes(IspellDict *Conf, const char *filename)
flagflags = 0;
while (*s && isspace((unsigned char) *s))
- s += pg_mblen(s);
+ s += pg_mblen_cstr(s);
if (*s == '*')
{
@@ -1520,12 +1514,11 @@ NIImportAffixes(IspellDict *Conf, const char *filename)
* be followed by EOL, whitespace, or ':'. Otherwise this is a
* new-format flag command.
*/
- if (*s && pg_mblen(s) == 1)
+ if (*s && pg_mblen_cstr(s) == 1)
{
- COPYCHAR(flag, s);
+ flag[0] = *s++;
flag[1] = '\0';
- s++;
if (*s == '\0' || *s == '#' || *s == '\n' || *s == ':' ||
isspace((unsigned char) *s))
{
diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c
index 1e98f321957..df02ffb12fd 100644
--- a/src/backend/tsearch/ts_locale.c
+++ b/src/backend/tsearch/ts_locale.c
@@ -23,32 +23,40 @@ static void tsearch_readline_callback(void *arg);
/* space for a single character plus a trailing NUL */
#define WC_BUF_LEN 2
-int
-t_isalpha(const char *ptr)
-{
- pg_wchar wstr[WC_BUF_LEN];
- int wlen pg_attribute_unused();
-
- wlen = pg_mb2wchar_with_len(ptr, wstr, pg_mblen(ptr));
- Assert(wlen <= 1);
-
- /* pass single character, or NUL if empty */
- return pg_iswalpha(wstr[0], pg_database_locale());
-}
-
-int
-t_isalnum(const char *ptr)
-{
- pg_wchar wstr[WC_BUF_LEN];
- int wlen pg_attribute_unused();
-
- wlen = pg_mb2wchar_with_len(ptr, wstr, pg_mblen(ptr));
- Assert(wlen <= 1);
-
- /* pass single character, or NUL if empty */
- return pg_iswalnum(wstr[0], pg_database_locale());
+#define GENERATE_T_ISCLASS_DEF(character_class) \
+/* mblen shall be that of the first character */ \
+int \
+t_is##character_class##_with_len(const char *ptr, int mblen) \
+{ \
+ pg_wchar wstr[WC_BUF_LEN]; \
+ int wlen pg_attribute_unused(); \
+ wlen = pg_mb2wchar_with_len(ptr, wstr, mblen); \
+ Assert(wlen <= 1); \
+ /* pass single character, or NUL if empty */ \
+ return pg_isw##character_class(wstr[0], pg_database_locale()); \
+} \
+\
+/* ptr shall point to a NUL-terminated string */ \
+int \
+t_is##character_class##_cstr(const char *ptr) \
+{ \
+ return t_is##character_class##_with_len(ptr, pg_mblen_cstr(ptr)); \
+} \
+/* ptr shall point to a string with pre-validated encoding */ \
+int \
+t_is##character_class##_unbounded(const char *ptr) \
+{ \
+ return t_is##character_class##_with_len(ptr, pg_mblen_unbounded(ptr)); \
+} \
+/* historical name for _unbounded */ \
+int \
+t_is##character_class(const char *ptr) \
+{ \
+ return t_is##character_class##_unbounded(ptr); \
}
+GENERATE_T_ISCLASS_DEF(alnum)
+GENERATE_T_ISCLASS_DEF(alpha)
/*
* Set up to read a file using tsearch_readline(). This facility is
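For reference, GENERATE_T_ISCLASS_DEF(alpha) expands to four functions that differ only in how the first character's byte length is obtained:

int	t_isalpha_with_len(const char *ptr, int mblen); /* caller supplies length */
int	t_isalpha_cstr(const char *ptr);	/* NUL-terminated input */
int	t_isalpha_unbounded(const char *ptr);	/* encoding already validated */
int	t_isalpha(const char *ptr); /* historical alias for _unbounded */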
diff --git a/src/backend/tsearch/ts_selfuncs.c b/src/backend/tsearch/ts_selfuncs.c
index 5afa6e4bad8..64b60bb9513 100644
--- a/src/backend/tsearch/ts_selfuncs.c
+++ b/src/backend/tsearch/ts_selfuncs.c
@@ -108,12 +108,14 @@ tsmatchsel(PG_FUNCTION_ARGS)
* OK, there's a Var and a Const we're dealing with here. We need the
* Const to be a TSQuery, else we can't do anything useful. We have to
* check this because the Var might be the TSQuery not the TSVector.
+ *
+ * Also check that the Var really is a TSVector, in case this estimator is
+ * mistakenly attached to some other operator.
*/
- if (((Const *) other)->consttype == TSQUERYOID)
+ if (((Const *) other)->consttype == TSQUERYOID &&
+ vardata.vartype == TSVECTOROID)
{
/* tsvector @@ tsquery or the other way around */
- Assert(vardata.vartype == TSVECTOROID);
-
selec = tsquerysel(&vardata, ((Const *) other)->constvalue);
}
else
diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c
index 0c513d694e7..48ee050e37f 100644
--- a/src/backend/tsearch/ts_typanalyze.c
+++ b/src/backend/tsearch/ts_typanalyze.c
@@ -444,7 +444,7 @@ compute_tsvector_stats(VacAttrStats *stats,
stats->statypid[0] = TEXTOID;
stats->statyplen[0] = -1; /* typlen, -1 for varlena */
stats->statypbyval[0] = false;
- stats->statypalign[0] = 'i';
+ stats->statypalign[0] = TYPALIGN_INT;
}
}
else
diff --git a/src/backend/tsearch/ts_utils.c b/src/backend/tsearch/ts_utils.c
index 9072d22423f..52cf65533e4 100644
--- a/src/backend/tsearch/ts_utils.c
+++ b/src/backend/tsearch/ts_utils.c
@@ -90,7 +90,7 @@ readstoplist(const char *fname, StopList *s, char *(*wordop) (const char *, size
/* Trim trailing space */
while (*pbuf && !isspace((unsigned char) *pbuf))
- pbuf += pg_mblen(pbuf);
+ pbuf += pg_mblen_cstr(pbuf);
*pbuf = '\0';
/* Skip empty lines */
diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c
index bfe8aa7fbce..8b9b34e762a 100644
--- a/src/backend/tsearch/wparser_def.c
+++ b/src/backend/tsearch/wparser_def.c
@@ -1683,7 +1683,8 @@ TParserGet(TParser *prs)
prs->state->charlen = 0;
else
prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
- pg_mblen(prs->str + prs->state->posbyte);
+ pg_mblen_range(prs->str + prs->state->posbyte,
+ prs->str + prs->lenstr);
Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
diff --git a/src/backend/utils/.gitignore b/src/backend/utils/.gitignore
index 303c01d0515..fa9cfb39693 100644
--- a/src/backend/utils/.gitignore
+++ b/src/backend/utils/.gitignore
@@ -5,3 +5,6 @@
/guc_tables.inc.c
/probes.h
/errcodes.h
+/pgstat_wait_event.c
+/wait_event_funcs_data.c
+/wait_event_types.h
diff --git a/src/backend/utils/Makefile b/src/backend/utils/Makefile
index 6df31504f32..81b4a956bda 100644
--- a/src/backend/utils/Makefile
+++ b/src/backend/utils/Makefile
@@ -43,7 +43,7 @@ generated-header-symlinks: $(top_builddir)/src/include/utils/header-stamp submak
submake-adt-headers:
$(MAKE) -C adt jsonpath_gram.h
-$(SUBDIRS:%=%-recursive): fmgr-stamp errcodes.h guc_tables.inc.c
+$(SUBDIRS:%=%-recursive): fmgr-stamp errcodes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_types.h
# fmgr-stamp records the last time we ran Gen_fmgrtab.pl. We don't rely on
# the timestamps of the individual output files, because the Perl script
@@ -58,6 +58,12 @@ errcodes.h: $(top_srcdir)/src/backend/utils/errcodes.txt generate-errcodes.pl
guc_tables.inc.c: $(top_srcdir)/src/backend/utils/misc/guc_parameters.dat $(top_srcdir)/src/backend/utils/misc/gen_guc_tables.pl
$(PERL) $(top_srcdir)/src/backend/utils/misc/gen_guc_tables.pl $< $@
+pgstat_wait_event.c: wait_event_types.h
+wait_event_funcs_data.c: wait_event_types.h
+
+wait_event_types.h: $(top_srcdir)/src/backend/utils/activity/wait_event_names.txt $(top_srcdir)/src/backend/utils/activity/generate-wait_event_types.pl
+ $(PERL) $(top_srcdir)/src/backend/utils/activity/generate-wait_event_types.pl --code $<
+
ifeq ($(enable_dtrace), yes)
probes.h: postprocess_dtrace.sed probes.h.tmp
sed -f $^ >$@
@@ -73,8 +79,8 @@ endif
# These generated headers must be symlinked into src/include/.
# We use header-stamp to record that we've done this because the symlinks
# themselves may appear older than fmgr-stamp.
-$(top_builddir)/src/include/utils/header-stamp: fmgr-stamp errcodes.h probes.h guc_tables.inc.c
- cd '$(dir $@)' && for file in fmgroids.h fmgrprotos.h errcodes.h probes.h guc_tables.inc.c; do \
+$(top_builddir)/src/include/utils/header-stamp: fmgr-stamp errcodes.h probes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_types.h
+ cd '$(dir $@)' && for file in fmgroids.h fmgrprotos.h errcodes.h probes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_types.h; do \
rm -f $$file && $(LN_S) "../../../$(subdir)/$$file" . ; \
done
touch $@
@@ -93,3 +99,4 @@ uninstall-data:
clean:
rm -f probes.h probes.h.tmp
rm -f fmgroids.h fmgrprotos.h fmgrtab.c fmgr-stamp errcodes.h guc_tables.inc.c
+ rm -f wait_event_types.h pgstat_wait_event.c wait_event_funcs_data.c
diff --git a/src/backend/utils/activity/.gitignore b/src/backend/utils/activity/.gitignore
deleted file mode 100644
index bd0c0c77729..00000000000
--- a/src/backend/utils/activity/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-/pgstat_wait_event.c
-/wait_event_types.h
-/wait_event_funcs_data.c
diff --git a/src/backend/utils/activity/Makefile b/src/backend/utils/activity/Makefile
index 0eb29ee78aa..c37bfb350bb 100644
--- a/src/backend/utils/activity/Makefile
+++ b/src/backend/utils/activity/Makefile
@@ -36,17 +36,8 @@ OBJS = \
wait_event.o \
wait_event_funcs.o
-include $(top_srcdir)/src/backend/common.mk
-
-wait_event_funcs.o: wait_event_funcs_data.c
-wait_event_funcs_data.c: wait_event_types.h
-
-wait_event.o: pgstat_wait_event.c
-pgstat_wait_event.c: wait_event_types.h
- touch $@
+# Force these dependencies to be known even without dependency info built:
+wait_event.o: wait_event.c $(top_builddir)/src/backend/utils/pgstat_wait_event.c
+wait_event_funcs.o: wait_event_funcs.c $(top_builddir)/src/backend/utils/wait_event_funcs_data.c
-wait_event_types.h: $(top_srcdir)/src/backend/utils/activity/wait_event_names.txt generate-wait_event_types.pl
- $(PERL) $(srcdir)/generate-wait_event_types.pl --code $<
-
-clean:
- rm -f wait_event_types.h pgstat_wait_event.c wait_event_funcs_data.c
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/activity/backend_status.c b/src/backend/utils/activity/backend_status.c
index c84e6536580..cd087129469 100644
--- a/src/backend/utils/activity/backend_status.c
+++ b/src/backend/utils/activity/backend_status.c
@@ -1164,31 +1164,6 @@ pgstat_get_my_plan_id(void)
return MyBEEntry->st_plan_id;
}
-/* ----------
- * pgstat_get_backend_type_by_proc_number() -
- *
- * Return the type of the backend with the specified ProcNumber. This looks
- * directly at the BackendStatusArray, so the return value may be out of date.
- * The only current use of this function is in pg_signal_backend(), which is
- * inherently racy, so we don't worry too much about this.
- *
- * It is the caller's responsibility to use this wisely; at minimum, callers
- * should ensure that procNumber is valid and perform the required permissions
- * checks.
- * ----------
- */
-BackendType
-pgstat_get_backend_type_by_proc_number(ProcNumber procNumber)
-{
- volatile PgBackendStatus *status = &BackendStatusArray[procNumber];
-
- /*
- * We bypass the changecount mechanism since fetching and storing an int
- * is almost certainly atomic.
- */
- return status->st_backendType;
-}
-
/* ----------
* cmp_lbestatus
*
diff --git a/src/backend/utils/activity/meson.build b/src/backend/utils/activity/meson.build
index 9f48d5970e1..53bd5a246ca 100644
--- a/src/backend/utils/activity/meson.build
+++ b/src/backend/utils/activity/meson.build
@@ -30,7 +30,6 @@ waitevent_sources = files(
wait_event = static_library('wait_event_names',
waitevent_sources,
dependencies: [backend_code],
- include_directories: include_directories('../../../include/utils'),
kwargs: internal_lib_args,
)
diff --git a/src/backend/utils/activity/pgstat_backend.c b/src/backend/utils/activity/pgstat_backend.c
index 1350f5f62f1..f2f8d3ff75f 100644
--- a/src/backend/utils/activity/pgstat_backend.c
+++ b/src/backend/utils/activity/pgstat_backend.c
@@ -326,7 +326,7 @@ pgstat_create_backend(ProcNumber procnum)
PgStatShared_Backend *shstatent;
entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_BACKEND, InvalidOid,
- MyProcNumber, false);
+ procnum, false);
shstatent = (PgStatShared_Backend *) entry_ref->shared_stats;
/*
diff --git a/src/backend/utils/activity/pgstat_database.c b/src/backend/utils/activity/pgstat_database.c
index d7f6d4c5ee6..6309909bcd0 100644
--- a/src/backend/utils/activity/pgstat_database.c
+++ b/src/backend/utils/activity/pgstat_database.c
@@ -17,7 +17,7 @@
#include "postgres.h"
-#include "storage/procsignal.h"
+#include "storage/standby.h"
#include "utils/pgstat_internal.h"
#include "utils/timestamp.h"
@@ -88,31 +88,41 @@ pgstat_report_recovery_conflict(int reason)
dbentry = pgstat_prep_database_pending(MyDatabaseId);
- switch (reason)
+ switch ((RecoveryConflictReason) reason)
{
- case PROCSIG_RECOVERY_CONFLICT_DATABASE:
+ case RECOVERY_CONFLICT_DATABASE:
/*
* Since we drop the information about the database as soon as it
* replicates, there is no point in counting these conflicts.
*/
break;
- case PROCSIG_RECOVERY_CONFLICT_TABLESPACE:
+ case RECOVERY_CONFLICT_TABLESPACE:
dbentry->conflict_tablespace++;
break;
- case PROCSIG_RECOVERY_CONFLICT_LOCK:
+ case RECOVERY_CONFLICT_LOCK:
dbentry->conflict_lock++;
break;
- case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT:
+ case RECOVERY_CONFLICT_SNAPSHOT:
dbentry->conflict_snapshot++;
break;
- case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN:
+ case RECOVERY_CONFLICT_BUFFERPIN:
dbentry->conflict_bufferpin++;
break;
- case PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT:
+ case RECOVERY_CONFLICT_LOGICALSLOT:
dbentry->conflict_logicalslot++;
break;
- case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK:
+ case RECOVERY_CONFLICT_STARTUP_DEADLOCK:
+ dbentry->conflict_startup_deadlock++;
+ break;
+ case RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK:
+
+ /*
+ * The difference between RECOVERY_CONFLICT_STARTUP_DEADLOCK and
+ * RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK is merely whether a buffer
+ * pin was part of the deadlock. We use the same counter for both
+ * reasons.
+ */
dbentry->conflict_startup_deadlock++;
break;
}
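Note on the hunk above: casting the int reason to the enum type before switching lets compilers that implement -Wswitch warn when a new RecoveryConflictReason member is added but not handled. A minimal standalone sketch of the idiom (enum values invented for illustration):

#include <stdio.h>

typedef enum ToyConflictReason
{
    TOY_CONFLICT_LOCK,
    TOY_CONFLICT_SNAPSHOT,
} ToyConflictReason;

static const char *
describe_conflict(int reason)
{
    /* The cast makes -Wswitch flag any enum member missing a case. */
    switch ((ToyConflictReason) reason)
    {
        case TOY_CONFLICT_LOCK:
            return "lock";
        case TOY_CONFLICT_SNAPSHOT:
            return "snapshot";
    }
    return "unknown";            /* unreachable for valid input */
}

int
main(void)
{
    printf("%s\n", describe_conflict(TOY_CONFLICT_SNAPSHOT));
    return 0;
}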
diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c
index e4f2c440257..aca2c8fc742 100644
--- a/src/backend/utils/activity/wait_event.c
+++ b/src/backend/utils/activity/wait_event.c
@@ -503,4 +503,4 @@ pgstat_get_wait_event(uint32 wait_event_info)
return event_name;
}
-#include "pgstat_wait_event.c"
+#include "utils/pgstat_wait_event.c"
diff --git a/src/backend/utils/activity/wait_event_funcs.c b/src/backend/utils/activity/wait_event_funcs.c
index b62ee83ef73..fa10a80b088 100644
--- a/src/backend/utils/activity/wait_event_funcs.c
+++ b/src/backend/utils/activity/wait_event_funcs.c
@@ -31,7 +31,7 @@ static const struct
waitEventData[] =
{
-#include "wait_event_funcs_data.c"
+#include "utils/wait_event_funcs_data.c"
/* end of list */
{NULL, NULL, NULL}
};
diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt
index 5537a2d2530..4aa864fe3c3 100644
--- a/src/backend/utils/activity/wait_event_names.txt
+++ b/src/backend/utils/activity/wait_event_names.txt
@@ -14,13 +14,13 @@
#
# The files generated from this one are:
#
-# src/backend/utils/activity/wait_event_types.h
+# wait_event_types.h
# typedef enum definitions for wait events.
#
-# src/backend/utils/activity/pgstat_wait_event.c
+# pgstat_wait_event.c
# C functions to get the wait event name based on the enum.
#
-# src/backend/utils/activity/wait_event_types.sgml
+# wait_event_types.sgml
# SGML tables of wait events for inclusion in the documentation.
#
# When adding a new wait event, make sure it is placed in the appropriate
@@ -213,6 +213,8 @@ CONTROL_FILE_WRITE_UPDATE "Waiting for a write to update the pg_contro
COPY_FILE_COPY "Waiting for a file copy operation."
COPY_FILE_READ "Waiting for a read during a file copy operation."
COPY_FILE_WRITE "Waiting for a write during a file copy operation."
+COPY_FROM_READ "Waiting to read data from a pipe, a file or a program during COPY FROM."
+COPY_TO_WRITE "Waiting to write data to a pipe, a file or a program during COPY TO."
DATA_FILE_EXTEND "Waiting for a relation data file to be extended."
DATA_FILE_FLUSH "Waiting for a relation data file to reach durable storage."
DATA_FILE_IMMEDIATE_SYNC "Waiting for an immediate synchronization of a relation data file to durable storage."
diff --git a/src/backend/utils/adt/array_expanded.c b/src/backend/utils/adt/array_expanded.c
index 01e3dddcbbb..7e8352af52b 100644
--- a/src/backend/utils/adt/array_expanded.c
+++ b/src/backend/utils/adt/array_expanded.c
@@ -238,6 +238,7 @@ EA_get_flat_size(ExpandedObjectHeader *eohptr)
Datum *dvalues;
bool *dnulls;
Size nbytes;
+ uint8 typalignby;
int i;
Assert(eah->ea_magic == EA_MAGIC);
@@ -261,12 +262,13 @@ EA_get_flat_size(ExpandedObjectHeader *eohptr)
dvalues = eah->dvalues;
dnulls = eah->dnulls;
nbytes = 0;
+ typalignby = typalign_to_alignby(eah->typalign);
for (i = 0; i < nelems; i++)
{
if (dnulls && dnulls[i])
continue;
nbytes = att_addlength_datum(nbytes, eah->typlen, dvalues[i]);
- nbytes = att_align_nominal(nbytes, eah->typalign);
+ nbytes = att_nominal_alignby(nbytes, typalignby);
/* check for overflow of total request */
if (!AllocSizeIsValid(nbytes))
ereport(ERROR,
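Note on the hunk above (and the many like it below): the new typalignby locals cache typalign_to_alignby() once per call, so the per-element alignment in the loop works on a plain byte count. A sketch of what such a mapping and the power-of-two round-up could look like; the alignment values are assumptions, and the real definitions live in the backend headers:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* pg_type.typalign stores one of these single-character codes. */
#define TYPALIGN_CHAR   'c'
#define TYPALIGN_SHORT  's'
#define TYPALIGN_INT    'i'
#define TYPALIGN_DOUBLE 'd'

/* Assumed mapping from alignment code to alignment in bytes. */
static uint8_t
toy_typalign_to_alignby(char typalign)
{
    switch (typalign)
    {
        case TYPALIGN_CHAR:   return 1;
        case TYPALIGN_SHORT:  return 2;
        case TYPALIGN_INT:    return 4;
        case TYPALIGN_DOUBLE: return 8;
    }
    assert(!"unrecognized typalign code");
    return 1;
}

/* att_nominal_alignby-style round-up; valid because alignby is a power of 2. */
static size_t
toy_align_up(size_t len, uint8_t alignby)
{
    return (len + alignby - 1) & ~(size_t) (alignby - 1);
}

int
main(void)
{
    uint8_t alignby = toy_typalign_to_alignby(TYPALIGN_INT);

    printf("%zu\n", toy_align_up(5, alignby));   /* prints 8 */
    return 0;
}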
diff --git a/src/backend/utils/adt/arrayfuncs.c b/src/backend/utils/adt/arrayfuncs.c
index e71d32773b5..734e5fea45e 100644
--- a/src/backend/utils/adt/arrayfuncs.c
+++ b/src/backend/utils/adt/arrayfuncs.c
@@ -75,6 +75,7 @@ typedef struct ArrayIteratorData
int16 typlen; /* element type's length */
bool typbyval; /* element type's byval property */
char typalign; /* element type's align property */
+ uint8 typalignby; /* typalign mapped to numeric alignment */
/* information about the requested slice size */
int slice_ndim; /* slice dimension, or 0 if not slicing */
@@ -123,7 +124,7 @@ static bool array_get_isnull(const bits8 *nullbitmap, int offset);
static void array_set_isnull(bits8 *nullbitmap, int offset, bool isNull);
static Datum ArrayCast(char *value, bool byval, int len);
static int ArrayCastAndSet(Datum src,
- int typlen, bool typbyval, char typalign,
+ int typlen, bool typbyval, uint8 typalignby,
char *dest);
static char *array_seek(char *ptr, int offset, bits8 *nullbitmap, int nitems,
int typlen, bool typbyval, char typalign);
@@ -187,6 +188,7 @@ array_in(PG_FUNCTION_ARGS)
int typlen;
bool typbyval;
char typalign;
+ uint8 typalignby;
char typdelim;
Oid typioparam;
char *p;
@@ -232,6 +234,7 @@ array_in(PG_FUNCTION_ARGS)
typlen = my_extra->typlen;
typbyval = my_extra->typbyval;
typalign = my_extra->typalign;
+ typalignby = typalign_to_alignby(typalign);
typdelim = my_extra->typdelim;
typioparam = my_extra->typioparam;
@@ -328,7 +331,7 @@ array_in(PG_FUNCTION_ARGS)
if (typlen == -1)
values[i] = PointerGetDatum(PG_DETOAST_DATUM(values[i]));
nbytes = att_addlength_datum(nbytes, typlen, values[i]);
- nbytes = att_align_nominal(nbytes, typalign);
+ nbytes = att_nominal_alignby(nbytes, typalignby);
/* check for overflow of total request */
if (!AllocSizeIsValid(nbytes))
ereturn(escontext, (Datum) 0,
@@ -972,6 +975,7 @@ CopyArrayEls(ArrayType *array,
bits8 *bitmap = ARR_NULLBITMAP(array);
int bitval = 0;
int bitmask = 1;
+ uint8 typalignby = typalign_to_alignby(typalign);
int i;
if (typbyval)
@@ -988,7 +992,7 @@ CopyArrayEls(ArrayType *array,
else
{
bitval |= bitmask;
- p += ArrayCastAndSet(values[i], typlen, typbyval, typalign, p);
+ p += ArrayCastAndSet(values[i], typlen, typbyval, typalignby, p);
if (freedata)
pfree(DatumGetPointer(values[i]));
}
@@ -1112,7 +1116,7 @@ array_out(PG_FUNCTION_ARGS)
needquotes = (bool *) palloc(nitems * sizeof(bool));
overall_length = 0;
- array_iter_setup(&iter, v);
+ array_iter_setup(&iter, v, typlen, typbyval, typalign);
for (i = 0; i < nitems; i++)
{
@@ -1121,8 +1125,7 @@ array_out(PG_FUNCTION_ARGS)
bool needquote;
/* Get source element, checking for NULL */
- itemvalue = array_iter_next(&iter, &isnull, i,
- typlen, typbyval, typalign);
+ itemvalue = array_iter_next(&iter, &isnull, i);
if (isnull)
{
@@ -1468,6 +1471,7 @@ ReadArrayBinary(StringInfo buf,
int i;
bool hasnull;
int32 totbytes;
+ uint8 typalignby = typalign_to_alignby(typalign);
for (i = 0; i < nitems; i++)
{
@@ -1526,7 +1530,7 @@ ReadArrayBinary(StringInfo buf,
if (typlen == -1)
values[i] = PointerGetDatum(PG_DETOAST_DATUM(values[i]));
totbytes = att_addlength_datum(totbytes, typlen, values[i]);
- totbytes = att_align_nominal(totbytes, typalign);
+ totbytes = att_nominal_alignby(totbytes, typalignby);
/* check for overflow of total request */
if (!AllocSizeIsValid(totbytes))
ereport(ERROR,
@@ -1614,7 +1618,7 @@ array_send(PG_FUNCTION_ARGS)
}
/* Send the array elements using the element's own sendproc */
- array_iter_setup(&iter, v);
+ array_iter_setup(&iter, v, typlen, typbyval, typalign);
for (i = 0; i < nitems; i++)
{
@@ -1622,8 +1626,7 @@ array_send(PG_FUNCTION_ARGS)
bool isnull;
/* Get source element, checking for NULL */
- itemvalue = array_iter_next(&iter, &isnull, i,
- typlen, typbyval, typalign);
+ itemvalue = array_iter_next(&iter, &isnull, i);
if (isnull)
{
@@ -2231,6 +2234,7 @@ array_set_element(Datum arraydatum,
addedafter,
lenbefore,
lenafter;
+ uint8 elmalignby = typalign_to_alignby(elmalign);
if (arraytyplen > 0)
{
@@ -2258,7 +2262,7 @@ array_set_element(Datum arraydatum,
resultarray = (char *) palloc(arraytyplen);
memcpy(resultarray, DatumGetPointer(arraydatum), arraytyplen);
elt_ptr = resultarray + indx[0] * elmlen;
- ArrayCastAndSet(dataValue, elmlen, elmbyval, elmalign, elt_ptr);
+ ArrayCastAndSet(dataValue, elmlen, elmbyval, elmalignby, elt_ptr);
return PointerGetDatum(resultarray);
}
@@ -2416,7 +2420,7 @@ array_set_element(Datum arraydatum,
else
{
olditemlen = att_addlength_pointer(0, elmlen, elt_ptr);
- olditemlen = att_align_nominal(olditemlen, elmalign);
+ olditemlen = att_nominal_alignby(olditemlen, elmalignby);
}
lenafter = olddatasize - lenbefore - olditemlen;
}
@@ -2426,7 +2430,7 @@ array_set_element(Datum arraydatum,
else
{
newitemlen = att_addlength_datum(0, elmlen, dataValue);
- newitemlen = att_align_nominal(newitemlen, elmalign);
+ newitemlen = att_nominal_alignby(newitemlen, elmalignby);
}
newsize = overheadlen + lenbefore + newitemlen + lenafter;
@@ -2449,7 +2453,7 @@ array_set_element(Datum arraydatum,
(char *) array + oldoverheadlen,
lenbefore);
if (!isNull)
- ArrayCastAndSet(dataValue, elmlen, elmbyval, elmalign,
+ ArrayCastAndSet(dataValue, elmlen, elmbyval, elmalignby,
(char *) newarray + overheadlen + lenbefore);
memcpy((char *) newarray + overheadlen + lenbefore + newitemlen,
(char *) array + oldoverheadlen + lenbefore + olditemlen,
@@ -3221,6 +3225,7 @@ array_map(Datum arrayd,
int typlen;
bool typbyval;
char typalign;
+ uint8 typalignby;
array_iter iter;
ArrayMetaState *inp_extra;
ArrayMetaState *ret_extra;
@@ -3270,21 +3275,21 @@ array_map(Datum arrayd,
typlen = ret_extra->typlen;
typbyval = ret_extra->typbyval;
typalign = ret_extra->typalign;
+ typalignby = typalign_to_alignby(typalign);
/* Allocate temporary arrays for new values */
values = (Datum *) palloc(nitems * sizeof(Datum));
nulls = (bool *) palloc(nitems * sizeof(bool));
/* Loop over source data */
- array_iter_setup(&iter, v);
+ array_iter_setup(&iter, v, inp_typlen, inp_typbyval, inp_typalign);
hasnulls = false;
for (i = 0; i < nitems; i++)
{
/* Get source element, checking for NULL */
*transform_source =
- array_iter_next(&iter, transform_source_isnull, i,
- inp_typlen, inp_typbyval, inp_typalign);
+ array_iter_next(&iter, transform_source_isnull, i);
/* Apply the given expression to source element */
values[i] = ExecEvalExpr(exprstate, econtext, &nulls[i]);
@@ -3298,7 +3303,7 @@ array_map(Datum arrayd,
values[i] = PointerGetDatum(PG_DETOAST_DATUM(values[i]));
/* Update total result size */
nbytes = att_addlength_datum(nbytes, typlen, values[i]);
- nbytes = att_align_nominal(nbytes, typalign);
+ nbytes = att_nominal_alignby(nbytes, typalignby);
/* check for overflow of total request */
if (!AllocSizeIsValid(nbytes))
ereport(ERROR,
@@ -3505,6 +3510,7 @@ construct_md_array(Datum *elems,
int32 dataoffset;
int i;
int nelems;
+ uint8 elmalignby = typalign_to_alignby(elmalign);
if (ndims < 0) /* we do allow zero-dimension arrays */
ereport(ERROR,
@@ -3538,7 +3544,7 @@ construct_md_array(Datum *elems,
if (elmlen == -1)
elems[i] = PointerGetDatum(PG_DETOAST_DATUM(elems[i]));
nbytes = att_addlength_datum(nbytes, elmlen, elems[i]);
- nbytes = att_align_nominal(nbytes, elmalign);
+ nbytes = att_nominal_alignby(nbytes, elmalignby);
/* check for overflow of total request */
if (!AllocSizeIsValid(nbytes))
ereport(ERROR,
@@ -3641,6 +3647,7 @@ deconstruct_array(const ArrayType *array,
bits8 *bitmap;
int bitmask;
int i;
+ uint8 elmalignby = typalign_to_alignby(elmalign);
Assert(ARR_ELEMTYPE(array) == elmtype);
@@ -3673,7 +3680,7 @@ deconstruct_array(const ArrayType *array,
{
elems[i] = fetch_att(p, elmbyval, elmlen);
p = att_addlength_pointer(p, elmlen, p);
- p = (char *) att_align_nominal(p, elmalign);
+ p = (char *) att_nominal_alignby(p, elmalignby);
}
/* advance bitmap pointer if any */
@@ -3729,6 +3736,12 @@ deconstruct_array_builtin(const ArrayType *array,
elmalign = TYPALIGN_SHORT;
break;
+ case INT4OID:
+ elmlen = sizeof(int32);
+ elmbyval = true;
+ elmalign = TYPALIGN_INT;
+ break;
+
case OIDOID:
elmlen = sizeof(Oid);
elmbyval = true;
@@ -3878,8 +3891,8 @@ array_eq(PG_FUNCTION_ARGS)
/* Loop over source data */
nitems = ArrayGetNItems(ndims1, dims1);
- array_iter_setup(&it1, array1);
- array_iter_setup(&it2, array2);
+ array_iter_setup(&it1, array1, typlen, typbyval, typalign);
+ array_iter_setup(&it2, array2, typlen, typbyval, typalign);
for (i = 0; i < nitems; i++)
{
@@ -3890,10 +3903,8 @@ array_eq(PG_FUNCTION_ARGS)
bool oprresult;
/* Get elements, checking for NULL */
- elt1 = array_iter_next(&it1, &isnull1, i,
- typlen, typbyval, typalign);
- elt2 = array_iter_next(&it2, &isnull2, i,
- typlen, typbyval, typalign);
+ elt1 = array_iter_next(&it1, &isnull1, i);
+ elt2 = array_iter_next(&it2, &isnull2, i);
/*
* We consider two NULLs equal; NULL and not-NULL are unequal.
@@ -4042,8 +4053,8 @@ array_cmp(FunctionCallInfo fcinfo)
/* Loop over source data */
min_nitems = Min(nitems1, nitems2);
- array_iter_setup(&it1, array1);
- array_iter_setup(&it2, array2);
+ array_iter_setup(&it1, array1, typlen, typbyval, typalign);
+ array_iter_setup(&it2, array2, typlen, typbyval, typalign);
for (i = 0; i < min_nitems; i++)
{
@@ -4054,8 +4065,8 @@ array_cmp(FunctionCallInfo fcinfo)
int32 cmpresult;
/* Get elements, checking for NULL */
- elt1 = array_iter_next(&it1, &isnull1, i, typlen, typbyval, typalign);
- elt2 = array_iter_next(&it2, &isnull2, i, typlen, typbyval, typalign);
+ elt1 = array_iter_next(&it1, &isnull1, i);
+ elt2 = array_iter_next(&it2, &isnull2, i);
/*
* We consider two NULLs equal; NULL > not-NULL.
@@ -4238,7 +4249,7 @@ hash_array(PG_FUNCTION_ARGS)
/* Loop over source data */
nitems = ArrayGetNItems(ndims, dims);
- array_iter_setup(&iter, array);
+ array_iter_setup(&iter, array, typlen, typbyval, typalign);
for (i = 0; i < nitems; i++)
{
@@ -4247,7 +4258,7 @@ hash_array(PG_FUNCTION_ARGS)
uint32 elthash;
/* Get element, checking for NULL */
- elt = array_iter_next(&iter, &isnull, i, typlen, typbyval, typalign);
+ elt = array_iter_next(&iter, &isnull, i);
if (isnull)
{
@@ -4328,7 +4339,7 @@ hash_array_extended(PG_FUNCTION_ARGS)
/* Loop over source data */
nitems = ArrayGetNItems(ndims, dims);
- array_iter_setup(&iter, array);
+ array_iter_setup(&iter, array, typlen, typbyval, typalign);
for (i = 0; i < nitems; i++)
{
@@ -4337,7 +4348,7 @@ hash_array_extended(PG_FUNCTION_ARGS)
uint64 elthash;
/* Get element, checking for NULL */
- elt = array_iter_next(&iter, &isnull, i, typlen, typbyval, typalign);
+ elt = array_iter_next(&iter, &isnull, i);
if (isnull)
{
@@ -4451,7 +4462,7 @@ array_contain_compare(AnyArrayType *array1, AnyArrayType *array2, Oid collation,
/* Loop over source data */
nelems1 = ArrayGetNItems(AARR_NDIM(array1), AARR_DIMS(array1));
- array_iter_setup(&it1, array1);
+ array_iter_setup(&it1, array1, typlen, typbyval, typalign);
for (i = 0; i < nelems1; i++)
{
@@ -4459,7 +4470,7 @@ array_contain_compare(AnyArrayType *array1, AnyArrayType *array2, Oid collation,
bool isnull1;
/* Get element, checking for NULL */
- elt1 = array_iter_next(&it1, &isnull1, i, typlen, typbyval, typalign);
+ elt1 = array_iter_next(&it1, &isnull1, i);
/*
* We assume that the comparison operator is strict, so a NULL can't
@@ -4626,6 +4637,7 @@ array_create_iterator(ArrayType *arr, int slice_ndim, ArrayMetaState *mstate)
&iterator->typlen,
&iterator->typbyval,
&iterator->typalign);
+ iterator->typalignby = typalign_to_alignby(iterator->typalign);
/*
* Remember the slicing parameters.
@@ -4700,7 +4712,7 @@ array_iterate(ArrayIterator iterator, Datum *value, bool *isnull)
/* Move our data pointer forward to the next element */
p = att_addlength_pointer(p, iterator->typlen, p);
- p = (char *) att_align_nominal(p, iterator->typalign);
+ p = (char *) att_nominal_alignby(p, iterator->typalignby);
iterator->data_ptr = p;
}
}
@@ -4730,7 +4742,7 @@ array_iterate(ArrayIterator iterator, Datum *value, bool *isnull)
/* Move our data pointer forward to the next element */
p = att_addlength_pointer(p, iterator->typlen, p);
- p = (char *) att_align_nominal(p, iterator->typalign);
+ p = (char *) att_nominal_alignby(p, iterator->typalignby);
}
}
@@ -4828,7 +4840,7 @@ static int
ArrayCastAndSet(Datum src,
int typlen,
bool typbyval,
- char typalign,
+ uint8 typalignby,
char *dest)
{
int inc;
@@ -4839,14 +4851,14 @@ ArrayCastAndSet(Datum src,
store_att_byval(dest, src, typlen);
else
memmove(dest, DatumGetPointer(src), typlen);
- inc = att_align_nominal(typlen, typalign);
+ inc = att_nominal_alignby(typlen, typalignby);
}
else
{
Assert(!typbyval);
inc = att_addlength_datum(0, typlen, src);
memmove(dest, DatumGetPointer(src), inc);
- inc = att_align_nominal(inc, typalign);
+ inc = att_nominal_alignby(inc, typalignby);
}
return inc;
@@ -4867,12 +4879,13 @@ static char *
array_seek(char *ptr, int offset, bits8 *nullbitmap, int nitems,
int typlen, bool typbyval, char typalign)
{
+ uint8 typalignby = typalign_to_alignby(typalign);
int bitmask;
int i;
/* easy if fixed-size elements and no NULLs */
if (typlen > 0 && !nullbitmap)
- return ptr + nitems * ((Size) att_align_nominal(typlen, typalign));
+ return ptr + nitems * ((Size) att_nominal_alignby(typlen, typalignby));
/* seems worth having separate loops for NULL and no-NULLs cases */
if (nullbitmap)
@@ -4885,7 +4898,7 @@ array_seek(char *ptr, int offset, bits8 *nullbitmap, int nitems,
if (*nullbitmap & bitmask)
{
ptr = att_addlength_pointer(ptr, typlen, ptr);
- ptr = (char *) att_align_nominal(ptr, typalign);
+ ptr = (char *) att_nominal_alignby(ptr, typalignby);
}
bitmask <<= 1;
if (bitmask == 0x100)
@@ -4900,7 +4913,7 @@ array_seek(char *ptr, int offset, bits8 *nullbitmap, int nitems,
for (i = 0; i < nitems; i++)
{
ptr = att_addlength_pointer(ptr, typlen, ptr);
- ptr = (char *) att_align_nominal(ptr, typalign);
+ ptr = (char *) att_nominal_alignby(ptr, typalignby);
}
}
return ptr;
@@ -5050,12 +5063,13 @@ array_slice_size(char *arraydataptr, bits8 *arraynullsptr,
j,
inc;
int count = 0;
+ uint8 typalignby = typalign_to_alignby(typalign);
mda_get_range(ndim, span, st, endp);
/* Pretty easy for fixed element length without nulls ... */
if (typlen > 0 && !arraynullsptr)
- return ArrayGetNItems(ndim, span) * att_align_nominal(typlen, typalign);
+ return ArrayGetNItems(ndim, span) * att_nominal_alignby(typlen, typalignby);
/* Else gotta do it the hard way */
src_offset = ArrayGetOffset(ndim, dim, lb, st);
@@ -5077,7 +5091,7 @@ array_slice_size(char *arraydataptr, bits8 *arraynullsptr,
if (!array_get_isnull(arraynullsptr, src_offset))
{
inc = att_addlength_pointer(0, typlen, ptr);
- inc = att_align_nominal(inc, typalign);
+ inc = att_nominal_alignby(inc, typalignby);
ptr += inc;
count += inc;
}
@@ -6096,6 +6110,7 @@ array_fill_internal(ArrayType *dims, ArrayType *lbs,
int16 elmlen;
bool elmbyval;
char elmalign;
+ uint8 elmalignby;
ArrayMetaState *my_extra;
/*
@@ -6190,6 +6205,7 @@ array_fill_internal(ArrayType *dims, ArrayType *lbs,
elmlen = my_extra->typlen;
elmbyval = my_extra->typbyval;
elmalign = my_extra->typalign;
+ elmalignby = typalign_to_alignby(elmalign);
/* compute required space */
if (!isnull)
@@ -6204,7 +6220,7 @@ array_fill_internal(ArrayType *dims, ArrayType *lbs,
value = PointerGetDatum(PG_DETOAST_DATUM(value));
nbytes = att_addlength_datum(0, elmlen, value);
- nbytes = att_align_nominal(nbytes, elmalign);
+ nbytes = att_nominal_alignby(nbytes, elmalignby);
Assert(nbytes > 0);
totbytes = nbytes * nitems;
@@ -6228,7 +6244,7 @@ array_fill_internal(ArrayType *dims, ArrayType *lbs,
p = ARR_DATA_PTR(result);
for (i = 0; i < nitems; i++)
- p += ArrayCastAndSet(value, elmlen, elmbyval, elmalign, p);
+ p += ArrayCastAndSet(value, elmlen, elmbyval, elmalignby, p);
}
else
{
@@ -6259,9 +6275,6 @@ array_unnest(PG_FUNCTION_ARGS)
array_iter iter;
int nextelem;
int numelems;
- int16 elmlen;
- bool elmbyval;
- char elmalign;
} array_unnest_fctx;
FuncCallContext *funcctx;
@@ -6272,6 +6285,9 @@ array_unnest(PG_FUNCTION_ARGS)
if (SRF_IS_FIRSTCALL())
{
AnyArrayType *arr;
+ int16 elmlen;
+ bool elmbyval;
+ char elmalign;
/* create a function context for cross-call persistence */
funcctx = SRF_FIRSTCALL_INIT();
@@ -6293,23 +6309,24 @@ array_unnest(PG_FUNCTION_ARGS)
/* allocate memory for user context */
fctx = palloc_object(array_unnest_fctx);
- /* initialize state */
- array_iter_setup(&fctx->iter, arr);
- fctx->nextelem = 0;
- fctx->numelems = ArrayGetNItems(AARR_NDIM(arr), AARR_DIMS(arr));
-
+ /* get element-type data */
if (VARATT_IS_EXPANDED_HEADER(arr))
{
/* we can just grab the type data from expanded array */
- fctx->elmlen = arr->xpn.typlen;
- fctx->elmbyval = arr->xpn.typbyval;
- fctx->elmalign = arr->xpn.typalign;
+ elmlen = arr->xpn.typlen;
+ elmbyval = arr->xpn.typbyval;
+ elmalign = arr->xpn.typalign;
}
else
get_typlenbyvalalign(AARR_ELEMTYPE(arr),
- &fctx->elmlen,
- &fctx->elmbyval,
- &fctx->elmalign);
+ &elmlen,
+ &elmbyval,
+ &elmalign);
+
+ /* initialize state */
+ array_iter_setup(&fctx->iter, arr, elmlen, elmbyval, elmalign);
+ fctx->nextelem = 0;
+ fctx->numelems = ArrayGetNItems(AARR_NDIM(arr), AARR_DIMS(arr));
funcctx->user_fctx = fctx;
MemoryContextSwitchTo(oldcontext);
@@ -6324,8 +6341,7 @@ array_unnest(PG_FUNCTION_ARGS)
int offset = fctx->nextelem++;
Datum elem;
- elem = array_iter_next(&fctx->iter, &fcinfo->isnull, offset,
- fctx->elmlen, fctx->elmbyval, fctx->elmalign);
+ elem = array_iter_next(&fctx->iter, &fcinfo->isnull, offset);
SRF_RETURN_NEXT(funcctx, elem);
}
@@ -6401,6 +6417,7 @@ array_replace_internal(ArrayType *array,
int typlen;
bool typbyval;
char typalign;
+ uint8 typalignby;
char *arraydataptr;
bits8 *bitmap;
int bitmask;
@@ -6445,6 +6462,7 @@ array_replace_internal(ArrayType *array,
typlen = typentry->typlen;
typbyval = typentry->typbyval;
typalign = typentry->typalign;
+ typalignby = typalign_to_alignby(typalign);
/*
* Detoast values if they are toasted. The replacement value must be
@@ -6506,7 +6524,7 @@ array_replace_internal(ArrayType *array,
isNull = false;
elt = fetch_att(arraydataptr, typbyval, typlen);
arraydataptr = att_addlength_datum(arraydataptr, typlen, elt);
- arraydataptr = (char *) att_align_nominal(arraydataptr, typalign);
+ arraydataptr = (char *) att_nominal_alignby(arraydataptr, typalignby);
if (search_isnull)
{
@@ -6553,7 +6571,7 @@ array_replace_internal(ArrayType *array,
{
/* Update total result size */
nbytes = att_addlength_datum(nbytes, typlen, values[nresult]);
- nbytes = att_align_nominal(nbytes, typalign);
+ nbytes = att_nominal_alignby(nbytes, typalignby);
/* check for overflow of total request */
if (!AllocSizeIsValid(nbytes))
ereport(ERROR,
@@ -6860,6 +6878,7 @@ width_bucket_array_variable(Datum operand,
int typlen = typentry->typlen;
bool typbyval = typentry->typbyval;
char typalign = typentry->typalign;
+ uint8 typalignby = typalign_to_alignby(typalign);
int left;
int right;
@@ -6883,7 +6902,7 @@ width_bucket_array_variable(Datum operand,
for (i = left; i < mid; i++)
{
ptr = att_addlength_pointer(ptr, typlen, ptr);
- ptr = (char *) att_align_nominal(ptr, typalign);
+ ptr = (char *) att_nominal_alignby(ptr, typalignby);
}
locfcinfo->args[0].value = operand;
@@ -6908,7 +6927,7 @@ width_bucket_array_variable(Datum operand,
* ensures we do only O(N) array indexing work, not O(N^2).
*/
ptr = att_addlength_pointer(ptr, typlen, ptr);
- thresholds_data = (char *) att_align_nominal(ptr, typalign);
+ thresholds_data = (char *) att_nominal_alignby(ptr, typalignby);
}
}
diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c
index 3c7f54f2638..f5f835e944a 100644
--- a/src/backend/utils/adt/encode.c
+++ b/src/backend/utils/adt/encode.c
@@ -290,7 +290,7 @@ hex_decode_safe_scalar(const char *src, size_t len, char *dst, Node *escontext)
ereturn(escontext, 0,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("invalid hexadecimal digit: \"%.*s\"",
- pg_mblen(s), s)));
+ pg_mblen_range(s, srcend), s)));
s++;
if (s >= srcend)
ereturn(escontext, 0,
@@ -300,7 +300,7 @@ hex_decode_safe_scalar(const char *src, size_t len, char *dst, Node *escontext)
ereturn(escontext, 0,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("invalid hexadecimal digit: \"%.*s\"",
- pg_mblen(s), s)));
+ pg_mblen_range(s, srcend), s)));
s++;
*p++ = (v1 << 4) | v2;
}
@@ -564,7 +564,7 @@ pg_base64_decode_internal(const char *src, size_t len, char *dst, bool url)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("invalid symbol \"%.*s\" found while decoding %s sequence",
- pg_mblen(s - 1), s - 1,
+ pg_mblen_range(s - 1, srcend), s - 1,
url ? "base64url" : "base64")));
}
}
diff --git a/src/backend/utils/adt/format_type.c b/src/backend/utils/adt/format_type.c
index 544205ca067..3cd5053d118 100644
--- a/src/backend/utils/adt/format_type.c
+++ b/src/backend/utils/adt/format_type.c
@@ -448,11 +448,15 @@ oidvectortypes(PG_FUNCTION_ARGS)
{
oidvector *oidArray = (oidvector *) PG_GETARG_POINTER(0);
char *result;
- int numargs = oidArray->dim1;
+ int numargs;
int num;
size_t total;
size_t left;
+ /* validate input before fetching dim1 */
+ check_valid_oidvector(oidArray);
+ numargs = oidArray->dim1;
+
total = 20 * numargs + 1;
result = palloc(total);
result[0] = '\0';
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index cf580c63c78..7720911a6a9 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -1438,7 +1438,7 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw,
ereport(ERROR,
(errcode(ERRCODE_INVALID_DATETIME_FORMAT),
errmsg("invalid datetime format separator: \"%s\"",
- pnstrdup(str, pg_mblen(str)))));
+ pnstrdup(str, pg_mblen_cstr(str)))));
if (*str == ' ')
n->type = NODE_TYPE_SPACE;
@@ -1468,7 +1468,7 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw,
/* backslash quotes the next character, if any */
if (*str == '\\' && *(str + 1))
str++;
- chlen = pg_mblen(str);
+ chlen = pg_mblen_cstr(str);
n->type = NODE_TYPE_CHAR;
memcpy(n->character, str, chlen);
n->character[chlen] = '\0';
@@ -1486,7 +1486,7 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw,
*/
if (*str == '\\' && *(str + 1) == '"')
str++;
- chlen = pg_mblen(str);
+ chlen = pg_mblen_cstr(str);
if ((flags & DCH_FLAG) && is_separator_char(str))
n->type = NODE_TYPE_SEPARATOR;
@@ -1992,8 +1992,8 @@ asc_toupper_z(const char *buff)
do { \
if (IS_SUFFIX_THth(_suf)) \
{ \
- if (*(ptr)) (ptr) += pg_mblen(ptr); \
- if (*(ptr)) (ptr) += pg_mblen(ptr); \
+ if (*(ptr)) (ptr) += pg_mblen_cstr(ptr); \
+ if (*(ptr)) (ptr) += pg_mblen_cstr(ptr); \
} \
} while (0)
@@ -3183,7 +3183,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out,
* insist that the consumed character match the format's
* character.
*/
- s += pg_mblen(s);
+ s += pg_mblen_cstr(s);
}
continue;
}
@@ -3205,11 +3205,11 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out,
if (extra_skip > 0)
extra_skip--;
else
- s += pg_mblen(s);
+ s += pg_mblen_cstr(s);
}
else
{
- int chlen = pg_mblen(s);
+ int chlen = pg_mblen_cstr(s);
/*
* Standard mode requires strict match of format characters.
@@ -5724,13 +5724,15 @@ NUM_numpart_to_char(NUMProc *Np, int id)
static void
NUM_eat_non_data_chars(NUMProc *Np, int n, size_t input_len)
{
+ const char *end = Np->inout + input_len;
+
while (n-- > 0)
{
if (OVERLOAD_TEST)
break; /* end of input */
if (strchr("0123456789.,+-", *Np->inout_p) != NULL)
break; /* it's a data character */
- Np->inout_p += pg_mblen(Np->inout_p);
+ Np->inout_p += pg_mblen_range(Np->inout_p, end);
}
}
@@ -6167,7 +6169,7 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
}
else
{
- Np->inout_p += pg_mblen(Np->inout_p);
+ Np->inout_p += pg_mblen_range(Np->inout_p, Np->inout + input_len);
}
continue;
}
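Note on the recurring pattern above: pg_mblen() calls on buffers that are not known to be NUL-terminated become pg_mblen_range(p, end), so a truncated multibyte sequence at the end of a buffer cannot cause a read past it. A minimal standalone sketch of such a clamped length function for UTF-8 (semantics assumed; the real pg_mblen_range may treat invalid input differently):

#include <stddef.h>
#include <stdio.h>

/*
 * Byte length of the UTF-8 character starting at p, clamped so that
 * p + result never exceeds end, even for a truncated final sequence.
 */
static int
toy_mblen_range(const char *p, const char *end)
{
    unsigned char c = (unsigned char) *p;
    int len;

    if (c < 0x80)
        len = 1;
    else if (c < 0xE0)
        len = 2;
    else if (c < 0xF0)
        len = 3;
    else
        len = 4;

    if (len > end - p)
        len = (int) (end - p);   /* clamp at buffer end */
    return len;
}

int
main(void)
{
    const char buf[] = {'a', (char) 0xC3};  /* 'a' plus a truncated sequence */
    const char *end = buf + sizeof(buf);

    printf("%d %d\n",
           toy_mblen_range(buf, end),       /* 1 */
           toy_mblen_range(buf + 1, end));  /* clamped to 1, not 2 */
    return 0;
}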
diff --git a/src/backend/utils/adt/int.c b/src/backend/utils/adt/int.c
index d2302626585..ff54d50ea9d 100644
--- a/src/backend/utils/adt/int.c
+++ b/src/backend/utils/adt/int.c
@@ -134,6 +134,30 @@ buildint2vector(const int16 *int2s, int n)
return result;
}
+/*
+ * validate that an array object meets the restrictions of int2vector
+ *
+ * We need this because there are pathways by which a general int2[] array can
+ * be cast to int2vector, allowing the type's restrictions to be violated.
+ * All code that receives an int2vector as a SQL parameter should check this.
+ */
+static void
+check_valid_int2vector(const int2vector *int2Array)
+{
+ /*
+ * We insist on ndim == 1 and dataoffset == 0 (that is, no nulls) because
+ * otherwise the array's layout will not be what calling code expects. We
+ * needn't be picky about the index lower bound though. Checking elemtype
+ * is just paranoia.
+ */
+ if (int2Array->ndim != 1 ||
+ int2Array->dataoffset != 0 ||
+ int2Array->elemtype != INT2OID)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("array is not a valid int2vector")));
+}
+
/*
* int2vectorin - converts "num num ..." to internal form
*/
@@ -208,10 +232,14 @@ int2vectorout(PG_FUNCTION_ARGS)
{
int2vector *int2Array = (int2vector *) PG_GETARG_POINTER(0);
int num,
- nnums = int2Array->dim1;
+ nnums;
char *rp;
char *result;
+ /* validate input before fetching dim1 */
+ check_valid_int2vector(int2Array);
+ nnums = int2Array->dim1;
+
/* assumes sign, 5 digits, ' ' */
rp = result = (char *) palloc(nnums * 7 + 1);
for (num = 0; num < nnums; num++)
@@ -272,6 +300,7 @@ int2vectorrecv(PG_FUNCTION_ARGS)
Datum
int2vectorsend(PG_FUNCTION_ARGS)
{
+ /* We don't do check_valid_int2vector, since array_send won't care */
return array_send(fcinfo);
}
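Note on the hunk above: check_valid_int2vector() (and its oidvector twin below) guards against array values whose headers do not match the fixed layout the vector types assume, since a general array can reach these functions via casts. The shape of such a defensive header check, in standalone form (the struct fields are simplified stand-ins for the real array header):

#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-in for the fixed-layout vector header. */
typedef struct ToyVector
{
    int ndim;        /* must be 1 */
    int dataoffset;  /* must be 0: no null bitmap */
    int elemtype;    /* must match the expected element type OID */
    int dim1;        /* element count; only trustworthy if the above hold */
} ToyVector;

#define TOY_INT2OID 21

static void
toy_check_valid_vector(const ToyVector *v)
{
    if (v->ndim != 1 || v->dataoffset != 0 || v->elemtype != TOY_INT2OID)
    {
        fprintf(stderr, "array is not a valid int2vector\n");
        exit(1);
    }
}

int
main(void)
{
    ToyVector v = {1, 0, TOY_INT2OID, 3};

    toy_check_valid_vector(&v);      /* passes; dim1 is now safe to use */
    printf("%d elements\n", v.dim1);
    return 0;
}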
diff --git a/src/backend/utils/adt/jsonfuncs.c b/src/backend/utils/adt/jsonfuncs.c
index 1e5b60801e4..d5b64d7fca5 100644
--- a/src/backend/utils/adt/jsonfuncs.c
+++ b/src/backend/utils/adt/jsonfuncs.c
@@ -695,7 +695,7 @@ report_json_context(JsonLexContext *lex)
{
/* Advance to next multibyte character */
if (IS_HIGHBIT_SET(*context_start))
- context_start += pg_mblen(context_start);
+ context_start += pg_mblen_range(context_start, context_end);
else
context_start++;
}
diff --git a/src/backend/utils/adt/jsonpath_gram.y b/src/backend/utils/adt/jsonpath_gram.y
index 4543626ffc8..87070235d11 100644
--- a/src/backend/utils/adt/jsonpath_gram.y
+++ b/src/backend/utils/adt/jsonpath_gram.y
@@ -599,7 +599,8 @@ makeItemLikeRegex(JsonPathParseItem *expr, JsonPathString *pattern,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("invalid input syntax for type %s", "jsonpath"),
errdetail("Unrecognized flag character \"%.*s\" in LIKE_REGEX predicate.",
- pg_mblen(flags->val + i), flags->val + i)));
+ pg_mblen_range(flags->val + i, flags->val + flags->len),
+ flags->val + i)));
break;
}
}
diff --git a/src/backend/utils/adt/levenshtein.c b/src/backend/utils/adt/levenshtein.c
index fb2ba591acd..5b3d84029f6 100644
--- a/src/backend/utils/adt/levenshtein.c
+++ b/src/backend/utils/adt/levenshtein.c
@@ -83,6 +83,8 @@ varstr_levenshtein(const char *source, int slen,
int *s_char_len = NULL;
int j;
const char *y;
+ const char *send = source + slen;
+ const char *tend = target + tlen;
/*
* For varstr_levenshtein_less_equal, we have real variables called
@@ -183,10 +185,10 @@ varstr_levenshtein(const char *source, int slen,
#endif
/*
- * In order to avoid calling pg_mblen() repeatedly on each character in s,
- * we cache all the lengths before starting the main loop -- but if all
- * the characters in both strings are single byte, then we skip this and
- * use a fast-path in the main loop. If only one string contains
+ * In order to avoid calling pg_mblen_range() repeatedly on each character
+ * in s, we cache all the lengths before starting the main loop -- but if
+ * all the characters in both strings are single byte, then we skip this
+ * and use a fast-path in the main loop. If only one string contains
* multi-byte characters, we still build the array, so that the fast-path
* needn't deal with the case where the array hasn't been initialized.
*/
@@ -198,7 +200,7 @@ varstr_levenshtein(const char *source, int slen,
s_char_len = (int *) palloc((m + 1) * sizeof(int));
for (i = 0; i < m; ++i)
{
- s_char_len[i] = pg_mblen(cp);
+ s_char_len[i] = pg_mblen_range(cp, send);
cp += s_char_len[i];
}
s_char_len[i] = 0;
@@ -224,7 +226,7 @@ varstr_levenshtein(const char *source, int slen,
{
int *temp;
const char *x = source;
- int y_char_len = n != tlen + 1 ? pg_mblen(y) : 1;
+ int y_char_len = n != tlen + 1 ? pg_mblen_range(y, tend) : 1;
int i;
#ifdef LEVENSHTEIN_LESS_EQUAL
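Note on the hunk above: pre-computing s_char_len[] turns repeated multibyte decoding inside the O(m*n) dynamic-programming loops into a single O(m) pass. A rough sketch of the caching step (toy_mblen_range is a single-byte stand-in for pg_mblen_range):

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for pg_mblen_range(); ASCII bytes only, for illustration. */
static int
toy_mblen_range(const char *p, const char *end)
{
    (void) p;
    (void) end;
    return 1;
}

/*
 * Cache the byte length of every character of src once, so inner loops can
 * index lens[] instead of re-decoding the same characters over and over.
 */
static int *
cache_char_lens(const char *src, int srclen, int *nchars)
{
    const char *end = src + srclen;
    int *lens = malloc((srclen + 1) * sizeof(int));
    int n = 0;

    while (src < end)
    {
        lens[n] = toy_mblen_range(src, end);
        src += lens[n];
        n++;
    }
    lens[n] = 0;                 /* terminator, as in s_char_len[] above */
    *nchars = n;
    return lens;
}

int
main(void)
{
    int n;
    int *lens = cache_char_lens("abc", 3, &n);

    printf("%d chars, first len %d\n", n, lens[0]);
    free(lens);
    return 0;
}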
diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c
index 2143d8658e8..350bc07f210 100644
--- a/src/backend/utils/adt/like.c
+++ b/src/backend/utils/adt/like.c
@@ -55,20 +55,20 @@ static int Generic_Text_IC_like(text *str, text *pat, Oid collation);
*--------------------
*/
static inline int
-wchareq(const char *p1, const char *p2)
+wchareq(const char *p1, int p1len, const char *p2, int p2len)
{
- int p1_len;
+ int p1clen;
/* Optimization: quickly compare the first byte. */
if (*p1 != *p2)
return 0;
- p1_len = pg_mblen(p1);
- if (pg_mblen(p2) != p1_len)
+ p1clen = pg_mblen_with_len(p1, p1len);
+ if (pg_mblen_with_len(p2, p2len) != p1clen)
return 0;
/* They are the same length */
- while (p1_len--)
+ while (p1clen--)
{
if (*p1++ != *p2++)
return 0;
@@ -93,11 +93,11 @@ wchareq(const char *p1, const char *p2)
#define NextByte(p, plen) ((p)++, (plen)--)
/* Set up to compile like_match.c for multibyte characters */
-#define CHAREQ(p1, p2) wchareq((p1), (p2))
+#define CHAREQ(p1, p1len, p2, p2len) wchareq((p1), (p1len), (p2), (p2len))
#define NextChar(p, plen) \
- do { int __l = pg_mblen(p); (p) +=__l; (plen) -=__l; } while (0)
+ do { int __l = pg_mblen_with_len((p), (plen)); (p) +=__l; (plen) -=__l; } while (0)
#define CopyAdvChar(dst, src, srclen) \
- do { int __l = pg_mblen(src); \
+ do { int __l = pg_mblen_with_len((src), (srclen)); \
(srclen) -= __l; \
while (__l-- > 0) \
*(dst)++ = *(src)++; \
@@ -109,7 +109,7 @@ wchareq(const char *p1, const char *p2)
#include "like_match.c"
/* Set up to compile like_match.c for single-byte characters */
-#define CHAREQ(p1, p2) (*(p1) == *(p2))
+#define CHAREQ(p1, p1len, p2, p2len) (*(p1) == *(p2))
#define NextChar(p, plen) NextByte((p), (plen))
#define CopyAdvChar(dst, src, srclen) (*(dst)++ = *(src)++, (srclen)--)
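Note on the hunk above: with the length-aware CHAREQ, both comparands carry the bytes remaining in their buffers, so the character-width probes cannot run past either string. A standalone sketch of the comparison (toy_mblen_with_len is a stand-in whose exact semantics are assumed):

#include <stdio.h>

/* Stand-in for pg_mblen_with_len(): UTF-8 lead-byte width, clamped to len. */
static int
toy_mblen_with_len(const char *p, int len)
{
    unsigned char c = (unsigned char) *p;
    int w = (c < 0x80) ? 1 : (c < 0xE0) ? 2 : (c < 0xF0) ? 3 : 4;

    return (w > len) ? len : w;
}

/* Compare one multibyte character, never reading past either buffer. */
static int
toy_wchareq(const char *p1, int p1len, const char *p2, int p2len)
{
    int w;

    if (*p1 != *p2)              /* cheap first-byte rejection */
        return 0;
    w = toy_mblen_with_len(p1, p1len);
    if (toy_mblen_with_len(p2, p2len) != w)
        return 0;
    while (w--)
        if (*p1++ != *p2++)
            return 0;
    return 1;
}

int
main(void)
{
    printf("%d\n", toy_wchareq("abc", 3, "abd", 3));  /* 1: first char equal */
    return 0;
}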
diff --git a/src/backend/utils/adt/like_match.c b/src/backend/utils/adt/like_match.c
index 02990ca9a1b..f5f72b82e21 100644
--- a/src/backend/utils/adt/like_match.c
+++ b/src/backend/utils/adt/like_match.c
@@ -442,6 +442,7 @@ do_like_escape(text *pat, text *esc)
errhint("Escape string must be empty or one character.")));
e = VARDATA_ANY(esc);
+ elen = VARSIZE_ANY_EXHDR(esc);
/*
* If specified escape is '\', just copy the pattern as-is.
@@ -460,7 +461,7 @@ do_like_escape(text *pat, text *esc)
afterescape = false;
while (plen > 0)
{
- if (CHAREQ(p, e) && !afterescape)
+ if (CHAREQ(p, plen, e, elen) && !afterescape)
{
*r++ = '\\';
NextChar(p, plen);
diff --git a/src/backend/utils/adt/mcxtfuncs.c b/src/backend/utils/adt/mcxtfuncs.c
index 12b8d4cefaf..c7f7b8bc2dd 100644
--- a/src/backend/utils/adt/mcxtfuncs.c
+++ b/src/backend/utils/adt/mcxtfuncs.c
@@ -19,6 +19,7 @@
#include "mb/pg_wchar.h"
#include "storage/proc.h"
#include "storage/procarray.h"
+#include "storage/procsignal.h"
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/hsearch.h"
diff --git a/src/backend/utils/adt/multirangetypes.c b/src/backend/utils/adt/multirangetypes.c
index 07e2a81d46a..9548989d782 100644
--- a/src/backend/utils/adt/multirangetypes.c
+++ b/src/backend/utils/adt/multirangetypes.c
@@ -485,8 +485,9 @@ multirange_canonicalize(TypeCacheEntry *rangetyp, int32 input_range_count,
int32 output_range_count = 0;
/* Sort the ranges so we can find the ones that overlap/meet. */
- qsort_arg(ranges, input_range_count, sizeof(RangeType *), range_compare,
- rangetyp);
+ if (ranges != NULL)
+ qsort_arg(ranges, input_range_count, sizeof(RangeType *),
+ range_compare, rangetyp);
/* Now merge where possible: */
for (i = 0; i < input_range_count; i++)
@@ -572,21 +573,22 @@ multirange_size_estimate(TypeCacheEntry *rangetyp, int32 range_count,
RangeType **ranges)
{
char elemalign = rangetyp->rngelemtype->typalign;
+ uint8 elemalignby = typalign_to_alignby(elemalign);
Size size;
int32 i;
/*
* Count space for MultirangeType struct, items and flags.
*/
- size = att_align_nominal(sizeof(MultirangeType) +
- Max(range_count - 1, 0) * sizeof(uint32) +
- range_count * sizeof(uint8), elemalign);
+ size = att_nominal_alignby(sizeof(MultirangeType) +
+ Max(range_count - 1, 0) * sizeof(uint32) +
+ range_count * sizeof(uint8), elemalignby);
/* Count space for range bounds */
for (i = 0; i < range_count; i++)
- size += att_align_nominal(VARSIZE(ranges[i]) -
- sizeof(RangeType) -
- sizeof(char), elemalign);
+ size += att_nominal_alignby(VARSIZE(ranges[i]) -
+ sizeof(RangeType) -
+ sizeof(char), elemalignby);
return size;
}
@@ -605,6 +607,7 @@ write_multirange_data(MultirangeType *multirange, TypeCacheEntry *rangetyp,
const char *begin;
char *ptr;
char elemalign = rangetyp->rngelemtype->typalign;
+ uint8 elemalignby = typalign_to_alignby(elemalign);
items = MultirangeGetItemsPtr(multirange);
flags = MultirangeGetFlagsPtr(multirange);
@@ -630,7 +633,7 @@ write_multirange_data(MultirangeType *multirange, TypeCacheEntry *rangetyp,
flags[i] = *((char *) ranges[i] + VARSIZE(ranges[i]) - sizeof(char));
len = VARSIZE(ranges[i]) - sizeof(RangeType) - sizeof(char);
memcpy(ptr, ranges[i] + 1, len);
- ptr += att_align_nominal(len, elemalign);
+ ptr += att_nominal_alignby(len, elemalignby);
}
}
diff --git a/src/backend/utils/adt/network_selfuncs.c b/src/backend/utils/adt/network_selfuncs.c
index 902f9c25db0..2a8d2ded907 100644
--- a/src/backend/utils/adt/network_selfuncs.c
+++ b/src/backend/utils/adt/network_selfuncs.c
@@ -43,9 +43,9 @@
/* Maximum number of items to consider in join selectivity calculations */
#define MAX_CONSIDERED_ELEMS 1024
-static Selectivity networkjoinsel_inner(Oid operator,
+static Selectivity networkjoinsel_inner(Oid operator, int opr_codenum,
VariableStatData *vardata1, VariableStatData *vardata2);
-static Selectivity networkjoinsel_semi(Oid operator,
+static Selectivity networkjoinsel_semi(Oid operator, int opr_codenum,
VariableStatData *vardata1, VariableStatData *vardata2);
static Selectivity mcv_population(float4 *mcv_numbers, int mcv_nvalues);
static Selectivity inet_hist_value_sel(const Datum *values, int nvalues,
@@ -82,6 +82,7 @@ networksel(PG_FUNCTION_ARGS)
Oid operator = PG_GETARG_OID(1);
List *args = (List *) PG_GETARG_POINTER(2);
int varRelid = PG_GETARG_INT32(3);
+ int opr_codenum;
VariableStatData vardata;
Node *other;
bool varonleft;
@@ -95,6 +96,14 @@ networksel(PG_FUNCTION_ARGS)
nullfrac;
FmgrInfo proc;
+ /*
+ * Before all else, verify that the operator is one of the ones supported
+ * by this function, which in turn proves that the input datatypes are
+ * what we expect. Otherwise, attaching this selectivity function to some
+ * unexpected operator could cause trouble.
+ */
+ opr_codenum = inet_opr_codenum(operator);
+
/*
* If expression is not (variable op something) or (something op
* variable), then punt and return a default estimate.
@@ -150,13 +159,12 @@ networksel(PG_FUNCTION_ARGS)
STATISTIC_KIND_HISTOGRAM, InvalidOid,
ATTSTATSSLOT_VALUES))
{
- int opr_codenum = inet_opr_codenum(operator);
+ int h_codenum;
/* Commute if needed, so we can consider histogram to be on the left */
- if (!varonleft)
- opr_codenum = -opr_codenum;
+ h_codenum = varonleft ? opr_codenum : -opr_codenum;
non_mcv_selec = inet_hist_value_sel(hslot.values, hslot.nvalues,
- constvalue, opr_codenum);
+ constvalue, h_codenum);
free_attstatsslot(&hslot);
}
@@ -203,10 +211,19 @@ networkjoinsel(PG_FUNCTION_ARGS)
#endif
SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4);
double selec;
+ int opr_codenum;
VariableStatData vardata1;
VariableStatData vardata2;
bool join_is_reversed;
+ /*
+ * Before all else, verify that the operator is one of the ones supported
+ * by this function, which in turn proves that the input datatypes are
+ * what we expect. Otherwise, attaching this selectivity function to some
+ * unexpected operator could cause trouble.
+ */
+ opr_codenum = inet_opr_codenum(operator);
+
get_join_variables(root, args, sjinfo,
&vardata1, &vardata2, &join_is_reversed);
@@ -220,15 +237,18 @@ networkjoinsel(PG_FUNCTION_ARGS)
* Selectivity for left/full join is not exactly the same as inner
* join, but we neglect the difference, as eqjoinsel does.
*/
- selec = networkjoinsel_inner(operator, &vardata1, &vardata2);
+ selec = networkjoinsel_inner(operator, opr_codenum,
+ &vardata1, &vardata2);
break;
case JOIN_SEMI:
case JOIN_ANTI:
/* Here, it's important that we pass the outer var on the left. */
if (!join_is_reversed)
- selec = networkjoinsel_semi(operator, &vardata1, &vardata2);
+ selec = networkjoinsel_semi(operator, opr_codenum,
+ &vardata1, &vardata2);
else
selec = networkjoinsel_semi(get_commutator(operator),
+ -opr_codenum,
&vardata2, &vardata1);
break;
default:
@@ -260,7 +280,7 @@ networkjoinsel(PG_FUNCTION_ARGS)
* Also, MCV vs histogram selectivity is not neglected as in eqjoinsel_inner().
*/
static Selectivity
-networkjoinsel_inner(Oid operator,
+networkjoinsel_inner(Oid operator, int opr_codenum,
VariableStatData *vardata1, VariableStatData *vardata2)
{
Form_pg_statistic stats;
@@ -273,7 +293,6 @@ networkjoinsel_inner(Oid operator,
mcv2_exists = false,
hist1_exists = false,
hist2_exists = false;
- int opr_codenum;
int mcv1_length = 0,
mcv2_length = 0;
AttStatsSlot mcv1_slot;
@@ -325,8 +344,6 @@ networkjoinsel_inner(Oid operator,
memset(&hist2_slot, 0, sizeof(hist2_slot));
}
- opr_codenum = inet_opr_codenum(operator);
-
/*
* Calculate selectivity for MCV vs MCV matches.
*/
@@ -387,7 +404,7 @@ networkjoinsel_inner(Oid operator,
* histogram selectivity for semi/anti join cases.
*/
static Selectivity
-networkjoinsel_semi(Oid operator,
+networkjoinsel_semi(Oid operator, int opr_codenum,
VariableStatData *vardata1, VariableStatData *vardata2)
{
Form_pg_statistic stats;
@@ -401,7 +418,6 @@ networkjoinsel_semi(Oid operator,
mcv2_exists = false,
hist1_exists = false,
hist2_exists = false;
- int opr_codenum;
FmgrInfo proc;
int i,
mcv1_length = 0,
@@ -455,7 +471,6 @@ networkjoinsel_semi(Oid operator,
memset(&hist2_slot, 0, sizeof(hist2_slot));
}
- opr_codenum = inet_opr_codenum(operator);
fmgr_info(get_opcode(operator), &proc);
/* Estimate number of input rows represented by RHS histogram. */
@@ -827,6 +842,9 @@ inet_semi_join_sel(Datum lhs_value,
/*
* Assign useful code numbers for the subnet inclusion/overlap operators
*
+ * This will throw an error if the operator is not one of the ones we
+ * support in networksel() and networkjoinsel().
+ *
* Only inet_masklen_inclusion_cmp() and inet_hist_match_divider() depend
* on the exact codes assigned here; but many other places in this file
* know that they can negate a code to obtain the code for the commutator
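Note on the hunk above: inet_opr_codenum() relies on the convention that negating a code yields the commutator's code, which is why the semi-join path can pass -opr_codenum alongside get_commutator(). A toy illustration of that encoding convention (the code values here are invented, not the ones the file actually assigns):

#include <stdio.h>

/*
 * Toy code numbers: "is contained by" variants are negative, their
 * "contains" commutators are the corresponding positive values, and
 * overlap (its own commutator) is 0, so negation commutes any operator.
 */
#define TOY_SUB       -2     /* <<  */
#define TOY_SUBEQ     -1     /* <<= */
#define TOY_OVERLAPS   0     /* &&  */
#define TOY_SUPEQ      1     /* >>= */
#define TOY_SUP        2     /* >>  */

static int
commute(int codenum)
{
    return -codenum;
}

int
main(void)
{
    printf("%d %d\n",
           commute(TOY_SUB) == TOY_SUP,            /* 1 */
           commute(TOY_OVERLAPS) == TOY_OVERLAPS); /* 1: self-commutator */
    return 0;
}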
diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c
index 891ae6ba7fe..3bd3635d98a 100644
--- a/src/backend/utils/adt/numeric.c
+++ b/src/backend/utils/adt/numeric.c
@@ -48,8 +48,8 @@
* Uncomment the following to enable compilation of dump_numeric()
* and dump_var() and to get a dump of any result produced by make_result().
* ----------
-#define NUMERIC_DEBUG
*/
+/* #define NUMERIC_DEBUG */
/* ----------
diff --git a/src/backend/utils/adt/oid.c b/src/backend/utils/adt/oid.c
index 6f4c299dee9..a3419728971 100644
--- a/src/backend/utils/adt/oid.c
+++ b/src/backend/utils/adt/oid.c
@@ -107,6 +107,30 @@ buildoidvector(const Oid *oids, int n)
return result;
}
+/*
+ * validate that an array object meets the restrictions of oidvector
+ *
+ * We need this because there are pathways by which a general oid[] array can
+ * be cast to oidvector, allowing the type's restrictions to be violated.
+ * All code that receives an oidvector as a SQL parameter should check this.
+ */
+void
+check_valid_oidvector(const oidvector *oidArray)
+{
+ /*
+ * We insist on ndim == 1 and dataoffset == 0 (that is, no nulls) because
+ * otherwise the array's layout will not be what calling code expects. We
+ * needn't be picky about the index lower bound though. Checking elemtype
+ * is just paranoia.
+ */
+ if (oidArray->ndim != 1 ||
+ oidArray->dataoffset != 0 ||
+ oidArray->elemtype != OIDOID)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("array is not a valid oidvector")));
+}
+
/*
* oidvectorin - converts "num num ..." to internal form
*/
@@ -159,10 +183,14 @@ oidvectorout(PG_FUNCTION_ARGS)
{
oidvector *oidArray = (oidvector *) PG_GETARG_POINTER(0);
int num,
- nnums = oidArray->dim1;
+ nnums;
char *rp;
char *result;
+ /* validate input before fetching dim1 */
+ check_valid_oidvector(oidArray);
+ nnums = oidArray->dim1;
+
/* assumes sign, 10 digits, ' ' */
rp = result = (char *) palloc(nnums * 12 + 1);
for (num = 0; num < nnums; num++)
@@ -225,6 +253,7 @@ oidvectorrecv(PG_FUNCTION_ARGS)
Datum
oidvectorsend(PG_FUNCTION_ARGS)
{
+ /* We don't do check_valid_oidvector, since array_send won't care */
return array_send(fcinfo);
}
diff --git a/src/backend/utils/adt/oracle_compat.c b/src/backend/utils/adt/oracle_compat.c
index a003f90066c..5b0d098bd07 100644
--- a/src/backend/utils/adt/oracle_compat.c
+++ b/src/backend/utils/adt/oracle_compat.c
@@ -169,8 +169,8 @@ lpad(PG_FUNCTION_ARGS)
char *ptr1,
*ptr2,
*ptr2start,
- *ptr2end,
*ptr_ret;
+ const char *ptr2end;
int m,
s1len,
s2len;
@@ -215,7 +215,7 @@ lpad(PG_FUNCTION_ARGS)
while (m--)
{
- int mlen = pg_mblen(ptr2);
+ int mlen = pg_mblen_range(ptr2, ptr2end);
memcpy(ptr_ret, ptr2, mlen);
ptr_ret += mlen;
@@ -228,7 +228,7 @@ lpad(PG_FUNCTION_ARGS)
while (s1len--)
{
- int mlen = pg_mblen(ptr1);
+ int mlen = pg_mblen_unbounded(ptr1);
memcpy(ptr_ret, ptr1, mlen);
ptr_ret += mlen;
@@ -267,8 +267,8 @@ rpad(PG_FUNCTION_ARGS)
char *ptr1,
*ptr2,
*ptr2start,
- *ptr2end,
*ptr_ret;
+ const char *ptr2end;
int m,
s1len,
s2len;
@@ -308,11 +308,12 @@ rpad(PG_FUNCTION_ARGS)
m = len - s1len;
ptr1 = VARDATA_ANY(string1);
+
ptr_ret = VARDATA(ret);
while (s1len--)
{
- int mlen = pg_mblen(ptr1);
+ int mlen = pg_mblen_unbounded(ptr1);
memcpy(ptr_ret, ptr1, mlen);
ptr_ret += mlen;
@@ -324,7 +325,7 @@ rpad(PG_FUNCTION_ARGS)
while (m--)
{
- int mlen = pg_mblen(ptr2);
+ int mlen = pg_mblen_range(ptr2, ptr2end);
memcpy(ptr_ret, ptr2, mlen);
ptr_ret += mlen;
@@ -409,6 +410,7 @@ dotrim(const char *string, int stringlen,
*/
const char **stringchars;
const char **setchars;
+ const char *setend;
int *stringmblen;
int *setmblen;
int stringnchars;
@@ -416,6 +418,7 @@ dotrim(const char *string, int stringlen,
int resultndx;
int resultnchars;
const char *p;
+ const char *pend;
int len;
int mblen;
const char *str_pos;
@@ -426,10 +429,11 @@ dotrim(const char *string, int stringlen,
stringnchars = 0;
p = string;
len = stringlen;
+ pend = p + len;
while (len > 0)
{
stringchars[stringnchars] = p;
- stringmblen[stringnchars] = mblen = pg_mblen(p);
+ stringmblen[stringnchars] = mblen = pg_mblen_range(p, pend);
stringnchars++;
p += mblen;
len -= mblen;
@@ -440,10 +444,11 @@ dotrim(const char *string, int stringlen,
setnchars = 0;
p = set;
len = setlen;
+ setend = set + setlen;
while (len > 0)
{
setchars[setnchars] = p;
- setmblen[setnchars] = mblen = pg_mblen(p);
+ setmblen[setnchars] = mblen = pg_mblen_range(p, setend);
setnchars++;
p += mblen;
len -= mblen;
@@ -821,6 +826,8 @@ translate(PG_FUNCTION_ARGS)
*to_end;
char *source,
*target;
+ const char *source_end;
+ const char *from_end;
int m,
fromlen,
tolen,
@@ -835,9 +842,11 @@ translate(PG_FUNCTION_ARGS)
if (m <= 0)
PG_RETURN_TEXT_P(string);
source = VARDATA_ANY(string);
+ source_end = source + m;
fromlen = VARSIZE_ANY_EXHDR(from);
from_ptr = VARDATA_ANY(from);
+ from_end = from_ptr + fromlen;
tolen = VARSIZE_ANY_EXHDR(to);
to_ptr = VARDATA_ANY(to);
to_end = to_ptr + tolen;
@@ -861,12 +870,12 @@ translate(PG_FUNCTION_ARGS)
while (m > 0)
{
- source_len = pg_mblen(source);
+ source_len = pg_mblen_range(source, source_end);
from_index = 0;
for (i = 0; i < fromlen; i += len)
{
- len = pg_mblen(&from_ptr[i]);
+ len = pg_mblen_range(&from_ptr[i], from_end);
if (len == source_len &&
memcmp(source, &from_ptr[i], len) == 0)
break;
@@ -882,11 +891,11 @@ translate(PG_FUNCTION_ARGS)
{
if (p >= to_end)
break;
- p += pg_mblen(p);
+ p += pg_mblen_range(p, to_end);
}
if (p < to_end)
{
- len = pg_mblen(p);
+ len = pg_mblen_range(p, to_end);
memcpy(target, p, len);
target += len;
retlen += len;
diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
index 2f96e889595..78f6ea161a0 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -527,11 +527,11 @@ strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
result_size = wchar2char(result, workspace, max_size + 1, loc);
- if (result_size + 1 > destsize)
- return result_size;
-
- memcpy(dest, result, result_size);
- dest[result_size] = '\0';
+ if (destsize >= result_size + 1)
+ {
+ memcpy(dest, result, result_size);
+ dest[result_size] = '\0';
+ }
pfree(workspace);
pfree(result);
@@ -638,11 +638,11 @@ strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
result_size = wchar2char(result, workspace, max_size + 1, loc);
- if (result_size + 1 > destsize)
- return result_size;
-
- memcpy(dest, result, result_size);
- dest[result_size] = '\0';
+ if (destsize >= result_size + 1)
+ {
+ memcpy(dest, result, result_size);
+ dest[result_size] = '\0';
+ }
pfree(workspace);
pfree(result);
@@ -725,11 +725,11 @@ strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
result_size = wchar2char(result, workspace, max_size + 1, loc);
- if (result_size + 1 > destsize)
- return result_size;
-
- memcpy(dest, result, result_size);
- dest[result_size] = '\0';
+ if (destsize >= result_size + 1)
+ {
+ memcpy(dest, result, result_size);
+ dest[result_size] = '\0';
+ }
pfree(workspace);
pfree(result);
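/*
 * The three rewritten branches above all implement the same snprintf-style
 * contract: if destsize is too small, skip the copy but still return the
 * required size (the unchanged "return result_size" at the bottom of each
 * function now serves both cases).  A sketch of the retry pattern this
 * enables, with a hypothetical caller (these static helpers are normally
 * reached via the pg_strlower()/pg_strtitle()/pg_strupper() dispatch):
 *
 *     size_t needed = strlower_libc_mb(dst, dstsize, src, srclen, loc);
 *
 *     if (needed + 1 > dstsize)
 *     {
 *         dst = repalloc(dst, needed + 1);
 *         (void) strlower_libc_mb(dst, needed + 1, src, srclen, loc);
 *     }
 */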
diff --git a/src/backend/utils/adt/pg_upgrade_support.c b/src/backend/utils/adt/pg_upgrade_support.c
index 697143aec44..b505a6b4fee 100644
--- a/src/backend/utils/adt/pg_upgrade_support.c
+++ b/src/backend/utils/adt/pg_upgrade_support.c
@@ -282,11 +282,12 @@ binary_upgrade_set_missing_value(PG_FUNCTION_ARGS)
* upgraded without data loss.
*/
Datum
-binary_upgrade_logical_slot_has_caught_up(PG_FUNCTION_ARGS)
+binary_upgrade_check_logical_slot_pending_wal(PG_FUNCTION_ARGS)
{
Name slot_name;
XLogRecPtr end_of_wal;
- bool found_pending_wal;
+ XLogRecPtr scan_cutoff_lsn;
+ XLogRecPtr last_pending_wal;
CHECK_IS_BINARY_UPGRADE;
@@ -297,6 +298,7 @@ binary_upgrade_logical_slot_has_caught_up(PG_FUNCTION_ARGS)
Assert(has_rolreplication(GetUserId()));
slot_name = PG_GETARG_NAME(0);
+ scan_cutoff_lsn = PG_GETARG_LSN(1);
/* Acquire the given slot */
ReplicationSlotAcquire(NameStr(*slot_name), true, true);
@@ -307,12 +309,16 @@ binary_upgrade_logical_slot_has_caught_up(PG_FUNCTION_ARGS)
Assert(MyReplicationSlot->data.invalidated == RS_INVAL_NONE);
end_of_wal = GetFlushRecPtr(NULL);
- found_pending_wal = LogicalReplicationSlotHasPendingWal(end_of_wal);
+ last_pending_wal = LogicalReplicationSlotCheckPendingWal(end_of_wal,
+ scan_cutoff_lsn);
/* Clean up */
ReplicationSlotRelease();
- PG_RETURN_BOOL(!found_pending_wal);
+ if (XLogRecPtrIsValid(last_pending_wal))
+ PG_RETURN_LSN(last_pending_wal);
+ else
+ PG_RETURN_NULL();
}
/*
diff --git a/src/backend/utils/adt/rangetypes_typanalyze.c b/src/backend/utils/adt/rangetypes_typanalyze.c
index 38d12dedbc5..278d4e6941a 100644
--- a/src/backend/utils/adt/rangetypes_typanalyze.c
+++ b/src/backend/utils/adt/rangetypes_typanalyze.c
@@ -398,7 +398,7 @@ compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
stats->statypid[slot_idx] = FLOAT8OID;
stats->statyplen[slot_idx] = sizeof(float8);
stats->statypbyval[slot_idx] = true;
- stats->statypalign[slot_idx] = 'd';
+ stats->statypalign[slot_idx] = TYPALIGN_DOUBLE;
/* Store the fraction of empty ranges */
emptyfrac = palloc_object(float4);
diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c
index 94cd15bbab1..311b9877bbb 100644
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -443,7 +443,7 @@ parse_re_flags(pg_re_flags *flags, text *opts)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("invalid regular expression option: \"%.*s\"",
- pg_mblen(opt_p + i), opt_p + i)));
+ pg_mblen_range(opt_p + i, opt_p + opt_len), opt_p + i)));
break;
}
}
@@ -673,12 +673,13 @@ textregexreplace(PG_FUNCTION_ARGS)
if (VARSIZE_ANY_EXHDR(opt) > 0)
{
char *opt_p = VARDATA_ANY(opt);
+ const char *end_p = opt_p + VARSIZE_ANY_EXHDR(opt);
if (*opt_p >= '0' && *opt_p <= '9')
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("invalid regular expression option: \"%.*s\"",
- pg_mblen(opt_p), opt_p),
+ pg_mblen_range(opt_p, end_p), opt_p),
errhint("If you meant to use regexp_replace() with a start parameter, cast the fourth argument to integer explicitly.")));
}
@@ -772,6 +773,7 @@ similar_escape_internal(text *pat_text, text *esc_text)
*r;
int plen,
elen;
+ const char *pend;
bool afterescape = false;
int nquotes = 0;
int bracket_depth = 0; /* square bracket nesting level */
@@ -779,6 +781,7 @@ similar_escape_internal(text *pat_text, text *esc_text)
p = VARDATA_ANY(pat_text);
plen = VARSIZE_ANY_EXHDR(pat_text);
+ pend = p + plen;
if (esc_text == NULL)
{
/* No ESCAPE clause provided; default to backslash as escape */
@@ -878,7 +881,7 @@ similar_escape_internal(text *pat_text, text *esc_text)
if (elen > 1)
{
- int mblen = pg_mblen(p);
+ int mblen = pg_mblen_range(p, pend);
if (mblen > 1)
{
diff --git a/src/backend/utils/adt/tsquery.c b/src/backend/utils/adt/tsquery.c
index e3bf1fbbfd7..7e54f36c2a7 100644
--- a/src/backend/utils/adt/tsquery.c
+++ b/src/backend/utils/adt/tsquery.c
@@ -120,7 +120,7 @@ get_modifiers(char *buf, int16 *weight, bool *prefix)
return buf;
buf++;
- while (*buf && pg_mblen(buf) == 1)
+ while (*buf && pg_mblen_cstr(buf) == 1)
{
switch (*buf)
{
@@ -259,12 +259,12 @@ parse_or_operator(TSQueryParserState pstate)
return false;
/* it shouldn't be a part of any word */
- if (t_iseq(ptr, '-') || t_iseq(ptr, '_') || t_isalnum(ptr))
+ if (t_iseq(ptr, '-') || t_iseq(ptr, '_') || t_isalnum_cstr(ptr))
return false;
for (;;)
{
- ptr += pg_mblen(ptr);
+ ptr += pg_mblen_cstr(ptr);
if (*ptr == '\0') /* got end of string without operand */
return false;
@@ -390,7 +390,7 @@ gettoken_query_standard(TSQueryParserState state, int8 *operator,
break;
}
- state->buf += pg_mblen(state->buf);
+ state->buf += pg_mblen_cstr(state->buf);
}
}
@@ -502,7 +502,7 @@ gettoken_query_websearch(TSQueryParserState state, int8 *operator,
break;
}
- state->buf += pg_mblen(state->buf);
+ state->buf += pg_mblen_cstr(state->buf);
}
}
@@ -1014,9 +1014,8 @@ infix(INFIX *in, int parentPriority, bool rightPhraseOp)
*(in->cur) = '\\';
in->cur++;
}
- COPYCHAR(in->cur, op);
- clen = pg_mblen(op);
+ clen = ts_copychar_cstr(in->cur, op);
op += clen;
in->cur += clen;
}
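/*
 * ts_copychar_cstr() is introduced elsewhere in this patch; it is assumed
 * here to fuse the old COPYCHAR/pg_mblen() pair, copying one multibyte
 * character and returning its byte length so the length is computed once,
 * with cstr-style bounds checking.  A sketch of the assumed helper:
 *
 *     static inline int
 *     ts_copychar_cstr(char *dest, const char *src)
 *     {
 *         int len = pg_mblen_cstr(src);
 *
 *         memcpy(dest, src, len);
 *         return len;
 *     }
 */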
diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c
index 38342298a5d..024f5160cd4 100644
--- a/src/backend/utils/adt/tsvector.c
+++ b/src/backend/utils/adt/tsvector.c
@@ -319,9 +319,9 @@ tsvectorout(PG_FUNCTION_ARGS)
lenbuf = 0,
pp;
WordEntry *ptr = ARRPTR(out);
- char *curbegin,
- *curin,
+ char *curin,
*curout;
+ const char *curend;
lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
for (i = 0; i < out->size; i++)
@@ -334,13 +334,14 @@ tsvectorout(PG_FUNCTION_ARGS)
curout = outbuf = (char *) palloc(lenbuf);
for (i = 0; i < out->size; i++)
{
- curbegin = curin = STRPTR(out) + ptr->pos;
+ curin = STRPTR(out) + ptr->pos;
+ curend = curin + ptr->len;
if (i != 0)
*curout++ = ' ';
*curout++ = '\'';
- while (curin - curbegin < ptr->len)
+ while (curin < curend)
{
- int len = pg_mblen(curin);
+ int len = pg_mblen_range(curin, curend);
if (t_iseq(curin, '\''))
*curout++ = '\'';
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
index 94e0fed8309..71c7c7d3b3c 100644
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -2604,11 +2604,15 @@ ts_stat_sql(MemoryContext persistentContext, text *txt, text *ws)
if (ws)
{
char *buf;
+ const char *end;
buf = VARDATA_ANY(ws);
- while (buf - VARDATA_ANY(ws) < VARSIZE_ANY_EXHDR(ws))
+ end = buf + VARSIZE_ANY_EXHDR(ws);
+ while (buf < end)
{
- if (pg_mblen(buf) == 1)
+ int len = pg_mblen_range(buf, end);
+
+ if (len == 1)
{
switch (*buf)
{
@@ -2632,7 +2636,7 @@ ts_stat_sql(MemoryContext persistentContext, text *txt, text *ws)
stat->weight |= 0;
}
}
- buf += pg_mblen(buf);
+ buf += len;
}
}
diff --git a/src/backend/utils/adt/tsvector_parser.c b/src/backend/utils/adt/tsvector_parser.c
index b3c04f6344f..efeaeb55334 100644
--- a/src/backend/utils/adt/tsvector_parser.c
+++ b/src/backend/utils/adt/tsvector_parser.c
@@ -208,8 +208,7 @@ gettoken_tsvector(TSVectorParseState state,
PRSSYNTAXERROR;
else if (!isspace((unsigned char) *state->prsbuf))
{
- COPYCHAR(curpos, state->prsbuf);
- curpos += pg_mblen(state->prsbuf);
+ curpos += ts_copychar_cstr(curpos, state->prsbuf);
statecode = WAITENDWORD;
}
}
@@ -223,8 +222,7 @@ gettoken_tsvector(TSVectorParseState state,
else
{
RESIZEPRSBUF;
- COPYCHAR(curpos, state->prsbuf);
- curpos += pg_mblen(state->prsbuf);
+ curpos += ts_copychar_cstr(curpos, state->prsbuf);
Assert(oldstate != 0);
statecode = oldstate;
}
@@ -259,8 +257,7 @@ gettoken_tsvector(TSVectorParseState state,
else
{
RESIZEPRSBUF;
- COPYCHAR(curpos, state->prsbuf);
- curpos += pg_mblen(state->prsbuf);
+ curpos += ts_copychar_cstr(curpos, state->prsbuf);
}
}
else if (statecode == WAITENDCMPLX)
@@ -279,8 +276,7 @@ gettoken_tsvector(TSVectorParseState state,
else
{
RESIZEPRSBUF;
- COPYCHAR(curpos, state->prsbuf);
- curpos += pg_mblen(state->prsbuf);
+ curpos += ts_copychar_cstr(curpos, state->prsbuf);
}
}
else if (statecode == WAITCHARCMPLX)
@@ -288,8 +284,7 @@ gettoken_tsvector(TSVectorParseState state,
if (!state->is_web && t_iseq(state->prsbuf, '\''))
{
RESIZEPRSBUF;
- COPYCHAR(curpos, state->prsbuf);
- curpos += pg_mblen(state->prsbuf);
+ curpos += ts_copychar_cstr(curpos, state->prsbuf);
statecode = WAITENDCMPLX;
}
else
@@ -300,7 +295,7 @@ gettoken_tsvector(TSVectorParseState state,
PRSSYNTAXERROR;
if (state->oprisdelim)
{
- /* state->prsbuf+=pg_mblen(state->prsbuf); */
+ /* state->prsbuf+=pg_mblen_cstr(state->prsbuf); */
RETURN_TOKEN;
}
else
@@ -383,6 +378,6 @@ gettoken_tsvector(TSVectorParseState state,
statecode);
/* get next char */
- state->prsbuf += pg_mblen(state->prsbuf);
+ state->prsbuf += pg_mblen_cstr(state->prsbuf);
}
}
diff --git a/src/backend/utils/adt/varbit.c b/src/backend/utils/adt/varbit.c
index 50ffee679b9..65ad1bfe18f 100644
--- a/src/backend/utils/adt/varbit.c
+++ b/src/backend/utils/adt/varbit.c
@@ -232,7 +232,7 @@ bit_in(PG_FUNCTION_ARGS)
ereturn(escontext, (Datum) 0,
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
errmsg("\"%.*s\" is not a valid binary digit",
- pg_mblen(sp), sp)));
+ pg_mblen_cstr(sp), sp)));
x >>= 1;
if (x == 0)
@@ -257,7 +257,7 @@ bit_in(PG_FUNCTION_ARGS)
ereturn(escontext, (Datum) 0,
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
errmsg("\"%.*s\" is not a valid hexadecimal digit",
- pg_mblen(sp), sp)));
+ pg_mblen_cstr(sp), sp)));
if (bc)
{
@@ -533,7 +533,7 @@ varbit_in(PG_FUNCTION_ARGS)
ereturn(escontext, (Datum) 0,
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
errmsg("\"%.*s\" is not a valid binary digit",
- pg_mblen(sp), sp)));
+ pg_mblen_cstr(sp), sp)));
x >>= 1;
if (x == 0)
@@ -558,7 +558,7 @@ varbit_in(PG_FUNCTION_ARGS)
ereturn(escontext, (Datum) 0,
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
errmsg("\"%.*s\" is not a valid hexadecimal digit",
- pg_mblen(sp), sp)));
+ pg_mblen_cstr(sp), sp)));
if (bc)
{
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 6c1ebb0866d..6bb14620a63 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -494,8 +494,11 @@ text_catenate(text *t1, text *t2)
* charlen_to_bytelen()
* Compute the number of bytes occupied by n characters starting at *p
*
- * It is caller's responsibility that there actually are n characters;
- * the string need not be null-terminated.
+ * The caller shall ensure there are n complete characters.  Callers achieve
+ * this by deriving "n" from regmatch_t results obtained by searching a wchar
+ * array.
+ * pg_mb2wchar_with_len() skips any trailing incomplete character, so regex
+ * matches will end no later than the last complete character. (The string
+ * need not be null-terminated.)
*/
static int
charlen_to_bytelen(const char *p, int n)
@@ -510,7 +513,7 @@ charlen_to_bytelen(const char *p, int n)
const char *s;
for (s = p; n > 0; n--)
- s += pg_mblen(s);
+ s += pg_mblen_unbounded(s); /* caller verified encoding */
return s - p;
}
@@ -644,6 +647,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
int32 slice_start;
int32 slice_size;
int32 slice_strlen;
+ int32 slice_len;
text *slice;
int32 E1;
int32 i;
@@ -713,7 +717,8 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
slice = (text *) DatumGetPointer(str);
/* see if we got back an empty string */
- if (VARSIZE_ANY_EXHDR(slice) == 0)
+ slice_len = VARSIZE_ANY_EXHDR(slice);
+ if (slice_len == 0)
{
if (slice != (text *) DatumGetPointer(str))
pfree(slice);
@@ -722,7 +727,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
/* Now we can get the actual length of the slice in MB characters */
slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
- VARSIZE_ANY_EXHDR(slice));
+ slice_len);
/*
* Check that the start position wasn't > slice_strlen. If so, SQL99
@@ -749,7 +754,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
*/
p = VARDATA_ANY(slice);
for (i = 0; i < S1 - 1; i++)
- p += pg_mblen(p);
+ p += pg_mblen_unbounded(p);
/* hang onto a pointer to our start position */
s = p;
@@ -759,7 +764,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
* length.
*/
for (i = S1; i < E1; i++)
- p += pg_mblen(p);
+ p += pg_mblen_unbounded(p);
ret = (text *) palloc(VARHDRSZ + (p - s));
SET_VARSIZE(ret, VARHDRSZ + (p - s));
@@ -1064,6 +1069,8 @@ text_position_next(TextPositionState *state)
*/
if (state->is_multibyte_char_in_char && state->locale->deterministic)
{
+ const char *haystack_end = state->str1 + state->len1;
+
/* Walk one character at a time, until we reach the match. */
/* the search should never move backwards. */
@@ -1072,7 +1079,7 @@ text_position_next(TextPositionState *state)
while (state->refpoint < matchptr)
{
/* step to next character. */
- state->refpoint += pg_mblen(state->refpoint);
+ state->refpoint += pg_mblen_range(state->refpoint, haystack_end);
state->refpos++;
/*
@@ -1160,7 +1167,7 @@ text_position_next_internal(char *start_ptr, TextPositionState *state)
test_end = hptr;
do
{
- test_end += pg_mblen(test_end);
+ test_end += pg_mblen_range(test_end, haystack_end);
if (pg_strncoll(hptr, (test_end - hptr), needle, needle_len, state->locale) == 0)
{
state->last_match_len_tmp = (test_end - hptr);
@@ -1173,7 +1180,7 @@ text_position_next_internal(char *start_ptr, TextPositionState *state)
if (result_hptr)
break;
- hptr += pg_mblen(hptr);
+ hptr += pg_mblen_range(hptr, haystack_end);
}
return (char *) result_hptr;
@@ -3767,6 +3774,8 @@ split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
}
else
{
+ const char *end_ptr;
+
/*
* When fldsep is NULL, each character in the input string becomes a
* separate element in the result set. The separator is effectively
@@ -3775,10 +3784,11 @@ split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
start_ptr = VARDATA_ANY(inputstring);
+ end_ptr = start_ptr + inputstring_len;
while (inputstring_len > 0)
{
- int chunk_len = pg_mblen(start_ptr);
+ int chunk_len = pg_mblen_range(start_ptr, end_ptr);
CHECK_FOR_INTERRUPTS();
@@ -3898,6 +3908,7 @@ array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
int typlen;
bool typbyval;
char typalign;
+ uint8 typalignby;
StringInfoData buf;
bool printed = false;
char *p;
@@ -3947,6 +3958,7 @@ array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
typlen = my_extra->typlen;
typbyval = my_extra->typbyval;
typalign = my_extra->typalign;
+ typalignby = typalign_to_alignby(typalign);
p = ARR_DATA_PTR(v);
bitmap = ARR_NULLBITMAP(v);
@@ -3983,7 +3995,7 @@ array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
printed = true;
p = att_addlength_pointer(p, typlen, p);
- p = (char *) att_align_nominal(p, typalign);
+ p = (char *) att_nominal_alignby(p, typalignby);
}
/* advance bitmap pointer if any */
@@ -4682,7 +4694,7 @@ text_reverse(PG_FUNCTION_ARGS)
{
int sz;
- sz = pg_mblen(p);
+ sz = pg_mblen_range(p, endp);
dst -= sz;
memcpy(dst, p, sz);
p += sz;
@@ -4843,7 +4855,7 @@ text_format(PG_FUNCTION_ARGS)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("unrecognized format() type specifier \"%.*s\"",
- pg_mblen(cp), cp),
+ pg_mblen_range(cp, end_ptr), cp),
errhint("For a single \"%%\" use \"%%%%\".")));
/* If indirect width was specified, get its value */
@@ -4964,7 +4976,7 @@ text_format(PG_FUNCTION_ARGS)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("unrecognized format() type specifier \"%.*s\"",
- pg_mblen(cp), cp),
+ pg_mblen_range(cp, end_ptr), cp),
errhint("For a single \"%%\" use \"%%%%\".")));
break;
}
diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c
index f69dc68286c..fcb13e7c0a1 100644
--- a/src/backend/utils/adt/xml.c
+++ b/src/backend/utils/adt/xml.c
@@ -2376,8 +2376,7 @@ sqlchar_to_unicode(const char *s)
char *utf8string;
pg_wchar ret[2]; /* need space for trailing zero */
- /* note we're not assuming s is null-terminated */
- utf8string = pg_server_to_any(s, pg_mblen(s), PG_UTF8);
+ utf8string = pg_server_to_any(s, pg_mblen_cstr(s), PG_UTF8);
pg_encoding_mb2wchar_with_len(PG_UTF8, utf8string, ret,
pg_encoding_mblen(PG_UTF8, utf8string));
@@ -2430,7 +2429,7 @@ map_sql_identifier_to_xml_name(const char *ident, bool fully_escaped,
initStringInfo(&buf);
- for (p = ident; *p; p += pg_mblen(p))
+ for (p = ident; *p; p += pg_mblen_cstr(p))
{
if (*p == ':' && (p == ident || fully_escaped))
appendStringInfoString(&buf, "_x003A_");
@@ -2455,7 +2454,7 @@ map_sql_identifier_to_xml_name(const char *ident, bool fully_escaped,
: !is_valid_xml_namechar(u))
appendStringInfo(&buf, "_x%04X_", (unsigned int) u);
else
- appendBinaryStringInfo(&buf, p, pg_mblen(p));
+ appendBinaryStringInfo(&buf, p, pg_mblen_cstr(p));
}
}
@@ -2478,7 +2477,7 @@ map_xml_name_to_sql_identifier(const char *name)
initStringInfo(&buf);
- for (p = name; *p; p += pg_mblen(p))
+ for (p = name; *p; p += pg_mblen_cstr(p))
{
if (*p == '_' && *(p + 1) == 'x'
&& isxdigit((unsigned char) *(p + 2))
@@ -2496,7 +2495,7 @@ map_xml_name_to_sql_identifier(const char *name)
p += 6;
}
else
- appendBinaryStringInfo(&buf, p, pg_mblen(p));
+ appendBinaryStringInfo(&buf, p, pg_mblen_cstr(p));
}
return buf.data;
diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c
index aa530d3685e..129906e2daa 100644
--- a/src/backend/utils/error/elog.c
+++ b/src/backend/utils/error/elog.c
@@ -182,6 +182,7 @@ static bool matches_backtrace_functions(const char *funcname);
static pg_noinline void set_backtrace(ErrorData *edata, int num_skip);
static void set_errdata_field(MemoryContextData *cxt, char **ptr, const char *str);
static void FreeErrorDataContents(ErrorData *edata);
+static int log_min_messages_cmp(const ListCell *a, const ListCell *b);
static void write_console(const char *line, int len);
static const char *process_log_prefix_padding(const char *p, int *ppadding);
static void log_line_prefix(StringInfo buf, ErrorData *edata);
@@ -235,7 +236,7 @@ is_log_level_output(int elevel, int log_min_level)
static inline bool
should_output_to_server(int elevel)
{
- return is_log_level_output(elevel, log_min_messages);
+ return is_log_level_output(elevel, log_min_messages[MyBackendType]);
}
/*
@@ -2170,6 +2171,250 @@ DebugFileOpen(void)
}
+/*
+ * GUC check_hook for log_min_messages
+ *
+ * This value is parsed as a comma-separated list of zero or more TYPE:LEVEL
+ * elements. For each element, TYPE corresponds to a bk_category value (see
+ * postmaster/proctypelist.h); LEVEL is one of server_message_level_options.
+ *
+ * In addition, there must be a single LEVEL element (with no TYPE part)
+ * which sets the default level for process types that aren't specified.
+ */
+bool
+check_log_min_messages(char **newval, void **extra, GucSource source)
+{
+ char *rawstring;
+ List *elemlist;
+ StringInfoData buf;
+ char *result;
+ int newlevel[BACKEND_NUM_TYPES];
+ bool assigned[BACKEND_NUM_TYPES] = {0};
+ int genericlevel = -1; /* -1 means not assigned */
+
+ const char *const process_types[] = {
+#define PG_PROCTYPE(bktype, bkcategory, description, main_func, shmem_attach) \
+ [bktype] = bkcategory,
+#include "postmaster/proctypelist.h"
+#undef PG_PROCTYPE
+ };
+
+ /* Need a modifiable copy of string. */
+ rawstring = guc_strdup(LOG, *newval);
+ if (rawstring == NULL)
+ return false;
+
+ /* Parse the string into a list. */
+ if (!SplitGUCList(rawstring, ',', &elemlist))
+ {
+ /* syntax error in list */
+ GUC_check_errdetail("List syntax is invalid.");
+ list_free(elemlist);
+ guc_free(rawstring);
+ return false;
+ }
+
+ /* Validate and assign log level and process type. */
+ foreach_ptr(char, elem, elemlist)
+ {
+ char *sep = strchr(elem, ':');
+
+ /*
+ * If there's no ':' separator in the entry, this is the default log
+ * level. Otherwise it's a process type-specific entry.
+ */
+ if (sep == NULL)
+ {
+ const struct config_enum_entry *entry;
+ bool found;
+
+ /* Reject duplicates for generic log level. */
+ if (genericlevel != -1)
+ {
+ GUC_check_errdetail("Redundant specification of default log level.");
+ goto lmm_fail;
+ }
+
+ /* Validate the log level */
+ found = false;
+ for (entry = server_message_level_options; entry && entry->name; entry++)
+ {
+ if (pg_strcasecmp(entry->name, elem) == 0)
+ {
+ genericlevel = entry->val;
+ found = true;
+ break;
+ }
+ }
+
+ if (!found)
+ {
+ GUC_check_errdetail("Unrecognized log level: \"%s\".", elem);
+ goto lmm_fail;
+ }
+ }
+ else
+ {
+ char *loglevel = sep + 1;
+ char *ptype = elem;
+ bool found;
+ int level;
+ const struct config_enum_entry *entry;
+
+ /*
+ * Temporarily clobber the ':' with a string terminator, so that
+ * we can validate it. We restore this at the bottom.
+ */
+ *sep = '\0';
+
+ /* Validate the log level */
+ found = false;
+ for (entry = server_message_level_options; entry && entry->name; entry++)
+ {
+ if (pg_strcasecmp(entry->name, loglevel) == 0)
+ {
+ level = entry->val;
+ found = true;
+ break;
+ }
+ }
+
+ if (!found)
+ {
+ GUC_check_errdetail("Unrecognized log level for process type \"%s\": \"%s\".",
+ ptype, loglevel);
+ goto lmm_fail;
+ }
+
+ /* Is the process type name valid and unique? */
+ found = false;
+ for (int i = 0; i < BACKEND_NUM_TYPES; i++)
+ {
+ if (pg_strcasecmp(process_types[i], ptype) == 0)
+ {
+ /* Reject duplicates for a process type. */
+ if (assigned[i])
+ {
+ GUC_check_errdetail("Redundant log level specification for process type \"%s\".",
+ ptype);
+ goto lmm_fail;
+ }
+
+ newlevel[i] = level;
+ assigned[i] = true;
+ found = true;
+
+ /*
+ * Note: we must keep looking, since several entries in
+ * proctypelist.h can share the same category name.
+ */
+ }
+ }
+
+ if (!found)
+ {
+ GUC_check_errdetail("Unrecognized process type \"%s\".", ptype);
+ goto lmm_fail;
+ }
+
+ /* Put the separator back in place */
+ *sep = ':';
+ }
+
+ /* all good */
+ continue;
+
+lmm_fail:
+ guc_free(rawstring);
+ list_free(elemlist);
+ return false;
+ }
+
+ /*
+ * The generic log level must be specified. It is the fallback value.
+ */
+ if (genericlevel == -1)
+ {
+ GUC_check_errdetail("Default log level was not defined.");
+ guc_free(rawstring);
+ list_free(elemlist);
+ return false;
+ }
+
+ /* Apply the default log level to all processes not listed. */
+ for (int i = 0; i < BACKEND_NUM_TYPES; i++)
+ {
+ if (!assigned[i])
+ newlevel[i] = genericlevel;
+ }
+
+ /*
+ * Save an ordered representation of the user-specified string, for the
+ * show_hook.
+ */
+ list_sort(elemlist, log_min_messages_cmp);
+
+ initStringInfoExt(&buf, strlen(rawstring) + 1);
+ foreach_ptr(char, elem, elemlist)
+ {
+ if (foreach_current_index(elem) == 0)
+ appendStringInfoString(&buf, elem);
+ else
+ appendStringInfo(&buf, ", %s", elem);
+ }
+
+ result = (char *) guc_malloc(LOG, buf.len + 1);
+ if (!result)
+ return false;
+ memcpy(result, buf.data, buf.len);
+ result[buf.len] = '\0';
+
+ guc_free(*newval);
+ *newval = result;
+
+ guc_free(rawstring);
+ list_free(elemlist);
+ pfree(buf.data);
+
+ /*
+ * Pass back data for assign_log_min_messages to use.
+ */
+ *extra = guc_malloc(LOG, BACKEND_NUM_TYPES * sizeof(int));
+ if (!*extra)
+ return false;
+ memcpy(*extra, newlevel, BACKEND_NUM_TYPES * sizeof(int));
+
+ return true;
+}
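/*
 * Examples of settings accepted and rejected by the rules above (all
 * hypothetical):
 *
 *     log_min_messages = 'warning'                        default only: OK
 *     log_min_messages = 'checkpointer:debug2, warning'   override + default: OK
 *     log_min_messages = 'checkpointer:debug2'            no default: rejected
 *     log_min_messages = 'warning, error'                 two defaults: rejected
 */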
+
+/*
+ * list_sort() callback for check_log_min_messages. The default element
+ * goes first; the rest are sorted with strcmp(), i.e. primarily by the
+ * process type prefix.
+ */
+static int
+log_min_messages_cmp(const ListCell *a, const ListCell *b)
+{
+ const char *s = lfirst(a);
+ const char *t = lfirst(b);
+
+ if (strchr(s, ':') == NULL)
+ return -1;
+ else if (strchr(t, ':') == NULL)
+ return 1;
+ else
+ return strcmp(s, t);
+}
+
+/*
+ * GUC assign_hook for log_min_messages
+ */
+void
+assign_log_min_messages(const char *newval, void *extra)
+{
+ for (int i = 0; i < BACKEND_NUM_TYPES; i++)
+ log_min_messages[i] = ((int *) extra)[i];
+}
+
/*
* GUC check_hook for backtrace_functions
*
@@ -2779,7 +3024,12 @@ get_backend_type_for_log(void)
if (MyProcPid == PostmasterPid)
backend_type_str = "postmaster";
else if (MyBackendType == B_BG_WORKER)
- backend_type_str = MyBgworkerEntry->bgw_type;
+ {
+ if (MyBgworkerEntry)
+ backend_type_str = MyBgworkerEntry->bgw_type;
+ else
+ backend_type_str = "early bgworker";
+ }
else
backend_type_str = GetBackendTypeDesc(MyBackendType);
diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c
index 563f20374ff..03f6c8479f2 100644
--- a/src/backend/utils/init/miscinit.c
+++ b/src/backend/utils/init/miscinit.c
@@ -266,7 +266,7 @@ GetBackendTypeDesc(BackendType backendType)
switch (backendType)
{
-#define PG_PROCTYPE(bktype, description, main_func, shmem_attach) \
+#define PG_PROCTYPE(bktype, bkcategory, description, main_func, shmem_attach) \
case bktype: backendDesc = description; break;
#include "postmaster/proctypelist.h"
#undef PG_PROCTYPE
diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c
index 6950e743d03..a5a734839af 100644
--- a/src/backend/utils/mb/mbutils.c
+++ b/src/backend/utils/mb/mbutils.c
@@ -38,6 +38,7 @@
#include "catalog/namespace.h"
#include "mb/pg_wchar.h"
#include "utils/fmgrprotos.h"
+#include "utils/memdebug.h"
#include "utils/memutils.h"
#include "utils/relcache.h"
#include "varatt.h"
@@ -97,6 +98,13 @@ static char *perform_default_encoding_conversion(const char *src,
int len, bool is_client_to_server);
static int cliplen(const char *str, int len, int limit);
+pg_noreturn
+static void report_invalid_encoding_int(int encoding, const char *mbstr,
+ int mblen, int len);
+
+pg_noreturn
+static void report_invalid_encoding_db(const char *mbstr, int mblen, int len);
+
/*
* Prepare for a future call to SetClientEncoding. Success should mean
@@ -1021,11 +1029,126 @@ pg_encoding_wchar2mb_with_len(int encoding,
return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
}
-/* returns the byte length of a multibyte character */
+/*
+ * Returns the byte length of a multibyte character sequence in a
+ * null-terminated string. Raises an illegal byte sequence error if the
+ * sequence would hit a null terminator.
+ *
+ * The caller is expected to have checked that *mbstr is not already the
+ * NUL terminator before calling, but some callers want 1 in that case, so
+ * this function continues that tradition.
+ *
+ * This must only be used for strings that have a null-terminator to enable
+ * bounds detection.
+ */
+int
+pg_mblen_cstr(const char *mbstr)
+{
+ int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
+
+ /*
+ * The .mblen functions return 1 when given a pointer to a terminator.
+ * Some callers depend on that, so we tolerate it for now. Well-behaved
+ * callers check the leading byte for a terminator *before* calling.
+ */
+ for (int i = 1; i < length; ++i)
+ if (unlikely(mbstr[i] == 0))
+ report_invalid_encoding_db(mbstr, length, i);
+
+ /*
+ * String should be NUL-terminated, but checking that would make typical
+ * callers O(N^2), tripling Valgrind check-world time. Unless
+ * VALGRIND_EXPENSIVE, check 1 byte after each actual character. (If we
+ * found a character, not a terminator, the next byte must be a terminator
+ * or the start of the next character.) If the caller iterates the whole
+ * string, the last call will diagnose a missing terminator.
+ */
+ if (mbstr[0] != '\0')
+ {
+#ifdef VALGRIND_EXPENSIVE
+ VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, strlen(mbstr));
+#else
+ VALGRIND_CHECK_MEM_IS_DEFINED(mbstr + length, 1);
+#endif
+ }
+
+ return length;
+}
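/*
 * A sketch of the expected calling pattern (hypothetical caller): test the
 * leading byte for the terminator first, as well-behaved callers do, and let
 * pg_mblen_cstr() diagnose NULs embedded inside a multibyte sequence:
 *
 *     int nchars = 0;
 *
 *     while (*str != '\0')
 *     {
 *         str += pg_mblen_cstr(str);
 *         nchars++;
 *     }
 */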
+
+/*
+ * Returns the byte length of a multibyte character sequence bounded by a range
+ * [mbstr, end) of at least one byte in size. Raises an illegal byte sequence
+ * error if the sequence would exceed the range.
+ */
+int
+pg_mblen_range(const char *mbstr, const char *end)
+{
+ int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
+
+ Assert(end > mbstr);
+#ifdef VALGRIND_EXPENSIVE
+ VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, end - mbstr);
+#else
+ VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length);
+#endif
+
+ if (unlikely(mbstr + length > end))
+ report_invalid_encoding_db(mbstr, length, end - mbstr);
+
+ return length;
+}
+
+/*
+ * Returns the byte length of a multibyte character sequence bounded by a range
+ * extending for 'limit' bytes, which must be at least one. Raises an illegal
+ * byte sequence error if the sequence would exceed the range.
+ */
+int
+pg_mblen_with_len(const char *mbstr, int limit)
+{
+ int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
+
+ Assert(limit >= 1);
+#ifdef VALGRIND_EXPENSIVE
+ VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, limit);
+#else
+ VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length);
+#endif
+
+ if (unlikely(length > limit))
+ report_invalid_encoding_db(mbstr, length, limit);
+
+ return length;
+}
+
+/*
+ * Returns the length of a multibyte character sequence, without any
+ * validation of bounds.
+ *
+ * PLEASE NOTE: This function can only be used safely if the caller has
+ * already verified the input string, since otherwise there is a risk of
+ * overrunning the buffer if the string is invalid. A prior call to a
+ * pg_mbstrlen* function suffices.
+ */
+int
+pg_mblen_unbounded(const char *mbstr)
+{
+ int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
+
+ VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length);
+
+ return length;
+}
+
+/*
+ * Historical name for pg_mblen_unbounded(). Should not be used and will be
+ * removed in a later version.
+ */
int
pg_mblen(const char *mbstr)
{
- return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
+ return pg_mblen_unbounded(mbstr);
}
/* returns the display length of a multibyte character */
@@ -1047,14 +1170,14 @@ pg_mbstrlen(const char *mbstr)
while (*mbstr)
{
- mbstr += pg_mblen(mbstr);
+ mbstr += pg_mblen_cstr(mbstr);
len++;
}
return len;
}
/* returns the length (counted in wchars) of a multibyte string
- * (not necessarily NULL terminated)
+ * (stops after "limit" bytes or at a NUL terminator, whichever comes first)
*/
int
pg_mbstrlen_with_len(const char *mbstr, int limit)
@@ -1067,7 +1190,7 @@ pg_mbstrlen_with_len(const char *mbstr, int limit)
while (limit > 0 && *mbstr)
{
- int l = pg_mblen(mbstr);
+ int l = pg_mblen_with_len(mbstr, limit);
limit -= l;
mbstr += l;
@@ -1137,7 +1260,7 @@ pg_mbcharcliplen(const char *mbstr, int len, int limit)
while (len > 0 && *mbstr)
{
- l = pg_mblen(mbstr);
+ l = pg_mblen_with_len(mbstr, len);
nch++;
if (nch > limit)
break;
@@ -1701,12 +1824,19 @@ void
report_invalid_encoding(int encoding, const char *mbstr, int len)
{
int l = pg_encoding_mblen_or_incomplete(encoding, mbstr, len);
+
+ report_invalid_encoding_int(encoding, mbstr, l, len);
+}
+
+static void
+report_invalid_encoding_int(int encoding, const char *mbstr, int mblen, int len)
+{
char buf[8 * 5 + 1];
char *p = buf;
int j,
jlimit;
- jlimit = Min(l, len);
+ jlimit = Min(mblen, len);
jlimit = Min(jlimit, 8); /* prevent buffer overrun */
for (j = 0; j < jlimit; j++)
@@ -1723,6 +1853,12 @@ report_invalid_encoding(int encoding, const char *mbstr, int len)
buf)));
}
+static void
+report_invalid_encoding_db(const char *mbstr, int mblen, int len)
+{
+ report_invalid_encoding_int(GetDatabaseEncoding(), mbstr, mblen, len);
+}
+
/*
* report_untranslatable_char: complain about untranslatable character
*
diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat
index f0260e6e412..762b8efe6b0 100644
--- a/src/backend/utils/misc/guc_parameters.dat
+++ b/src/backend/utils/misc/guc_parameters.dat
@@ -1042,6 +1042,13 @@
options => 'file_copy_method_options',
},
+{ name => 'file_extend_method', type => 'enum', context => 'PGC_SIGHUP', group => 'RESOURCES_DISK',
+ short_desc => 'Selects the method used for extending data files.',
+ variable => 'file_extend_method',
+ boot_val => 'DEFAULT_FILE_EXTEND_METHOD',
+ options => 'file_extend_method_options',
+},
+
{ name => 'from_collapse_limit', type => 'int', context => 'PGC_USERSET', group => 'QUERY_TUNING_OTHER',
short_desc => 'Sets the FROM-list size beyond which subqueries are not collapsed.',
long_desc => 'The planner will merge subqueries into upper queries if the resulting FROM list would have no more than this many items.',
@@ -1686,12 +1693,14 @@
options => 'server_message_level_options',
},
-{ name => 'log_min_messages', type => 'enum', context => 'PGC_SUSET', group => 'LOGGING_WHEN',
+{ name => 'log_min_messages', type => 'string', context => 'PGC_SUSET', group => 'LOGGING_WHEN',
short_desc => 'Sets the message levels that are logged.',
long_desc => 'Each level includes all the levels that follow it. The later the level, the fewer messages are sent.',
- variable => 'log_min_messages',
- boot_val => 'WARNING',
- options => 'server_message_level_options',
+ flags => 'GUC_LIST_INPUT',
+ variable => 'log_min_messages_string',
+ boot_val => '"WARNING"',
+ check_hook => 'check_log_min_messages',
+ assign_hook => 'assign_log_min_messages',
},
{ name => 'log_parameter_max_length', type => 'int', context => 'PGC_SUSET', group => 'LOGGING_WHAT',
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 13c569d8790..741fce8dede 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -80,6 +80,7 @@
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "storage/copydir.h"
+#include "storage/fd.h"
#include "storage/io_worker.h"
#include "storage/large_object.h"
#include "storage/pg_shmem.h"
@@ -146,7 +147,7 @@ static const struct config_enum_entry client_message_level_options[] = {
{NULL, 0, false}
};
-static const struct config_enum_entry server_message_level_options[] = {
+const struct config_enum_entry server_message_level_options[] = {
{"debug5", DEBUG5, false},
{"debug4", DEBUG4, false},
{"debug3", DEBUG3, false},
@@ -491,6 +492,14 @@ static const struct config_enum_entry file_copy_method_options[] = {
{NULL, 0, false}
};
+static const struct config_enum_entry file_extend_method_options[] = {
+#ifdef HAVE_POSIX_FALLOCATE
+ {"posix_fallocate", FILE_EXTEND_METHOD_POSIX_FALLOCATE, false},
+#endif
+ {"write_zeros", FILE_EXTEND_METHOD_WRITE_ZEROS, false},
+ {NULL, 0, false}
+};
+
/*
* Options for enum values stored in other modules
*/
@@ -537,7 +546,6 @@ static bool standard_conforming_strings = true;
bool current_role_is_superuser;
int log_min_error_statement = ERROR;
-int log_min_messages = WARNING;
int client_min_messages = NOTICE;
int log_min_duration_sample = -1;
int log_min_duration_statement = -1;
@@ -595,6 +603,7 @@ static char *server_version_string;
static int server_version_num;
static char *debug_io_direct_string;
static char *restrict_nonsystem_relation_kind_string;
+static char *log_min_messages_string;
#ifdef HAVE_SYSLOG
#define DEFAULT_SYSLOG_FACILITY LOG_LOCAL0
@@ -647,6 +656,15 @@ char *role_string;
/* should be static, but guc.c needs to get at this */
bool in_hot_standby_guc;
+/*
+ * set default log_min_messages to WARNING for all process types
+ */
+int log_min_messages[] = {
+#define PG_PROCTYPE(bktype, bkcategory, description, main_func, shmem_attach) \
+ [bktype] = WARNING,
+#include "postmaster/proctypelist.h"
+#undef PG_PROCTYPE
+};
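/*
 * The X-macro expands to one designated initializer per backend type, e.g.
 * (illustrative subset of the real expansion):
 *
 *     int log_min_messages[] = {
 *         [B_BACKEND] = WARNING,
 *         [B_AUTOVAC_WORKER] = WARNING,
 *         [B_CHECKPOINTER] = WARNING,
 *         ...
 *     };
 */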
/*
* Displayable names for context types (enum GucContext)
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index c4f92fcdac8..6e82c8e055d 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -179,6 +179,10 @@
# in kilobytes, or -1 for no limit
#file_copy_method = copy # copy, clone (if supported by OS)
+#file_extend_method = posix_fallocate # the default is the first option supported
+ # by the operating system:
+ # posix_fallocate (most Unix-like systems)
+ # write_zeros
#max_notify_queue_pages = 1048576 # limits the number of SLRU pages allocated
# for NOTIFY / LISTEN queue
@@ -528,7 +532,21 @@
# - When to Log -
-#log_min_messages = warning # values in order of decreasing detail:
+#log_min_messages = warning # comma-separated list of
+ # process_type:level entries, plus
+ # one freestanding level as default.
+ # Valid process types are:
+ # archiver autovacuum
+ # backend bgworker
+ # bgwriter checkpointer
+ # ioworker postmaster
+ # slotsyncworker startup
+ # syslogger walreceiver
+ # walsender walsummarizer
+ # walwriter
+ #
+ # Level values in order of decreasing
+ # detail:
# debug5
# debug4
# debug3
diff --git a/src/backend/utils/mmgr/portalmem.c b/src/backend/utils/mmgr/portalmem.c
index 4fa4d432021..c1a53e658cb 100644
--- a/src/backend/utils/mmgr/portalmem.c
+++ b/src/backend/utils/mmgr/portalmem.c
@@ -294,9 +294,8 @@ PortalDefineQuery(Portal portal,
portal->prepStmtName = prepStmtName;
portal->sourceText = sourceText;
- portal->qc.commandTag = commandTag;
- portal->qc.nprocessed = 0;
portal->commandTag = commandTag;
+ SetQueryCompletion(&portal->qc, commandTag, 0);
portal->stmts = stmts;
portal->cplan = cplan;
portal->status = PORTAL_DEFINED;
diff --git a/src/backend/utils/sort/sharedtuplestore.c b/src/backend/utils/sort/sharedtuplestore.c
index 8f35a255263..04189f708fa 100644
--- a/src/backend/utils/sort/sharedtuplestore.c
+++ b/src/backend/utils/sort/sharedtuplestore.c
@@ -323,7 +323,8 @@ sts_puttuple(SharedTuplestoreAccessor *accessor, void *meta_data,
/* Do we have space? */
size = accessor->sts->meta_data_size + tuple->t_len;
- if (accessor->write_pointer + size > accessor->write_end)
+ if (accessor->write_pointer == NULL ||
+ accessor->write_pointer + size > accessor->write_end)
{
if (accessor->write_chunk == NULL)
{
diff --git a/src/bin/pg_combinebackup/copy_file.c b/src/bin/pg_combinebackup/copy_file.c
index dd3c0dc1c89..0287d6e87df 100644
--- a/src/bin/pg_combinebackup/copy_file.c
+++ b/src/bin/pg_combinebackup/copy_file.c
@@ -210,7 +210,7 @@ copy_file_blocks(const char *src, const char *dst,
}
if (rb < 0)
- pg_fatal("could not read from file \"%s\": %m", dst);
+ pg_fatal("could not read from file \"%s\": %m", src);
pg_free(buffer);
close(src_fd);
diff --git a/src/bin/pg_combinebackup/pg_combinebackup.c b/src/bin/pg_combinebackup/pg_combinebackup.c
index 918b8b35646..b9f26ce782e 100644
--- a/src/bin/pg_combinebackup/pg_combinebackup.c
+++ b/src/bin/pg_combinebackup/pg_combinebackup.c
@@ -501,7 +501,7 @@ add_tablespace_mapping(cb_options *opt, char *arg)
tsmap->old_dir);
if (!is_absolute_path(tsmap->new_dir))
- pg_fatal("old directory is not an absolute path in tablespace mapping: %s",
+ pg_fatal("new directory is not an absolute path in tablespace mapping: %s",
tsmap->new_dir);
/* Canonicalize paths to avoid spurious failures when comparing. */
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index 2bebefd0ba2..2c3754d020f 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -214,12 +214,6 @@ static int nbinaryUpgradeClassOids = 0;
static SequenceItem *sequences = NULL;
static int nsequences = 0;
-/*
- * For binary upgrade, the dump ID of pg_largeobject_metadata is saved for use
- * as a dependency for pg_shdepend and any large object comments/seclabels.
- */
-static DumpId lo_metadata_dumpId;
-
/* Maximum number of relations to fetch in a fetchAttributeStats() call. */
#define MAX_ATTR_STATS_RELS 64
@@ -1121,27 +1115,20 @@ main(int argc, char **argv)
getTableData(&dopt, tblinfo, numTables, RELKIND_SEQUENCE);
/*
- * For binary upgrade mode, dump pg_largeobject_metadata and the
- * associated pg_shdepend rows. This is faster to restore than the
- * equivalent set of large object commands. We can only do this for
- * upgrades from v12 and newer; in older versions, pg_largeobject_metadata
- * was created WITH OIDS, so the OID column is hidden and won't be dumped.
+ * For binary upgrade mode, dump the pg_shdepend rows for large objects
+ * and maybe even pg_largeobject_metadata (see comment below for details).
+ * This is faster to restore than the equivalent set of large object
+ * commands. We can only do this for upgrades from v12 and newer; in
+ * older versions, pg_largeobject_metadata was created WITH OIDS, so the
+ * OID column is hidden and won't be dumped.
*/
if (dopt.binary_upgrade && fout->remoteVersion >= 120000)
{
- TableInfo *lo_metadata = findTableByOid(LargeObjectMetadataRelationId);
- TableInfo *shdepend = findTableByOid(SharedDependRelationId);
+ TableInfo *shdepend;
- makeTableDataInfo(&dopt, lo_metadata);
+ shdepend = findTableByOid(SharedDependRelationId);
makeTableDataInfo(&dopt, shdepend);
- /*
- * Save pg_largeobject_metadata's dump ID for use as a dependency for
- * pg_shdepend and any large object comments/seclabels.
- */
- lo_metadata_dumpId = lo_metadata->dataObj->dobj.dumpId;
- addObjectDependency(&shdepend->dataObj->dobj, lo_metadata_dumpId);
-
/*
* Only dump large object shdepend rows for this database.
*/
@@ -1150,21 +1137,19 @@ main(int argc, char **argv)
" WHERE datname = current_database())";
/*
- * If upgrading from v16 or newer, only dump large objects with
- * comments/seclabels. For these upgrades, pg_upgrade can copy/link
- * pg_largeobject_metadata's files (which is usually faster) but we
- * still need to dump LOs with comments/seclabels here so that the
- * subsequent COMMENT and SECURITY LABEL commands work. pg_upgrade
- * can't copy/link the files from older versions because aclitem
- * (needed by pg_largeobject_metadata.lomacl) changed its storage
- * format in v16.
+ * For binary upgrades from v16 and newer versions, we can copy
+ * pg_largeobject_metadata's files from the old cluster, so we don't
+ * need to dump its contents. pg_upgrade can't copy/link the files
+ * from older versions because aclitem (needed by
+ * pg_largeobject_metadata.lomacl) changed its storage format in v16.
*/
- if (fout->remoteVersion >= 160000)
- lo_metadata->dataObj->filtercond = "WHERE oid IN "
- "(SELECT objoid FROM pg_description "
- "WHERE classoid = " CppAsString2(LargeObjectRelationId) " "
- "UNION SELECT objoid FROM pg_seclabel "
- "WHERE classoid = " CppAsString2(LargeObjectRelationId) ")";
+ if (fout->remoteVersion < 160000)
+ {
+ TableInfo *lo_metadata;
+
+ lo_metadata = findTableByOid(LargeObjectMetadataRelationId);
+ makeTableDataInfo(&dopt, lo_metadata);
+ }
}
/*
@@ -3979,7 +3964,25 @@ getLOs(Archive *fout)
appendPQExpBufferStr(loQry,
"SELECT oid, lomowner, lomacl, "
"acldefault('L', lomowner) AS acldefault "
- "FROM pg_largeobject_metadata "
+ "FROM pg_largeobject_metadata ");
+
+ /*
+ * For binary upgrades from v12 or newer, we transfer
+ * pg_largeobject_metadata via COPY or by copying/linking its files from
+ * the old cluster. On such upgrades, we only need to consider large
+ * objects that have comments or security labels, since we still restore
+ * those objects via COMMENT/SECURITY LABEL commands.
+ */
+ if (dopt->binary_upgrade &&
+ fout->remoteVersion >= 120000)
+ appendPQExpBufferStr(loQry,
+ "WHERE oid IN "
+ "(SELECT objoid FROM pg_description "
+ "WHERE classoid = " CppAsString2(LargeObjectRelationId) " "
+ "UNION SELECT objoid FROM pg_seclabel "
+ "WHERE classoid = " CppAsString2(LargeObjectRelationId) ") ");
+
+ appendPQExpBufferStr(loQry,
"ORDER BY lomowner, lomacl::pg_catalog.text, oid");
res = ExecuteSqlQuery(fout, loQry->data, PGRES_TUPLES_OK);
@@ -4062,36 +4065,20 @@ getLOs(Archive *fout)
/*
* In binary-upgrade mode for LOs, we do *not* dump out the LO data,
* as it will be copied by pg_upgrade, which simply copies the
- * pg_largeobject table. We *do* however dump out anything but the
- * data, as pg_upgrade copies just pg_largeobject, but not
- * pg_largeobject_metadata, after the dump is restored. In versions
- * before v12, this is done via proper large object commands. In
- * newer versions, we dump the content of pg_largeobject_metadata and
- * any associated pg_shdepend rows, which is faster to restore. (On
- * binary_upgrade)
+ * pg_largeobject table. For upgrades from v12 and newer, we skip the
+ * metadata, ACLs, and definition as well, since pg_largeobject_metadata
+ * and its pg_shdepend rows are transferred separately (see the comments
+ * in main() and getLOs()). For older versions, we still dump everything
+ * but the data, via proper large object commands.
+ */
{
if (fout->remoteVersion >= 120000)
- {
- /*
- * We should've saved pg_largeobject_metadata's dump ID before
- * this point.
- */
- Assert(lo_metadata_dumpId);
-
loinfo->dobj.dump &= ~(DUMP_COMPONENT_DATA | DUMP_COMPONENT_ACL | DUMP_COMPONENT_DEFINITION);
-
- /*
- * Mark the large object as dependent on
- * pg_largeobject_metadata so that any large object
- * comments/seclables are dumped after it.
- */
- loinfo->dobj.dependencies = (DumpId *) pg_malloc(sizeof(DumpId));
- loinfo->dobj.dependencies[0] = lo_metadata_dumpId;
- loinfo->dobj.nDeps = loinfo->dobj.allocDeps = 1;
- }
else
loinfo->dobj.dump &= ~DUMP_COMPONENT_DATA;
}
diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c
index b2c4b9db395..85dc43d4cdb 100644
--- a/src/bin/pg_resetwal/pg_resetwal.c
+++ b/src/bin/pg_resetwal/pg_resetwal.c
@@ -913,10 +913,10 @@ RewriteControlFile(void)
ControlFile.state = DB_SHUTDOWNED;
ControlFile.checkPoint = ControlFile.checkPointCopy.redo;
- ControlFile.minRecoveryPoint = 0;
+ ControlFile.minRecoveryPoint = InvalidXLogRecPtr;
ControlFile.minRecoveryPointTLI = 0;
- ControlFile.backupStartPoint = 0;
- ControlFile.backupEndPoint = 0;
+ ControlFile.backupStartPoint = InvalidXLogRecPtr;
+ ControlFile.backupEndPoint = InvalidXLogRecPtr;
ControlFile.backupEndRequired = false;
/*
@@ -1077,6 +1077,8 @@ KillExistingArchiveStatus(void)
if (closedir(xldir))
pg_fatal("could not close directory \"%s\": %m", ARCHSTATDIR);
+
+#undef ARCHSTATDIR
}
/*
@@ -1111,7 +1113,10 @@ KillExistingWALSummaries(void)
pg_fatal("could not read directory \"%s\": %m", WALSUMMARYDIR);
if (closedir(xldir))
- pg_fatal("could not close directory \"%s\": %m", ARCHSTATDIR);
+ pg_fatal("could not close directory \"%s\": %m", WALSUMMARYDIR);
+
+#undef WALSUMMARY_NHEXCHARS
+#undef WALSUMMARYDIR
}
/*
@@ -1147,7 +1152,7 @@ WriteEmptyXLOG(void)
/* Insert the initial checkpoint record */
recptr = (char *) page + SizeOfXLogLongPHD;
record = (XLogRecord *) recptr;
- record->xl_prev = 0;
+ record->xl_prev = InvalidXLogRecPtr;
record->xl_xid = InvalidTransactionId;
record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint);
record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
diff --git a/src/bin/pg_rewind/file_ops.c b/src/bin/pg_rewind/file_ops.c
index 356e23a3080..5cfb676f41f 100644
--- a/src/bin/pg_rewind/file_ops.c
+++ b/src/bin/pg_rewind/file_ops.c
@@ -327,7 +327,7 @@ slurpFile(const char *datadir, const char *path, size_t *filesize)
fullpath);
if (fstat(fd, &statbuf) < 0)
- pg_fatal("could not open file \"%s\" for reading: %m",
+ pg_fatal("could not stat file \"%s\": %m",
fullpath);
len = statbuf.st_size;
diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c
index 31693843b3c..d0aafd7e7a6 100644
--- a/src/bin/pg_rewind/pg_rewind.c
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -377,7 +377,7 @@ main(int argc, char **argv)
{
pg_log_info("source and target cluster are on the same timeline");
rewind_needed = false;
- target_wal_endrec = 0;
+ target_wal_endrec = InvalidXLogRecPtr;
}
else
{
diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c
index a8d20a92a98..5c73773bf0e 100644
--- a/src/bin/pg_upgrade/check.c
+++ b/src/bin/pg_upgrade/check.c
@@ -622,7 +622,7 @@ check_and_dump_old_cluster(void)
{
/*
* Logical replication slots can be migrated since PG17. See comments
- * atop get_old_cluster_logical_slot_infos().
+ * in get_db_rel_and_slot_infos().
*/
check_old_cluster_for_valid_slots();
diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c
index 47e8d1039a2..ad4b1530e6d 100644
--- a/src/bin/pg_upgrade/info.c
+++ b/src/bin/pg_upgrade/info.c
@@ -29,7 +29,7 @@ static void free_rel_infos(RelInfoArr *rel_arr);
static void print_db_infos(DbInfoArr *db_arr);
static void print_rel_infos(RelInfoArr *rel_arr);
static void print_slot_infos(LogicalSlotInfoArr *slot_arr);
-static char *get_old_cluster_logical_slot_infos_query(void);
+static const char *get_old_cluster_logical_slot_infos_query(ClusterInfo *cluster);
static void process_old_cluster_logical_slot_infos(DbInfo *dbinfo, PGresult *res, void *arg);
@@ -281,7 +281,6 @@ get_db_rel_and_slot_infos(ClusterInfo *cluster)
{
UpgradeTask *task = upgrade_task_create();
char *rel_infos_query = NULL;
- char *logical_slot_infos_query = NULL;
if (cluster->dbarr.dbs != NULL)
free_db_and_rel_infos(&cluster->dbarr);
@@ -306,20 +305,15 @@ get_db_rel_and_slot_infos(ClusterInfo *cluster)
*/
if (cluster == &old_cluster &&
GET_MAJOR_VERSION(cluster->major_version) > 1600)
- {
- logical_slot_infos_query = get_old_cluster_logical_slot_infos_query();
upgrade_task_add_step(task,
- logical_slot_infos_query,
+ get_old_cluster_logical_slot_infos_query(cluster),
process_old_cluster_logical_slot_infos,
true, NULL);
- }
upgrade_task_run(task, cluster);
upgrade_task_free(task);
pg_free(rel_infos_query);
- if (logical_slot_infos_query)
- pg_free(logical_slot_infos_query);
if (cluster == &old_cluster)
pg_log(PG_VERBOSE, "\nsource databases:");
@@ -681,17 +675,15 @@ process_rel_infos(DbInfo *dbinfo, PGresult *res, void *arg)
* get_db_rel_and_slot_infos()'s UpgradeTask. The status of each logical slot
* is checked in check_old_cluster_for_valid_slots().
*/
-static char *
-get_old_cluster_logical_slot_infos_query(void)
+static const char *
+get_old_cluster_logical_slot_infos_query(ClusterInfo *cluster)
{
/*
* Fetch the logical replication slot information. The check whether the
* slot is considered caught up is done by an upgrade function. This
* regards the slot as caught up if we don't find any decodable changes.
- * See binary_upgrade_logical_slot_has_caught_up().
- *
- * Note that we can't ensure whether the slot is caught up during
- * live_check as the new WAL records could be generated.
+ * The implementation of this check varies depending on the server
+ * version.
*
* We intentionally skip checking the WALs for invalidated slots as the
* corresponding WALs could have been removed for such slots.
@@ -701,21 +693,80 @@ get_old_cluster_logical_slot_infos_query(void)
* started and stopped several times causing any temporary slots to be
* removed.
*/
- return psprintf("SELECT slot_name, plugin, two_phase, failover, "
- "%s as caught_up, invalidation_reason IS NOT NULL as invalid "
- "FROM pg_catalog.pg_replication_slots "
- "WHERE slot_type = 'logical' AND "
- "database = current_database() AND "
- "temporary IS FALSE;",
- user_opts.live_check ? "FALSE" :
- "(CASE WHEN invalidation_reason IS NOT NULL THEN FALSE "
- "ELSE (SELECT pg_catalog.binary_upgrade_logical_slot_has_caught_up(slot_name)) "
- "END)");
+
+ if (user_opts.live_check)
+ {
+ /*
+ * We skip the caught-up check during live_check. We cannot verify
+ * whether the slot is caught up in this mode, as new WAL records
+ * could be generated concurrently.
+ */
+ return "SELECT slot_name, plugin, two_phase, failover, "
+ "FALSE as caught_up, "
+ "invalidation_reason IS NOT NULL as invalid "
+ "FROM pg_catalog.pg_replication_slots "
+ "WHERE slot_type = 'logical' AND "
+ "database = current_database() AND "
+ "temporary IS FALSE";
+ }
+ else if (GET_MAJOR_VERSION(cluster->major_version) >= 1900)
+ {
+ /*
+ * For PG19 and later, we optimize the slot caught-up check to avoid
+ * reading the same WAL stream multiple times: execute the caught-up
+ * check only for the slot with the minimum confirmed_flush_lsn, and
+ * apply the same result to all other slots in the same database. This
+ * limits the check to at most one logical slot per database. We also
+ * use the maximum confirmed_flush_lsn among all logical slots on the
+ * database as an early scan cutoff; finding a decodable WAL record
+ * beyond this point implies that no slot has caught up.
+ *
+ * Note that we don't distinguish slots based on their output plugin.
+ * If a plugin applies replication origin filters, we might get a
+ * false positive (i.e., erroneously considering a slot caught up).
+ * However, such cases are very rare, and the impact of a false
+ * positive is minimal.
+ */
+ return "WITH check_caught_up AS ( "
+ " SELECT pg_catalog.binary_upgrade_check_logical_slot_pending_wal(slot_name, "
+ " MAX(confirmed_flush_lsn) OVER ()) as last_pending_wal "
+ " FROM pg_replication_slots "
+ " WHERE slot_type = 'logical' AND "
+ " database = current_database() AND "
+ " temporary IS FALSE AND "
+ " invalidation_reason IS NULL "
+ " ORDER BY confirmed_flush_lsn ASC "
+ " LIMIT 1 "
+ ") "
+ "SELECT slot_name, plugin, two_phase, failover, "
+ "CASE WHEN invalidation_reason IS NOT NULL THEN FALSE "
+ "ELSE last_pending_wal IS NULL OR "
+ " confirmed_flush_lsn > last_pending_wal "
+ "END as caught_up, "
+ "invalidation_reason IS NOT NULL as invalid "
+ "FROM pg_catalog.pg_replication_slots, check_caught_up "
+ "WHERE slot_type = 'logical' AND "
+ "database = current_database() AND "
+ "temporary IS FALSE ";
+ }
+
+ /*
+ * For PG18 and earlier, we call
+ * binary_upgrade_logical_slot_has_caught_up() for each logical slot.
+ */
+ return "SELECT slot_name, plugin, two_phase, failover, "
+ "CASE WHEN invalidation_reason IS NOT NULL THEN FALSE "
+ "ELSE (SELECT pg_catalog.binary_upgrade_logical_slot_has_caught_up(slot_name)) "
+ "END as caught_up, "
+ "invalidation_reason IS NOT NULL as invalid "
+ "FROM pg_catalog.pg_replication_slots "
+ "WHERE slot_type = 'logical' AND "
+ "database = current_database() AND "
+ "temporary IS FALSE ";
}
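/*
 * A worked example of the PG19 path, with hypothetical LSNs: given slots s1
 * (confirmed_flush_lsn 0/100) and s2 (0/200) in the same database, the CTE
 * runs the pending-WAL check exactly once, for s1, with cutoff 0/200.  If the
 * last decodable record found is at 0/150, s2 is caught up (0/200 > 0/150)
 * while s1 is not (0/100 <= 0/150); if no decodable record is found at all
 * (last_pending_wal IS NULL), every slot is caught up.
 */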
/*
- * Callback function for processing results of the query returned by
- * get_old_cluster_logical_slot_infos_query(), which is used for
+ * Callback function for processing results of the query, which is used for
* get_db_rel_and_slot_infos()'s UpgradeTask. This function stores the logical
* slot information for later use.
*/
@@ -768,7 +819,7 @@ process_old_cluster_logical_slot_infos(DbInfo *dbinfo, PGresult *res, void *arg)
*
* Note: this function always returns 0 if the old_cluster is PG16 and prior
* because we gather slot information only for cluster versions greater than or
- * equal to PG17. See get_old_cluster_logical_slot_infos().
+ * equal to PG17. See get_db_rel_and_slot_infos().
*/
int
count_old_cluster_logical_slots(void)
diff --git a/src/bin/pg_upgrade/t/003_logical_slots.pl b/src/bin/pg_upgrade/t/003_logical_slots.pl
index b9abc3a2e21..15e6d267f2f 100644
--- a/src/bin/pg_upgrade/t/003_logical_slots.pl
+++ b/src/bin/pg_upgrade/t/003_logical_slots.pl
@@ -64,6 +64,7 @@
'postgres', qq[
SELECT pg_create_logical_replication_slot('test_slot1', 'test_decoding');
SELECT pg_create_logical_replication_slot('test_slot2', 'test_decoding');
+ SELECT pg_create_logical_replication_slot('test_slot3', 'test_decoding');
]);
$oldpub->stop();
@@ -77,7 +78,7 @@
[@pg_upgrade_cmd],
1,
[
- qr/"max_replication_slots" \(1\) must be greater than or equal to the number of logical replication slots \(2\) on the old cluster/
+ qr/"max_replication_slots" \(1\) must be greater than or equal to the number of logical replication slots \(3\) on the old cluster/
],
[qr//],
'run of pg_upgrade where the new cluster has insufficient "max_replication_slots"'
@@ -85,29 +86,31 @@
ok(-d $newpub->data_dir . "/pg_upgrade_output.d",
"pg_upgrade_output.d/ not removed after pg_upgrade failure");
-# Set 'max_replication_slots' to match the number of slots (2) present on the
+# Set 'max_replication_slots' to match the number of slots (3) present on the
-# old cluster. Both slots will be used for subsequent tests.
+# old cluster. All three slots will be used for subsequent tests.
-$newpub->append_conf('postgresql.conf', "max_replication_slots = 2");
+$newpub->append_conf('postgresql.conf', "max_replication_slots = 3");
# ------------------------------
# TEST: Confirm pg_upgrade fails when the slot still has unconsumed WAL records
# Preparations for the subsequent test:
-# 1. Generate extra WAL records. At this point neither test_slot1 nor
-# test_slot2 has consumed them.
+# 1. Generate extra WAL records. At this point none of the slots has consumed them.
#
# 2. Advance the slot test_slot2 up to the current WAL location, but test_slot1
# still has unconsumed WAL records.
#
# 3. Emit a non-transactional message. This will cause test_slot2 to detect the
# unconsumed WAL record.
+#
+# 4. Advance the slot test_slot3 up to the current WAL location.
$oldpub->start;
$oldpub->safe_psql(
'postgres', qq[
CREATE TABLE tbl AS SELECT generate_series(1, 10) AS a;
SELECT pg_replication_slot_advance('test_slot2', pg_current_wal_lsn());
- SELECT count(*) FROM pg_logical_emit_message('false', 'prefix', 'This is a non-transactional message');
+ SELECT count(*) FROM pg_logical_emit_message('false', 'prefix', 'This is a non-transactional message', true);
+ SELECT pg_replication_slot_advance('test_slot3', pg_current_wal_lsn());
]);
$oldpub->stop;
@@ -138,8 +141,9 @@
},
$newpub->data_dir . "/pg_upgrade_output.d");
-# Check the file content. Both slots should be reporting that they have
-# unconsumed WAL records.
+# Check the file content. Both test_slot1 and test_slot2 should be reported
+# as having unconsumed WAL records, while test_slot3 should not appear at all
+# because it has caught up.
like(
slurp_file($slots_filename),
qr/The slot \"test_slot1\" has not consumed the WAL yet/m,
@@ -148,6 +152,10 @@
slurp_file($slots_filename),
qr/The slot \"test_slot2\" has not consumed the WAL yet/m,
'the previous test failed due to unconsumed WALs');
+unlike(
+ slurp_file($slots_filename),
+ qr/test_slot3/m,
+ 'caught-up slot is not reported');
# ------------------------------
@@ -162,6 +170,7 @@
'postgres', qq[
SELECT * FROM pg_drop_replication_slot('test_slot1');
SELECT * FROM pg_drop_replication_slot('test_slot2');
+ SELECT * FROM pg_drop_replication_slot('test_slot3');
CREATE PUBLICATION regress_pub FOR ALL TABLES;
]);
diff --git a/src/bin/psql/prompt.c b/src/bin/psql/prompt.c
index 891cd6374f0..9725d53dfe7 100644
--- a/src/bin/psql/prompt.c
+++ b/src/bin/psql/prompt.c
@@ -44,6 +44,8 @@
* or a ! if session is not connected to a database;
* in prompt2 -, *, ', or ";
* in prompt3 nothing
+ * %i - "standby" or "primary" depending on the server's in_hot_standby
+ * status, or "?" if unavailable (empty if unknown)
* %x - transaction status: empty, *, !, ? (unknown or no connection)
* %l - The line number inside the current statement, starting from 1.
* %? - the error code of the last query (not yet implemented)
@@ -258,7 +260,23 @@ get_prompt(promptStatus_t status, ConditionalStack cstack)
break;
}
break;
+ case 'i':
+ if (pset.db)
+ {
+ const char *hs = PQparameterStatus(pset.db, "in_hot_standby");
+
+ if (hs)
+ {
+ if (strcmp(hs, "on") == 0)
+ strlcpy(buf, "standby", sizeof(buf));
+ else
+ strlcpy(buf, "primary", sizeof(buf));
+ }
+ else
+ {
+ /* use ? for server versions that don't report in_hot_standby */
+ buf[0] = '?';
+ }
+ }
+ break;
case 'x':
if (!pset.db)
buf[0] = '?';
diff --git a/src/bin/psql/t/030_pager.pl b/src/bin/psql/t/030_pager.pl
index cf81fb1603c..a35f2b26293 100644
--- a/src/bin/psql/t/030_pager.pl
+++ b/src/bin/psql/t/030_pager.pl
@@ -40,6 +40,36 @@
$node->init;
$node->start;
+# create a view we'll use below
+$node->safe_psql(
+ 'postgres', 'create view public.view_030_pager as select
+1 as a,
+2 as b,
+3 as c,
+4 as d,
+5 as e,
+6 as f,
+7 as g,
+8 as h,
+9 as i,
+10 as j,
+11 as k,
+12 as l,
+13 as m,
+14 as n,
+15 as o,
+16 as p,
+17 as q,
+18 as r,
+19 as s,
+20 as t,
+21 as u,
+22 as v,
+23 as w,
+24 as x,
+25 as y,
+26 as z');
+
# fire up an interactive psql session
my $h = $node->interactive_psql('postgres');
@@ -77,25 +107,28 @@ sub do_command
#
# Note that interactive_psql starts psql with --no-align --tuples-only,
# and that the output string will include psql's prompts and command echo.
+# So we have to test for patterns that can't match the command itself,
+# and we can't assume the match will extend across a whole line (there
+# might be a prompt ahead of it in the output).
do_command(
"SELECT 'test' AS t FROM generate_series(1,23);\n",
- qr/^test\r?$/m,
+ qr/test\r?$/m,
"execute SELECT query that needs no pagination");
do_command(
"SELECT 'test' AS t FROM generate_series(1,24);\n",
- qr/^ *24\r?$/m,
+ qr/24\r?$/m,
"execute SELECT query that needs pagination");
do_command(
"\\pset expanded\nSELECT generate_series(1,20) as g;\n",
- qr/^ *39\r?$/m,
+ qr/39\r?$/m,
"execute SELECT query that needs pagination in expanded mode");
do_command(
- "\\pset tuples_only off\n\\d+ information_schema.referential_constraints\n",
- qr/^ *\d+\r?$/m,
+ "\\pset tuples_only off\n\\d+ public.view_030_pager\n",
+ qr/55\r?$/m,
"execute command with footer that needs pagination");
# send psql an explicit \q to shut it down, else pty won't close properly
diff --git a/src/common/wchar.c b/src/common/wchar.c
index 5631e2c9363..eb15ee59497 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -63,6 +63,9 @@
* subset to the ASCII routines to ensure consistency.
*/
+/* No error-reporting facility. Ignore incomplete trailing byte sequence. */
+#define MB2CHAR_NEED_AT_LEAST(len, need) if ((len) < (need)) break
+
/*
* SQL/ASCII
*/
@@ -108,22 +111,24 @@ pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
while (len > 0 && *from)
{
- if (*from == SS2 && len >= 2) /* JIS X 0201 (so called "1 byte
- * KANA") */
+ if (*from == SS2) /* JIS X 0201 (so called "1 byte KANA") */
{
+ MB2CHAR_NEED_AT_LEAST(len, 2);
from++;
*to = (SS2 << 8) | *from++;
len -= 2;
}
- else if (*from == SS3 && len >= 3) /* JIS X 0212 KANJI */
+ else if (*from == SS3) /* JIS X 0212 KANJI */
{
+ MB2CHAR_NEED_AT_LEAST(len, 3);
from++;
*to = (SS3 << 16) | (*from++ << 8);
*to |= *from++;
len -= 3;
}
- else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
+ else if (IS_HIGHBIT_SET(*from)) /* JIS X 0208 KANJI */
{
+ MB2CHAR_NEED_AT_LEAST(len, 2);
*to = *from++ << 8;
*to |= *from++;
len -= 2;
@@ -235,22 +240,25 @@ pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
while (len > 0 && *from)
{
- if (*from == SS2 && len >= 3) /* code set 2 (unused?) */
+ if (*from == SS2) /* code set 2 (unused?) */
{
+ MB2CHAR_NEED_AT_LEAST(len, 3);
from++;
*to = (SS2 << 16) | (*from++ << 8);
*to |= *from++;
len -= 3;
}
- else if (*from == SS3 && len >= 3) /* code set 3 (unused ?) */
+ else if (*from == SS3) /* code set 3 (unused ?) */
{
+ MB2CHAR_NEED_AT_LEAST(len, 3);
from++;
*to = (SS3 << 16) | (*from++ << 8);
*to |= *from++;
len -= 3;
}
- else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
+ else if (IS_HIGHBIT_SET(*from)) /* code set 1 */
{
+ MB2CHAR_NEED_AT_LEAST(len, 2);
*to = *from++ << 8;
*to |= *from++;
len -= 2;
@@ -267,12 +275,22 @@ pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
return cnt;
}
+/*
+ * mbverifychar does not accept SS2 or SS3 (CS2 and CS3 are not defined for
+ * EUC_CN), but mb2wchar_with_len does. Report lengths that agree with
+ * mb2wchar_with_len, for the benefit of code that relies on agreement
+ * between mb2wchar_with_len and mblen: invalid text datums (e.g. from
+ * shared catalogs) can reach this function.
+ */
static int
pg_euccn_mblen(const unsigned char *s)
{
int len;
- if (IS_HIGHBIT_SET(*s))
+ if (*s == SS2 || *s == SS3)
+ len = 3;
+ else if (IS_HIGHBIT_SET(*s))
len = 2;
else
len = 1;
@@ -302,23 +320,26 @@ pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
while (len > 0 && *from)
{
- if (*from == SS2 && len >= 4) /* code set 2 */
+ if (*from == SS2) /* code set 2 */
{
+ MB2CHAR_NEED_AT_LEAST(len, 4);
from++;
*to = (((uint32) SS2) << 24) | (*from++ << 16);
*to |= *from++ << 8;
*to |= *from++;
len -= 4;
}
- else if (*from == SS3 && len >= 3) /* code set 3 (unused?) */
+ else if (*from == SS3) /* code set 3 (unused?) */
{
+ MB2CHAR_NEED_AT_LEAST(len, 3);
from++;
*to = (SS3 << 16) | (*from++ << 8);
*to |= *from++;
len -= 3;
}
- else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 2 */
+ else if (IS_HIGHBIT_SET(*from)) /* code set 1 */
{
+ MB2CHAR_NEED_AT_LEAST(len, 2);
*to = *from++ << 8;
*to |= *from++;
len -= 2;
@@ -455,8 +476,7 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
}
else if ((*from & 0xe0) == 0xc0)
{
- if (len < 2)
- break; /* drop trailing incomplete char */
+ MB2CHAR_NEED_AT_LEAST(len, 2);
c1 = *from++ & 0x1f;
c2 = *from++ & 0x3f;
*to = (c1 << 6) | c2;
@@ -464,8 +484,7 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
}
else if ((*from & 0xf0) == 0xe0)
{
- if (len < 3)
- break; /* drop trailing incomplete char */
+ MB2CHAR_NEED_AT_LEAST(len, 3);
c1 = *from++ & 0x0f;
c2 = *from++ & 0x3f;
c3 = *from++ & 0x3f;
@@ -474,8 +493,7 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
}
else if ((*from & 0xf8) == 0xf0)
{
- if (len < 4)
- break; /* drop trailing incomplete char */
+ MB2CHAR_NEED_AT_LEAST(len, 4);
c1 = *from++ & 0x07;
c2 = *from++ & 0x3f;
c3 = *from++ & 0x3f;
@@ -677,28 +695,32 @@ pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
while (len > 0 && *from)
{
- if (IS_LC1(*from) && len >= 2)
+ if (IS_LC1(*from))
{
+ MB2CHAR_NEED_AT_LEAST(len, 2);
*to = *from++ << 16;
*to |= *from++;
len -= 2;
}
- else if (IS_LCPRV1(*from) && len >= 3)
+ else if (IS_LCPRV1(*from))
{
+ MB2CHAR_NEED_AT_LEAST(len, 3);
from++;
*to = *from++ << 16;
*to |= *from++;
len -= 3;
}
- else if (IS_LC2(*from) && len >= 3)
+ else if (IS_LC2(*from))
{
+ MB2CHAR_NEED_AT_LEAST(len, 3);
*to = *from++ << 16;
*to |= *from++ << 8;
*to |= *from++;
len -= 3;
}
- else if (IS_LCPRV2(*from) && len >= 4)
+ else if (IS_LCPRV2(*from))
{
+ MB2CHAR_NEED_AT_LEAST(len, 4);
from++;
*to = *from++ << 16;
*to |= *from++ << 8;
@@ -2064,7 +2086,7 @@ pg_encoding_set_invalid(int encoding, char *dst)
const pg_wchar_tbl pg_wchar_table[] = {
[PG_SQL_ASCII] = {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1},
[PG_EUC_JP] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
- [PG_EUC_CN] = {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 2},
+ [PG_EUC_CN] = {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 3},
[PG_EUC_KR] = {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3},
[PG_EUC_TW] = {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4},
[PG_EUC_JIS_2004] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
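A note on the MB2CHAR_NEED_AT_LEAST macro used throughout these conversion
loops: it deliberately omits the customary do { } while (0) wrapper, because
the break must reach the enclosing while loop. That also means it only works
as a standalone statement, never as the sole body of an if/else. A sketch of
what one call site expands to:

	/* MB2CHAR_NEED_AT_LEAST(len, 2); expands to: */
	if ((len) < (2))
		break;			/* leave the conversion loop, silently dropping
						 * the incomplete trailing byte sequence */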
diff --git a/src/include/Makefile b/src/include/Makefile
index 4ef060e9050..ac673f4cf17 100644
--- a/src/include/Makefile
+++ b/src/include/Makefile
@@ -105,6 +105,7 @@ uninstall:
clean:
rm -f utils/fmgroids.h utils/fmgrprotos.h utils/guc_tables.inc.c utils/errcodes.h utils/header-stamp
+ rm -f utils/pgstat_wait_event.c utils/wait_event_funcs_data.c
rm -f storage/lwlocknames.h utils/probes.h utils/wait_event_types.h
rm -f nodes/nodetags.h nodes/header-stamp
$(MAKE) -C catalog clean
diff --git a/src/include/access/htup_details.h b/src/include/access/htup_details.h
index d406825ff22..75f8b159b8a 100644
--- a/src/include/access/htup_details.h
+++ b/src/include/access/htup_details.h
@@ -357,20 +357,6 @@ HeapTupleHeaderXminFrozen(const HeapTupleHeaderData *tup)
return (tup->t_infomask & HEAP_XMIN_FROZEN) == HEAP_XMIN_FROZEN;
}
-static inline void
-HeapTupleHeaderSetXminCommitted(HeapTupleHeaderData *tup)
-{
- Assert(!HeapTupleHeaderXminInvalid(tup));
- tup->t_infomask |= HEAP_XMIN_COMMITTED;
-}
-
-static inline void
-HeapTupleHeaderSetXminInvalid(HeapTupleHeaderData *tup)
-{
- Assert(!HeapTupleHeaderXminCommitted(tup));
- tup->t_infomask |= HEAP_XMIN_INVALID;
-}
-
static inline void
HeapTupleHeaderSetXminFrozen(HeapTupleHeaderData *tup)
{
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index e2ec5289d4d..7260b7b3d52 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -868,6 +868,27 @@ extern TupleTableSlot *table_slot_create(Relation relation, List **reglist);
* ----------------------------------------------------------------------------
*/
+/*
+ * A wrapper around the Table Access Method scan_begin callback, to centralize
+ * error checking. All calls to ->scan_begin() should go through this
+ * function.
+ */
+static inline TableScanDesc
+table_beginscan_common(Relation rel, Snapshot snapshot, int nkeys,
+ ScanKeyData *key, ParallelTableScanDesc pscan,
+ uint32 flags)
+{
+ /*
+ * We don't allow scans to be started while CheckXidAlive is set, except
+ * via systable_beginscan() et al. See detailed comments in xact.c where
+ * these variables are declared.
+ */
+ if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
+ elog(ERROR, "scan started during logical decoding");
+
+ return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, pscan, flags);
+}
+
/*
* Start a scan of `rel`. Returned tuples pass a visibility test of
* `snapshot`, and if nkeys != 0, the results are filtered by those scan keys.
@@ -879,7 +900,7 @@ table_beginscan(Relation rel, Snapshot snapshot,
uint32 flags = SO_TYPE_SEQSCAN |
SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE;
- return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags);
+ return table_beginscan_common(rel, snapshot, nkeys, key, NULL, flags);
}
/*
@@ -908,7 +929,7 @@ table_beginscan_strat(Relation rel, Snapshot snapshot,
if (allow_sync)
flags |= SO_ALLOW_SYNC;
- return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags);
+ return table_beginscan_common(rel, snapshot, nkeys, key, NULL, flags);
}
/*
@@ -923,8 +944,7 @@ table_beginscan_bm(Relation rel, Snapshot snapshot,
{
uint32 flags = SO_TYPE_BITMAPSCAN | SO_ALLOW_PAGEMODE;
- return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key,
- NULL, flags);
+ return table_beginscan_common(rel, snapshot, nkeys, key, NULL, flags);
}
/*
@@ -949,7 +969,7 @@ table_beginscan_sampling(Relation rel, Snapshot snapshot,
if (allow_pagemode)
flags |= SO_ALLOW_PAGEMODE;
- return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags);
+ return table_beginscan_common(rel, snapshot, nkeys, key, NULL, flags);
}
/*
@@ -962,7 +982,7 @@ table_beginscan_tid(Relation rel, Snapshot snapshot)
{
uint32 flags = SO_TYPE_TIDSCAN;
- return rel->rd_tableam->scan_begin(rel, snapshot, 0, NULL, NULL, flags);
+ return table_beginscan_common(rel, snapshot, 0, NULL, NULL, flags);
}
/*
@@ -975,7 +995,7 @@ table_beginscan_analyze(Relation rel)
{
uint32 flags = SO_TYPE_ANALYZE;
- return rel->rd_tableam->scan_begin(rel, NULL, 0, NULL, NULL, flags);
+ return table_beginscan_common(rel, NULL, 0, NULL, NULL, flags);
}
/*
@@ -1025,14 +1045,6 @@ table_scan_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableS
Assert(direction == ForwardScanDirection ||
direction == BackwardScanDirection);
- /*
- * We don't expect direct calls to table_scan_getnextslot with valid
- * CheckXidAlive for catalog or regular tables. See detailed comments in
- * xact.c where these variables are declared.
- */
- if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
- elog(ERROR, "unexpected table_scan_getnextslot call during logical decoding");
-
return sscan->rs_rd->rd_tableam->scan_getnextslot(sscan, direction, slot);
}
@@ -1053,7 +1065,7 @@ table_beginscan_tidrange(Relation rel, Snapshot snapshot,
TableScanDesc sscan;
uint32 flags = SO_TYPE_TIDRANGESCAN | SO_ALLOW_PAGEMODE;
- sscan = rel->rd_tableam->scan_begin(rel, snapshot, 0, NULL, NULL, flags);
+ sscan = table_beginscan_common(rel, snapshot, 0, NULL, NULL, flags);
/* Set the range of TIDs to scan */
sscan->rs_rd->rd_tableam->scan_set_tidrange(sscan, mintid, maxtid);
@@ -1166,6 +1178,14 @@ table_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
static inline IndexFetchTableData *
table_index_fetch_begin(Relation rel)
{
+ /*
+ * We don't allow scans to be started while CheckXidAlive is set, except
+ * via systable_beginscan() et al. See detailed comments in xact.c where
+ * these variables are declared.
+ */
+ if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
+ elog(ERROR, "scan started during logical decoding");
+
return rel->rd_tableam->index_fetch_begin(rel);
}
@@ -1219,14 +1239,6 @@ table_index_fetch_tuple(struct IndexFetchTableData *scan,
TupleTableSlot *slot,
bool *call_again, bool *all_dead)
{
- /*
- * We don't expect direct calls to table_index_fetch_tuple with valid
- * CheckXidAlive for catalog or regular tables. See detailed comments in
- * xact.c where these variables are declared.
- */
- if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
- elog(ERROR, "unexpected table_index_fetch_tuple call during logical decoding");
-
return scan->rel->rd_tableam->index_fetch_tuple(scan, tid, snapshot,
slot, call_again,
all_dead);
@@ -1947,14 +1959,6 @@ table_scan_bitmap_next_tuple(TableScanDesc scan,
uint64 *lossy_pages,
uint64 *exact_pages)
{
- /*
- * We don't expect direct calls to table_scan_bitmap_next_tuple with valid
- * CheckXidAlive for catalog or regular tables. See detailed comments in
- * xact.c where these variables are declared.
- */
- if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
- elog(ERROR, "unexpected table_scan_bitmap_next_tuple call during logical decoding");
-
return scan->rs_rd->rd_tableam->scan_bitmap_next_tuple(scan,
slot,
recheck,
@@ -1975,13 +1979,6 @@ static inline bool
table_scan_sample_next_block(TableScanDesc scan,
SampleScanState *scanstate)
{
- /*
- * We don't expect direct calls to table_scan_sample_next_block with valid
- * CheckXidAlive for catalog or regular tables. See detailed comments in
- * xact.c where these variables are declared.
- */
- if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
- elog(ERROR, "unexpected table_scan_sample_next_block call during logical decoding");
return scan->rs_rd->rd_tableam->scan_sample_next_block(scan, scanstate);
}
@@ -1998,13 +1995,6 @@ table_scan_sample_next_tuple(TableScanDesc scan,
SampleScanState *scanstate,
TupleTableSlot *slot)
{
- /*
- * We don't expect direct calls to table_scan_sample_next_tuple with valid
- * CheckXidAlive for catalog or regular tables. See detailed comments in
- * xact.c where these variables are declared.
- */
- if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
- elog(ERROR, "unexpected table_scan_sample_next_tuple call during logical decoding");
return scan->rs_rd->rd_tableam->scan_sample_next_tuple(scan, scanstate,
slot);
}
diff --git a/src/include/access/tupmacs.h b/src/include/access/tupmacs.h
index 3e5530658c9..d64c18b950b 100644
--- a/src/include/access/tupmacs.h
+++ b/src/include/access/tupmacs.h
@@ -71,6 +71,43 @@ fetch_att(const void *T, bool attbyval, int attlen)
}
#endif /* FRONTEND */
+/*
+ * typalign_to_alignby: map a TYPALIGN_xxx value to the numeric alignment
+ * value it represents. (We store TYPALIGN_xxx codes not the real alignment
+ * values mainly so that initial catalog contents can be machine-independent.)
+ */
+static inline uint8
+typalign_to_alignby(char typalign)
+{
+ uint8 alignby;
+
+ switch (typalign)
+ {
+ case TYPALIGN_CHAR:
+ alignby = sizeof(char);
+ break;
+ case TYPALIGN_SHORT:
+ alignby = ALIGNOF_SHORT;
+ break;
+ case TYPALIGN_INT:
+ alignby = ALIGNOF_INT;
+ break;
+ case TYPALIGN_DOUBLE:
+ alignby = ALIGNOF_DOUBLE;
+ break;
+ default:
+#ifndef FRONTEND
+ elog(ERROR, "invalid typalign value: %c", typalign);
+#else
+ fprintf(stderr, "invalid typalign value: %c\n", typalign);
+ exit(1);
+#endif
+ alignby = 0;
+ break;
+ }
+ return alignby;
+}
+
/*
* att_align_datum aligns the given offset as needed for a datum of alignment
* requirement attalign and typlen attlen. attdatum is the Datum variable
@@ -139,19 +176,11 @@ fetch_att(const void *T, bool attbyval, int attlen)
* * within arrays and multiranges, we unconditionally align varlenas (XXX this
* should be revisited, probably).
*
- * The attalign cases are tested in what is hopefully something like their
- * frequency of occurrence.
+ * In performance-critical loops, avoid using this macro; instead use
+ * att_nominal_alignby with a pre-computed alignby value.
*/
#define att_align_nominal(cur_offset, attalign) \
-( \
- ((attalign) == TYPALIGN_INT) ? INTALIGN(cur_offset) : \
- (((attalign) == TYPALIGN_CHAR) ? (uintptr_t) (cur_offset) : \
- (((attalign) == TYPALIGN_DOUBLE) ? DOUBLEALIGN(cur_offset) : \
- ( \
- AssertMacro((attalign) == TYPALIGN_SHORT), \
- SHORTALIGN(cur_offset) \
- ))) \
-)
+ att_nominal_alignby(cur_offset, typalign_to_alignby(attalign))
/*
* Similar to att_align_nominal, but accepts a number of bytes, typically from
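To make the new guidance concrete, here is a minimal sketch of hoisting the
mapping out of a hot loop; the loop variables (off, nitems, sizes) are
hypothetical, and att_nominal_alignby is the companion macro referenced in
the comment above:

	uint8		alignby = typalign_to_alignby(typalign);	/* one switch, up front */

	for (int i = 0; i < nitems; i++)
	{
		/* per element: plain arithmetic, no per-iteration switch */
		off = att_nominal_alignby(off, alignby);
		off += sizes[i];
	}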
diff --git a/src/include/access/xlogdefs.h b/src/include/access/xlogdefs.h
index f896dbe149f..d77b894cb65 100644
--- a/src/include/access/xlogdefs.h
+++ b/src/include/access/xlogdefs.h
@@ -44,7 +44,7 @@ typedef uint64 XLogRecPtr;
* To avoid breaking translatable messages, we're directly applying the
* LSN format instead of using a macro.
*/
-#define LSN_FORMAT_ARGS(lsn) (AssertVariableIsOfTypeMacro((lsn), XLogRecPtr), (uint32) ((lsn) >> 32)), ((uint32) (lsn))
+#define LSN_FORMAT_ARGS(lsn) (StaticAssertVariableIsOfTypeMacro((lsn), XLogRecPtr), (uint32) ((lsn) >> 32)), ((uint32) (lsn))
/*
* XLogSegNo - physical log file sequence number.
diff --git a/src/include/c.h b/src/include/c.h
index 48e4087c09c..063eac9808c 100644
--- a/src/include/c.h
+++ b/src/include/c.h
@@ -955,26 +955,26 @@ pg_noreturn extern void ExceptionalCondition(const char *conditionName,
/*
* Compile-time checks that a variable (or expression) has the specified type.
*
- * AssertVariableIsOfType() can be used as a statement.
- * AssertVariableIsOfTypeMacro() is intended for use in macros, eg
- * #define foo(x) (AssertVariableIsOfTypeMacro(x, int), bar(x))
+ * StaticAssertVariableIsOfType() can be used as a declaration.
+ * StaticAssertVariableIsOfTypeMacro() is intended for use in macros, eg
+ * #define foo(x) (StaticAssertVariableIsOfTypeMacro(x, int), bar(x))
*
* If we don't have __builtin_types_compatible_p, we can still assert that
* the types have the same size. This is far from ideal (especially on 32-bit
* platforms) but it provides at least some coverage.
*/
#ifdef HAVE__BUILTIN_TYPES_COMPATIBLE_P
-#define AssertVariableIsOfType(varname, typename) \
- StaticAssertStmt(__builtin_types_compatible_p(__typeof__(varname), typename), \
+#define StaticAssertVariableIsOfType(varname, typename) \
+ StaticAssertDecl(__builtin_types_compatible_p(__typeof__(varname), typename), \
CppAsString(varname) " does not have type " CppAsString(typename))
-#define AssertVariableIsOfTypeMacro(varname, typename) \
+#define StaticAssertVariableIsOfTypeMacro(varname, typename) \
(StaticAssertExpr(__builtin_types_compatible_p(__typeof__(varname), typename), \
CppAsString(varname) " does not have type " CppAsString(typename)))
#else /* !HAVE__BUILTIN_TYPES_COMPATIBLE_P */
-#define AssertVariableIsOfType(varname, typename) \
- StaticAssertStmt(sizeof(varname) == sizeof(typename), \
+#define StaticAssertVariableIsOfType(varname, typename) \
+ StaticAssertDecl(sizeof(varname) == sizeof(typename), \
CppAsString(varname) " does not have type " CppAsString(typename))
-#define AssertVariableIsOfTypeMacro(varname, typename) \
+#define StaticAssertVariableIsOfTypeMacro(varname, typename) \
(StaticAssertExpr(sizeof(varname) == sizeof(typename), \
CppAsString(varname) " does not have type " CppAsString(typename)))
#endif /* HAVE__BUILTIN_TYPES_COMPATIBLE_P */
@@ -1140,6 +1140,12 @@ typedef struct PGAlignedXLogBlock
alignas(PG_IO_ALIGN_SIZE) char data[XLOG_BLCKSZ];
} PGAlignedXLogBlock;
+#else /* (g++ < 9) */
+
+/* Allow these types to be used as abstract types when using old g++ */
+typedef struct PGIOAlignedBlock PGIOAlignedBlock;
+typedef struct PGAlignedXLogBlock PGAlignedXLogBlock;
+
#endif /* !(g++ < 9) */
/* msb for char */
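As an illustration of the rename's payoff, the Decl-based form can now appear
at file scope, which the old statement-based macro could not; the identifiers
below are hypothetical:

	extern XLogRecPtr my_saved_lsn;

	/* file-scope check: compilation fails if the variable's type drifts */
	StaticAssertVariableIsOfType(my_saved_lsn, XLogRecPtr);

	/* expression form, for use inside macros */
	#define MY_LSN_ARGS(lsn) \
		(StaticAssertVariableIsOfTypeMacro((lsn), XLogRecPtr), (lsn))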
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index fb577026666..a910b3d04e6 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -57,6 +57,6 @@
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 202601261
+#define CATALOG_VERSION_NO 202602101
#endif
diff --git a/src/include/catalog/dependency.h b/src/include/catalog/dependency.h
index 969fd8b23f9..2f3c1eae3c7 100644
--- a/src/include/catalog/dependency.h
+++ b/src/include/catalog/dependency.h
@@ -186,6 +186,8 @@ extern long changeDependenciesOn(Oid refClassId, Oid oldRefObjectId,
extern Oid getExtensionOfObject(Oid classId, Oid objectId);
extern List *getAutoExtensionsOfObject(Oid classId, Oid objectId);
+extern Oid getExtensionType(Oid extensionOid, const char *typname);
+
extern bool sequenceIsOwned(Oid seqId, char deptype, Oid *tableId, int32 *colId);
extern List *getOwnedSequences(Oid relid);
extern Oid getIdentitySequence(Relation rel, AttrNumber attnum, bool missing_ok);
diff --git a/src/include/catalog/pg_constraint.h b/src/include/catalog/pg_constraint.h
index 05933cd9741..d5661b5bdff 100644
--- a/src/include/catalog/pg_constraint.h
+++ b/src/include/catalog/pg_constraint.h
@@ -263,7 +263,7 @@ extern HeapTuple findNotNullConstraintAttnum(Oid relid, AttrNumber attnum);
extern HeapTuple findNotNullConstraint(Oid relid, const char *colname);
extern HeapTuple findDomainNotNullConstraint(Oid typid);
extern AttrNumber extractNotNullColumn(HeapTuple constrTup);
-extern bool AdjustNotNullInheritance(Oid relid, AttrNumber attnum,
+extern bool AdjustNotNullInheritance(Oid relid, AttrNumber attnum, const char *new_conname,
bool is_local, bool is_no_inherit, bool is_notvalid);
extern List *RelationGetNotNullConstraints(Oid relid, bool cooked,
bool include_noinh);
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 5e5e33f64fc..83f6501df38 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -11832,9 +11832,9 @@
proparallel => 'u', prorettype => 'void', proargtypes => 'oid',
prosrc => 'binary_upgrade_set_next_pg_tablespace_oid' },
{ oid => '6312', descr => 'for use by pg_upgrade',
- proname => 'binary_upgrade_logical_slot_has_caught_up', provolatile => 'v',
- proparallel => 'u', prorettype => 'bool', proargtypes => 'name',
- prosrc => 'binary_upgrade_logical_slot_has_caught_up' },
+ proname => 'binary_upgrade_check_logical_slot_pending_wal', provolatile => 'v',
+ proparallel => 'u', prorettype => 'pg_lsn', proargtypes => 'name pg_lsn',
+ prosrc => 'binary_upgrade_check_logical_slot_pending_wal' },
{ oid => '6319',
descr => 'for use by pg_upgrade (relation for pg_subscription_rel)',
proname => 'binary_upgrade_add_sub_rel_state', proisstrict => 'f',
diff --git a/src/include/commands/extension.h b/src/include/commands/extension.h
index 4ebc2bac223..7a76bdebcfa 100644
--- a/src/include/commands/extension.h
+++ b/src/include/commands/extension.h
@@ -52,6 +52,8 @@ extern char *get_extension_name(Oid ext_oid);
extern Oid get_extension_schema(Oid ext_oid);
extern bool extension_file_exists(const char *extensionName);
+extern Oid get_function_sibling_type(Oid funcoid, const char *typname);
+
extern ObjectAddress AlterExtensionNamespace(const char *extensionName, const char *newschema,
Oid *oldschema);
diff --git a/src/include/executor/execdebug.h b/src/include/executor/execdebug.h
index 20ac9be0b92..3e110551914 100644
--- a/src/include/executor/execdebug.h
+++ b/src/include/executor/execdebug.h
@@ -34,22 +34,22 @@
* EXEC_NESTLOOPDEBUG is a flag which turns on debugging of the
* nest loop node by NL_printf() and ENL_printf() in nodeNestloop.c
* ----------------
-#undef EXEC_NESTLOOPDEBUG
*/
+/* #define EXEC_NESTLOOPDEBUG */
/* ----------------
* EXEC_SORTDEBUG is a flag which turns on debugging of
* the ExecSort() stuff by SO_printf() in nodeSort.c
* ----------------
-#undef EXEC_SORTDEBUG
*/
+/* #define EXEC_SORTDEBUG */
/* ----------------
* EXEC_MERGEJOINDEBUG is a flag which turns on debugging of
* the ExecMergeJoin() stuff by MJ_printf() in nodeMergejoin.c
* ----------------
-#undef EXEC_MERGEJOINDEBUG
*/
+/* #define EXEC_MERGEJOINDEBUG */
/* ----------------------------------------------------------------
* #defines controlled by above definitions
diff --git a/src/include/lib/ilist.h b/src/include/lib/ilist.h
index d49ec0ffbc5..fc298a6c1d7 100644
--- a/src/include/lib/ilist.h
+++ b/src/include/lib/ilist.h
@@ -591,8 +591,8 @@ dlist_tail_node(dlist_head *head)
* This is used to convert a dlist_node * back to its containing struct.
*/
#define dlist_container(type, membername, ptr) \
- (AssertVariableIsOfTypeMacro(ptr, dlist_node *), \
- AssertVariableIsOfTypeMacro(((type *) NULL)->membername, dlist_node), \
+ (StaticAssertVariableIsOfTypeMacro(ptr, dlist_node *), \
+ StaticAssertVariableIsOfTypeMacro(((type *) NULL)->membername, dlist_node), \
((type *) ((char *) (ptr) - offsetof(type, membername))))
/*
@@ -601,7 +601,7 @@ dlist_tail_node(dlist_head *head)
* The list must not be empty.
*/
#define dlist_head_element(type, membername, lhead) \
- (AssertVariableIsOfTypeMacro(((type *) NULL)->membername, dlist_node), \
+ (StaticAssertVariableIsOfTypeMacro(((type *) NULL)->membername, dlist_node), \
(type *) dlist_head_element_off(lhead, offsetof(type, membername)))
/*
@@ -610,7 +610,7 @@ dlist_tail_node(dlist_head *head)
* The list must not be empty.
*/
#define dlist_tail_element(type, membername, lhead) \
- (AssertVariableIsOfTypeMacro(((type *) NULL)->membername, dlist_node), \
+ (StaticAssertVariableIsOfTypeMacro(((type *) NULL)->membername, dlist_node), \
((type *) dlist_tail_element_off(lhead, offsetof(type, membername))))
/*
@@ -621,8 +621,8 @@ dlist_tail_node(dlist_head *head)
* It is *not* allowed to manipulate the list during iteration.
*/
#define dlist_foreach(iter, lhead) \
- for (AssertVariableIsOfTypeMacro(iter, dlist_iter), \
- AssertVariableIsOfTypeMacro(lhead, dlist_head *), \
+ for (StaticAssertVariableIsOfTypeMacro(iter, dlist_iter), \
+ StaticAssertVariableIsOfTypeMacro(lhead, dlist_head *), \
(iter).end = &(lhead)->head, \
(iter).cur = (iter).end->next ? (iter).end->next : (iter).end; \
(iter).cur != (iter).end; \
@@ -638,8 +638,8 @@ dlist_tail_node(dlist_head *head)
* fine to insert or delete adjacent nodes.
*/
#define dlist_foreach_modify(iter, lhead) \
- for (AssertVariableIsOfTypeMacro(iter, dlist_mutable_iter), \
- AssertVariableIsOfTypeMacro(lhead, dlist_head *), \
+ for (StaticAssertVariableIsOfTypeMacro(iter, dlist_mutable_iter), \
+ StaticAssertVariableIsOfTypeMacro(lhead, dlist_head *), \
(iter).end = &(lhead)->head, \
(iter).cur = (iter).end->next ? (iter).end->next : (iter).end, \
(iter).next = (iter).cur->next; \
@@ -652,8 +652,8 @@ dlist_tail_node(dlist_head *head)
* It is *not* allowed to manipulate the list during iteration.
*/
#define dlist_reverse_foreach(iter, lhead) \
- for (AssertVariableIsOfTypeMacro(iter, dlist_iter), \
- AssertVariableIsOfTypeMacro(lhead, dlist_head *), \
+ for (StaticAssertVariableIsOfTypeMacro(iter, dlist_iter), \
+ StaticAssertVariableIsOfTypeMacro(lhead, dlist_head *), \
(iter).end = &(lhead)->head, \
(iter).cur = (iter).end->prev ? (iter).end->prev : (iter).end; \
(iter).cur != (iter).end; \
@@ -953,7 +953,7 @@ dclist_count(const dclist_head *head)
* The list must not be empty.
*/
#define dclist_head_element(type, membername, lhead) \
- (AssertVariableIsOfTypeMacro(((type *) NULL)->membername, dlist_node), \
+ (StaticAssertVariableIsOfTypeMacro(((type *) NULL)->membername, dlist_node), \
(type *) dclist_head_element_off(lhead, offsetof(type, membername)))
/*
@@ -962,7 +962,7 @@ dclist_count(const dclist_head *head)
* The list must not be empty.
*/
#define dclist_tail_element(type, membername, lhead) \
- (AssertVariableIsOfTypeMacro(((type *) NULL)->membername, dlist_node), \
+ (StaticAssertVariableIsOfTypeMacro(((type *) NULL)->membername, dlist_node), \
((type *) dclist_tail_element_off(lhead, offsetof(type, membername))))
@@ -1104,8 +1104,8 @@ slist_delete_current(slist_mutable_iter *iter)
* This is used to convert a slist_node * back to its containing struct.
*/
#define slist_container(type, membername, ptr) \
- (AssertVariableIsOfTypeMacro(ptr, slist_node *), \
- AssertVariableIsOfTypeMacro(((type *) NULL)->membername, slist_node), \
+ (StaticAssertVariableIsOfTypeMacro(ptr, slist_node *), \
+ StaticAssertVariableIsOfTypeMacro(((type *) NULL)->membername, slist_node), \
((type *) ((char *) (ptr) - offsetof(type, membername))))
/*
@@ -1114,7 +1114,7 @@ slist_delete_current(slist_mutable_iter *iter)
* The list must not be empty.
*/
#define slist_head_element(type, membername, lhead) \
- (AssertVariableIsOfTypeMacro(((type *) NULL)->membername, slist_node), \
+ (StaticAssertVariableIsOfTypeMacro(((type *) NULL)->membername, slist_node), \
(type *) slist_head_element_off(lhead, offsetof(type, membername)))
/*
@@ -1130,8 +1130,8 @@ slist_delete_current(slist_mutable_iter *iter)
* not safe.)
*/
#define slist_foreach(iter, lhead) \
- for (AssertVariableIsOfTypeMacro(iter, slist_iter), \
- AssertVariableIsOfTypeMacro(lhead, slist_head *), \
+ for (StaticAssertVariableIsOfTypeMacro(iter, slist_iter), \
+ StaticAssertVariableIsOfTypeMacro(lhead, slist_head *), \
(iter).cur = (lhead)->head.next; \
(iter).cur != NULL; \
(iter).cur = (iter).cur->next)
@@ -1146,8 +1146,8 @@ slist_delete_current(slist_mutable_iter *iter)
* deletion of nodes adjacent to the current node would misbehave.
*/
#define slist_foreach_modify(iter, lhead) \
- for (AssertVariableIsOfTypeMacro(iter, slist_mutable_iter), \
- AssertVariableIsOfTypeMacro(lhead, slist_head *), \
+ for (StaticAssertVariableIsOfTypeMacro(iter, slist_mutable_iter), \
+ StaticAssertVariableIsOfTypeMacro(lhead, slist_head *), \
(iter).prev = &(lhead)->head, \
(iter).cur = (iter).prev->next, \
(iter).next = (iter).cur ? (iter).cur->next : NULL; \
diff --git a/src/include/lib/pairingheap.h b/src/include/lib/pairingheap.h
index b93ea5b638d..f1582c98626 100644
--- a/src/include/lib/pairingheap.h
+++ b/src/include/lib/pairingheap.h
@@ -41,16 +41,16 @@ typedef struct pairingheap_node
* This is used to convert a pairingheap_node * back to its containing struct.
*/
#define pairingheap_container(type, membername, ptr) \
- (AssertVariableIsOfTypeMacro(ptr, pairingheap_node *), \
- AssertVariableIsOfTypeMacro(((type *) NULL)->membername, pairingheap_node), \
+ (StaticAssertVariableIsOfTypeMacro(ptr, pairingheap_node *), \
+ StaticAssertVariableIsOfTypeMacro(((type *) NULL)->membername, pairingheap_node), \
((type *) ((char *) (ptr) - offsetof(type, membername))))
/*
* Like pairingheap_container, but used when the pointer is 'const ptr'
*/
#define pairingheap_const_container(type, membername, ptr) \
- (AssertVariableIsOfTypeMacro(ptr, const pairingheap_node *), \
- AssertVariableIsOfTypeMacro(((type *) NULL)->membername, pairingheap_node), \
+ (StaticAssertVariableIsOfTypeMacro(ptr, const pairingheap_node *), \
+ StaticAssertVariableIsOfTypeMacro(((type *) NULL)->membername, pairingheap_node), \
((const type *) ((const char *) (ptr) - offsetof(type, membername))))
/*
diff --git a/src/include/lib/sort_template.h b/src/include/lib/sort_template.h
index e02aa73cd4d..22b2092d03b 100644
--- a/src/include/lib/sort_template.h
+++ b/src/include/lib/sort_template.h
@@ -311,6 +311,14 @@ ST_SORT(ST_ELEMENT_TYPE * data, size_t n
DO_CHECK_FOR_INTERRUPTS();
if (n < 7)
{
+ /*
+ * Not strictly necessary for sorting (the loop below does nothing for
+ * n < 2 anyway), but a caller may pass a NULL pointer with zero length,
+ * and returning early silences warnings about applying offsets to a
+ * NULL pointer.
+ */
+ if (n < 2)
+ return;
+
for (pm = a + ST_POINTER_STEP; pm < a + n * ST_POINTER_STEP;
pm += ST_POINTER_STEP)
for (pl = pm; pl > a && DO_COMPARE(pl - ST_POINTER_STEP, pl) > 0;
@@ -387,29 +395,23 @@ ST_SORT(ST_ELEMENT_TYPE * data, size_t n
if (d1 <= d2)
{
/* Recurse on left partition, then iterate on right partition */
- if (d1 > ST_POINTER_STEP)
- DO_SORT(a, d1 / ST_POINTER_STEP);
- if (d2 > ST_POINTER_STEP)
- {
- /* Iterate rather than recurse to save stack space */
- /* DO_SORT(pn - d2, d2 / ST_POINTER_STEP) */
- a = pn - d2;
- n = d2 / ST_POINTER_STEP;
- goto loop;
- }
+ DO_SORT(a, d1 / ST_POINTER_STEP);
+
+ /* Iterate rather than recurse to save stack space */
+ /* DO_SORT(pn - d2, d2 / ST_POINTER_STEP) */
+ a = pn - d2;
+ n = d2 / ST_POINTER_STEP;
+ goto loop;
}
else
{
/* Recurse on right partition, then iterate on left partition */
- if (d2 > ST_POINTER_STEP)
- DO_SORT(pn - d2, d2 / ST_POINTER_STEP);
- if (d1 > ST_POINTER_STEP)
- {
- /* Iterate rather than recurse to save stack space */
- /* DO_SORT(a, d1 / ST_POINTER_STEP) */
- n = d1 / ST_POINTER_STEP;
- goto loop;
- }
+ DO_SORT(pn - d2, d2 / ST_POINTER_STEP);
+
+ /* Iterate rather than recurse to save stack space */
+ /* DO_SORT(a, d1 / ST_POINTER_STEP) */
+ n = d1 / ST_POINTER_STEP;
+ goto loop;
}
}
#endif
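The rewritten control flow is the classic stack-bounding quicksort trick:
recurse only into the smaller partition and loop on the larger one, so
recursion depth stays O(log n) even for adversarial input. A standalone
sketch of the shape, with a hypothetical partition() that splits the array
into two nonempty halves a[0..p) and a[p..n), 1 <= p < n:

	static void
	sort_sketch(int *a, size_t n)
	{
	loop:
		if (n < 2)
			return;				/* mirrors the new n < 2 early return */

		{
			size_t		p = partition(a, n);	/* hypothetical, 1 <= p < n */
			size_t		d1 = p;
			size_t		d2 = n - p;

			if (d1 <= d2)
			{
				sort_sketch(a, d1);	/* recurse on the smaller side */
				a += p;				/* iterate on the larger side */
				n = d2;
			}
			else
			{
				sort_sketch(a + p, d2);
				n = d1;
			}
			goto loop;
		}
	}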
diff --git a/src/include/libpq/pqcomm.h b/src/include/libpq/pqcomm.h
index 1bbe5b9ee45..a29c9c94d79 100644
--- a/src/include/libpq/pqcomm.h
+++ b/src/include/libpq/pqcomm.h
@@ -104,6 +104,16 @@ is_unixsock_path(const char *path)
*/
#define PG_PROTOCOL_RESERVED_31 PG_PROTOCOL(3,1)
+/*
+ * PG_PROTOCOL_GREASE is an intentionally unsupported protocol version used
+ * for "greasing" (the practice of sending valid, but extraneous or otherwise
+ * unusual, messages to keep peer implementations honest). This helps ensure
+ * that servers properly implement protocol version negotiation. Version 3.9999
+ * was chosen since it is safely within the valid range, it is representable
+ * via PQfullProtocolVersion, and it is unlikely to ever be needed in practice.
+ */
+#define PG_PROTOCOL_GREASE PG_PROTOCOL(3,9999)
+
/*
* A client can send a cancel-current-operation request to the postmaster.
* This is uglier than sending it directly to the client's backend, but it
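For reference, PG_PROTOCOL packs the major version into the high 16 bits and
the minor version into the low 16 bits, so 3.9999 still sorts below every
4.x version during negotiation:

	/* pqcomm.h: #define PG_PROTOCOL(m,n)  (((m) << 16) | (n)) */
	StaticAssertDecl(PG_PROTOCOL(3, 9999) < PG_PROTOCOL(4, 0),
					 "grease version must sort below protocol 4.0");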
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index a5b7b49e4b5..e1655fe61d6 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -695,7 +695,14 @@ extern int pg_char_and_wchar_strcmp(const char *s1, const pg_wchar *s2);
extern int pg_wchar_strncmp(const pg_wchar *s1, const pg_wchar *s2, size_t n);
extern int pg_char_and_wchar_strncmp(const char *s1, const pg_wchar *s2, size_t n);
extern size_t pg_wchar_strlen(const pg_wchar *str);
+extern int pg_mblen_cstr(const char *mbstr);
+extern int pg_mblen_range(const char *mbstr, const char *end);
+extern int pg_mblen_with_len(const char *mbstr, int limit);
+extern int pg_mblen_unbounded(const char *mbstr);
+
+/* deprecated */
extern int pg_mblen(const char *mbstr);
+
extern int pg_dsplen(const char *mbstr);
extern int pg_mbstrlen(const char *mbstr);
extern int pg_mbstrlen_with_len(const char *mbstr, int limit);
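A sketch of the intended division of labor among the new variants:
pg_mblen_cstr() is for known NUL-terminated strings, while pg_mblen_range()
bounds the read with an explicit end pointer, so it can safely walk a counted
buffer. The buffer variables here are hypothetical, and this assumes (as with
pg_mblen) a positive return value even for an incomplete trailing sequence:

	const char *p = data;			/* counted buffer, not NUL-terminated */
	const char *end = data + datalen;

	while (p < end)
	{
		int			clen = pg_mblen_range(p, end);	/* never reads past end */

		/* ... inspect the character starting at p ... */
		p += clen;
	}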
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index fb808823acf..27758ec16fe 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -208,6 +208,9 @@ typedef struct PlannerGlobal
/* "flat" list of RTEPermissionInfos */
List *finalrteperminfos;
+ /* list of SubPlanRTInfo nodes */
+ List *subrtinfos;
+
/* "flat" list of PlanRowMarks */
List *finalrowmarks;
@@ -229,6 +232,9 @@ typedef struct PlannerGlobal
/* type OIDs for PARAM_EXEC Params */
List *paramExecTypes;
+ /* info about nodes elided from the plan during setrefs processing */
+ List *elidedNodes;
+
/* highest PlaceHolderVar ID assigned */
Index lastPHId;
@@ -1406,6 +1412,8 @@ typedef struct IndexOptInfo
bool nullsnotdistinct;
/* is uniqueness enforced immediately? */
bool immediate;
+ /* true if paths using this index should be marked disabled */
+ bool disabled;
/* true if index doesn't really exist */
bool hypothetical;
@@ -2244,6 +2252,12 @@ typedef struct CustomPath
* For partial Append, 'subpaths' contains non-partial subpaths followed by
* partial subpaths.
*
+ * Whenever accumulate_append_subpath() allows us to consolidate multiple
+ * levels of Append paths down to one, we store the RTI sets for the omitted
+ * paths in child_append_relid_sets. This is not necessary for planning or
+ * execution; we do it for the benefit of code that wants to inspect the
+ * final plan and understand how it came to be.
+ *
* Note: it is possible for "subpaths" to contain only one, or even no,
* elements. These cases are optimized during create_append_plan.
* In particular, an AppendPath with no subpaths is a "dummy" path that
@@ -2259,6 +2273,7 @@ typedef struct AppendPath
/* Index of first partial path in subpaths; list_length(subpaths) if none */
int first_partial_path;
Cardinality limit_tuples; /* hard limit on output tuples, or -1 */
+ List *child_append_relid_sets;
} AppendPath;
#define IS_DUMMY_APPEND(p) \
@@ -2275,12 +2290,15 @@ extern bool is_dummy_rel(RelOptInfo *rel);
/*
* MergeAppendPath represents a MergeAppend plan, ie, the merging of sorted
* results from several member plans to produce similarly-sorted output.
+ *
+ * child_append_relid_sets has the same meaning here as for AppendPath.
*/
typedef struct MergeAppendPath
{
Path path;
List *subpaths; /* list of component Paths */
Cardinality limit_tuples; /* hard limit on output tuples, or -1 */
+ List *child_append_relid_sets;
} MergeAppendPath;
/*
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index 4bc6fb5670e..485bec5aabd 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -131,6 +131,9 @@ typedef struct PlannedStmt
*/
List *subplans;
+ /* a list of SubPlanRTInfo objects */
+ List *subrtinfos;
+
/* indices of subplans that require REWIND */
Bitmapset *rewindPlanIDs;
@@ -149,6 +152,9 @@ typedef struct PlannedStmt
/* non-null if this is utility stmt */
Node *utilityStmt;
+ /* info about nodes elided from the plan during setrefs processing */
+ List *elidedNodes;
+
/*
* DefElem objects added by extensions, e.g. using planner_shutdown_hook
*
@@ -388,9 +394,16 @@ struct PartitionPruneInfo; /* forward reference to struct below */
typedef struct Append
{
Plan plan;
+
/* RTIs of appendrel(s) formed by this node */
Bitmapset *apprelids;
+
+ /* sets of RTIs of appendrels consolidated into this node */
+ List *child_append_relid_sets;
+
+ /* plans to run */
List *appendplans;
+
/* # of asynchronous plans */
int nasyncplans;
@@ -420,6 +433,10 @@ typedef struct MergeAppend
/* RTIs of appendrel(s) formed by this node */
Bitmapset *apprelids;
+ /* sets of RTIs of appendrels consolidated into this node */
+ List *child_append_relid_sets;
+
+ /* plans to run */
List *mergeplans;
/* these fields are just like the sort-key info in struct Sort: */
@@ -1821,4 +1838,35 @@ typedef enum MonotonicFunction
MONOTONICFUNC_BOTH = MONOTONICFUNC_INCREASING | MONOTONICFUNC_DECREASING,
} MonotonicFunction;
+/*
+ * SubPlanRTInfo
+ *
+ * Information about which range table entries came from which subquery
+ * planning cycles.
+ */
+typedef struct SubPlanRTInfo
+{
+ NodeTag type;
+ char *plan_name;
+ Index rtoffset;
+ bool dummy;
+} SubPlanRTInfo;
+
+/*
+ * ElidedNode
+ *
+ * Information about nodes elided from the final plan tree: trivial subquery
+ * scans, and single-child Append and MergeAppend nodes.
+ *
+ * plan_node_id is that of the surviving plan node, the sole child of the
+ * one which was elided.
+ */
+typedef struct ElidedNode
+{
+ NodeTag type;
+ int plan_node_id;
+ NodeTag elided_type;
+ Bitmapset *relids;
+} ElidedNode;
+
#endif /* PLANNODES_H */
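Since elidedNodes exists purely so that plan-inspection code can reconstruct
what happened, a hypothetical consumer (no such consumer is part of this
patch) might walk a finished PlannedStmt like this:

	ListCell   *lc;

	foreach(lc, stmt->elidedNodes)
	{
		ElidedNode *en = (ElidedNode *) lfirst(lc);

		elog(DEBUG1, "node of type %d was elided above plan node %d",
			 (int) en->elided_type, en->plan_node_id);
	}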
diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h
index 224750859c3..938510400cc 100644
--- a/src/include/optimizer/pathnode.h
+++ b/src/include/optimizer/pathnode.h
@@ -17,6 +17,20 @@
#include "nodes/bitmapset.h"
#include "nodes/pathnodes.h"
+/*
+ * Everything in subpaths or partial_subpaths will become part of the
+ * Append node's subpaths list. Partial and non-partial subpaths can be
+ * mixed in the same Append node only if it is parallel-aware.
+ *
+ * See the comments for AppendPath for the meaning and purpose of the
+ * child_append_relid_sets field.
+ */
+typedef struct AppendPathInput
+{
+ List *subpaths;
+ List *partial_subpaths;
+ List *child_append_relid_sets;
+} AppendPathInput;
/* Hook for plugins to get control during joinrel setup */
typedef void (*joinrel_setup_hook_type) (PlannerInfo *root,
@@ -41,7 +55,7 @@ extern bool add_path_precheck(RelOptInfo *parent_rel, int disabled_nodes,
List *pathkeys, Relids required_outer);
extern void add_partial_path(RelOptInfo *parent_rel, Path *new_path);
extern bool add_partial_path_precheck(RelOptInfo *parent_rel,
- int disabled_nodes,
+ int disabled_nodes, Cost startup_cost,
Cost total_cost, List *pathkeys);
extern Path *create_seqscan_path(PlannerInfo *root, RelOptInfo *rel,
@@ -78,14 +92,16 @@ extern TidRangePath *create_tidrangescan_path(PlannerInfo *root,
List *tidrangequals,
Relids required_outer,
int parallel_workers);
+
extern AppendPath *create_append_path(PlannerInfo *root, RelOptInfo *rel,
- List *subpaths, List *partial_subpaths,
+ AppendPathInput input,
List *pathkeys, Relids required_outer,
int parallel_workers, bool parallel_aware,
double rows);
extern MergeAppendPath *create_merge_append_path(PlannerInfo *root,
RelOptInfo *rel,
List *subpaths,
+ List *child_append_relid_sets,
List *pathkeys,
Relids required_outer);
extern GroupResultPath *create_group_result_path(PlannerInfo *root,
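With the new signature, a call site first bundles its subpath lists into an
AppendPathInput; a sketch of the minimal conversion for an existing caller
(the surrounding variables are placeholders):

	AppendPathInput input = {0};

	input.subpaths = subpaths;		/* previously passed directly */
	input.partial_subpaths = NIL;
	input.child_append_relid_sets = NIL;	/* nothing consolidated here */

	apath = create_append_path(root, rel, input,
							   NIL,		/* pathkeys */
							   NULL,	/* required_outer */
							   0,		/* parallel_workers */
							   false,	/* parallel_aware */
							   -1);		/* rows: let costing estimate */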
diff --git a/src/include/optimizer/planner.h b/src/include/optimizer/planner.h
index ae3f7f2edb6..80509773c01 100644
--- a/src/include/optimizer/planner.h
+++ b/src/include/optimizer/planner.h
@@ -35,6 +35,7 @@ extern PGDLLIMPORT planner_hook_type planner_hook;
/* Hook for plugins to get control after PlannerGlobal is initialized */
typedef void (*planner_setup_hook_type) (PlannerGlobal *glob, Query *parse,
const char *query_string,
+ int cursorOptions,
double *tuple_fraction,
ExplainState *es);
extern PGDLLIMPORT planner_setup_hook_type planner_setup_hook;
diff --git a/src/include/postgres.h b/src/include/postgres.h
index 8b92f453e7a..a7a6584e762 100644
--- a/src/include/postgres.h
+++ b/src/include/postgres.h
@@ -533,9 +533,9 @@ Float8GetDatum(float8 X)
*/
#define Int64GetDatumFast(X) \
- (AssertVariableIsOfTypeMacro(X, int64), Int64GetDatum(X))
+ (StaticAssertVariableIsOfTypeMacro(X, int64), Int64GetDatum(X))
#define Float8GetDatumFast(X) \
- (AssertVariableIsOfTypeMacro(X, double), Float8GetDatum(X))
+ (StaticAssertVariableIsOfTypeMacro(X, double), Float8GetDatum(X))
/* ----------------------------------------------------------------
diff --git a/src/include/postmaster/proctypelist.h b/src/include/postmaster/proctypelist.h
index 0b99eaabfd0..4e259e84c2d 100644
--- a/src/include/postmaster/proctypelist.h
+++ b/src/include/postmaster/proctypelist.h
@@ -25,27 +25,27 @@
*/
/*
- * List of process types (symbol, description, Main function, shmem_attach)
- * entries.
+ * List of process types (symbol, category, description, Main function,
+ * shmem_attach) entries.
*/
-/* bktype, description, main_func, shmem_attach */
-PG_PROCTYPE(B_ARCHIVER, gettext_noop("archiver"), PgArchiverMain, true)
-PG_PROCTYPE(B_AUTOVAC_LAUNCHER, gettext_noop("autovacuum launcher"), AutoVacLauncherMain, true)
-PG_PROCTYPE(B_AUTOVAC_WORKER, gettext_noop("autovacuum worker"), AutoVacWorkerMain, true)
-PG_PROCTYPE(B_BACKEND, gettext_noop("client backend"), BackendMain, true)
-PG_PROCTYPE(B_BG_WORKER, gettext_noop("background worker"), BackgroundWorkerMain, true)
-PG_PROCTYPE(B_BG_WRITER, gettext_noop("background writer"), BackgroundWriterMain, true)
-PG_PROCTYPE(B_CHECKPOINTER, gettext_noop("checkpointer"), CheckpointerMain, true)
-PG_PROCTYPE(B_DEAD_END_BACKEND, gettext_noop("dead-end client backend"), BackendMain, true)
-PG_PROCTYPE(B_INVALID, gettext_noop("unrecognized"), NULL, false)
-PG_PROCTYPE(B_IO_WORKER, gettext_noop("io worker"), IoWorkerMain, true)
-PG_PROCTYPE(B_LOGGER, gettext_noop("syslogger"), SysLoggerMain, false)
-PG_PROCTYPE(B_SLOTSYNC_WORKER, gettext_noop("slotsync worker"), ReplSlotSyncWorkerMain, true)
-PG_PROCTYPE(B_STANDALONE_BACKEND, gettext_noop("standalone backend"), NULL, false)
-PG_PROCTYPE(B_STARTUP, gettext_noop("startup"), StartupProcessMain, true)
-PG_PROCTYPE(B_WAL_RECEIVER, gettext_noop("walreceiver"), WalReceiverMain, true)
-PG_PROCTYPE(B_WAL_SENDER, gettext_noop("walsender"), NULL, true)
-PG_PROCTYPE(B_WAL_SUMMARIZER, gettext_noop("walsummarizer"), WalSummarizerMain, true)
-PG_PROCTYPE(B_WAL_WRITER, gettext_noop("walwriter"), WalWriterMain, true)
+/* bktype, bkcategory, description, main_func, shmem_attach */
+PG_PROCTYPE(B_ARCHIVER, "archiver", gettext_noop("archiver"), PgArchiverMain, true)
+PG_PROCTYPE(B_AUTOVAC_LAUNCHER, "autovacuum", gettext_noop("autovacuum launcher"), AutoVacLauncherMain, true)
+PG_PROCTYPE(B_AUTOVAC_WORKER, "autovacuum", gettext_noop("autovacuum worker"), AutoVacWorkerMain, true)
+PG_PROCTYPE(B_BACKEND, "backend", gettext_noop("client backend"), BackendMain, true)
+PG_PROCTYPE(B_BG_WORKER, "bgworker", gettext_noop("background worker"), BackgroundWorkerMain, true)
+PG_PROCTYPE(B_BG_WRITER, "bgwriter", gettext_noop("background writer"), BackgroundWriterMain, true)
+PG_PROCTYPE(B_CHECKPOINTER, "checkpointer", gettext_noop("checkpointer"), CheckpointerMain, true)
+PG_PROCTYPE(B_DEAD_END_BACKEND, "backend", gettext_noop("dead-end client backend"), BackendMain, true)
+PG_PROCTYPE(B_INVALID, "postmaster", gettext_noop("unrecognized"), NULL, false)
+PG_PROCTYPE(B_IO_WORKER, "ioworker", gettext_noop("io worker"), IoWorkerMain, true)
+PG_PROCTYPE(B_LOGGER, "syslogger", gettext_noop("syslogger"), SysLoggerMain, false)
+PG_PROCTYPE(B_SLOTSYNC_WORKER, "slotsyncworker", gettext_noop("slotsync worker"), ReplSlotSyncWorkerMain, true)
+PG_PROCTYPE(B_STANDALONE_BACKEND, "backend", gettext_noop("standalone backend"), NULL, false)
+PG_PROCTYPE(B_STARTUP, "startup", gettext_noop("startup"), StartupProcessMain, true)
+PG_PROCTYPE(B_WAL_RECEIVER, "walreceiver", gettext_noop("walreceiver"), WalReceiverMain, true)
+PG_PROCTYPE(B_WAL_SENDER, "walsender", gettext_noop("walsender"), NULL, true)
+PG_PROCTYPE(B_WAL_SUMMARIZER, "walsummarizer", gettext_noop("walsummarizer"), WalSummarizerMain, true)
+PG_PROCTYPE(B_WAL_WRITER, "walwriter", gettext_noop("walwriter"), WalWriterMain, true)
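This header is an x-macro list: each consumer defines PG_PROCTYPE to pick out
the columns it needs and then includes the file. A sketch of a hypothetical
consumer of the new category column:

	/* hypothetical: build a bktype -> category lookup table */
	#define PG_PROCTYPE(bktype, bkcategory, description, main_func, shmem_attach) \
		[bktype] = bkcategory,

	static const char *const proc_type_categories[] = {
	#include "postmaster/proctypelist.h"
	};

	#undef PG_PROCTYPE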
diff --git a/src/include/replication/logical.h b/src/include/replication/logical.h
index 7f03537bda7..bc9d4ece672 100644
--- a/src/include/replication/logical.h
+++ b/src/include/replication/logical.h
@@ -148,7 +148,8 @@ extern bool filter_by_origin_cb_wrapper(LogicalDecodingContext *ctx, ReplOriginI
extern void ResetLogicalStreamingState(void);
extern void UpdateDecodingStats(LogicalDecodingContext *ctx);
-extern bool LogicalReplicationSlotHasPendingWal(XLogRecPtr end_of_wal);
+extern XLogRecPtr LogicalReplicationSlotCheckPendingWal(XLogRecPtr end_of_wal,
+ XLogRecPtr scan_cutoff_lsn);
extern XLogRecPtr LogicalSlotAdvanceAndCheckSnapState(XLogRecPtr moveto,
bool *found_consistent_snapshot);
diff --git a/src/include/replication/slot.h b/src/include/replication/slot.h
index f465e430cc6..72f8be629f3 100644
--- a/src/include/replication/slot.h
+++ b/src/include/replication/slot.h
@@ -185,8 +185,11 @@ typedef struct ReplicationSlot
/* is this slot defined */
bool in_use;
- /* Who is streaming out changes for this slot? 0 in unused slots. */
- pid_t active_pid;
+ /*
+ * Who is streaming out changes for this slot? INVALID_PROC_NUMBER in
+ * unused slots.
+ */
+ ProcNumber active_proc;
/* any outstanding modifications? */
bool just_dirtied;
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 413233bcd39..8ac466fd346 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -55,12 +55,23 @@ typedef int File;
#define IO_DIRECT_WAL 0x02
#define IO_DIRECT_WAL_INIT 0x04
+enum FileExtendMethod
+{
+#ifdef HAVE_POSIX_FALLOCATE
+ FILE_EXTEND_METHOD_POSIX_FALLOCATE,
+#endif
+ FILE_EXTEND_METHOD_WRITE_ZEROS,
+};
+
+/* Default to the first available file_extend_method. */
+#define DEFAULT_FILE_EXTEND_METHOD 0
/* GUC parameter */
extern PGDLLIMPORT int max_files_per_process;
extern PGDLLIMPORT bool data_sync_retry;
extern PGDLLIMPORT int recovery_init_sync_method;
extern PGDLLIMPORT int io_direct_flags;
+extern PGDLLIMPORT int file_extend_method;
/*
* This is private to fd.c, but exported for save/restore_backend_variables()
diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h
index 3aeada554b2..10c7b065861 100644
--- a/src/include/storage/pg_shmem.h
+++ b/src/include/storage/pg_shmem.h
@@ -32,9 +32,9 @@ typedef struct PGShmemHeader /* standard header for all Postgres shmem */
#define PGShmemMagic 679834894
pid_t creatorPID; /* PID of creating process (set but unread) */
Size totalsize; /* total size of segment */
- Size freeoffset; /* offset to first free space */
+ Size content_offset; /* offset to the data, i.e. size of this
+ * header */
dsm_handle dsm_control; /* ID of dynamic shared memory control seg */
- void *index; /* pointer to ShmemIndex table */
#ifndef WIN32 /* Windows doesn't have useful inode#s */
dev_t device; /* device data directory is on */
ino_t inode; /* inode number of data directory */
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index 039bc8353be..ac0df4aeaaa 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -17,6 +17,7 @@
#include "access/clog.h"
#include "access/xlogdefs.h"
#include "lib/ilist.h"
+#include "miscadmin.h"
#include "storage/latch.h"
#include "storage/lock.h"
#include "storage/pg_sema.h"
@@ -166,7 +167,7 @@ typedef enum
* but its myProcLocks[] lists are valid.
*
* We allow many fields of this struct to be accessed without locks, such as
- * delayChkptFlags and isRegularBackend. However, keep in mind that writing
+ * delayChkptFlags and backendType. However, keep in mind that writing
* mirrored ones (see below) requires holding ProcArrayLock or XidGenLock in
* at least shared mode, so that pgxactoff does not change concurrently.
*
@@ -233,14 +234,17 @@ struct PGPROC
Oid tempNamespaceId; /* OID of temp schema this backend is
* using */
- bool isRegularBackend; /* true if it's a regular backend. */
+ BackendType backendType; /* what kind of process is this? */
/*
* While in hot standby mode, shows that a conflict signal has been sent
* for the current transaction. Set/cleared while holding ProcArrayLock,
* though not required. Accessed without lock, if needed.
+ *
+ * This is a bitmask; each bit corresponds to a RecoveryConflictReason
+ * enum value.
*/
- bool recoveryConflictPending;
+ pg_atomic_uint32 pendingRecoveryConflicts;
/*
* Info about LWLock the process is currently waiting for, if any.
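Because pendingRecoveryConflicts is a bitmask over RecoveryConflictReason
values, signal senders and recipients must use the atomics API; a sketch of
the expected access pattern (whether the recipient clears the mask via an
exchange is an assumption here):

	/* sender: record the reason, then send PROCSIG_RECOVERY_CONFLICT */
	pg_atomic_fetch_or_u32(&proc->pendingRecoveryConflicts,
						   1u << RECOVERY_CONFLICT_TABLESPACE);

	/* recipient: atomically fetch and clear the accumulated reasons */
	uint32		reasons = pg_atomic_exchange_u32(&MyProc->pendingRecoveryConflicts, 0);

	if (reasons & (1u << RECOVERY_CONFLICT_TABLESPACE))
	{
		/* ... handle the tablespace conflict ... */
	}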
diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h
index da7b5e78d30..c5ab1574fe3 100644
--- a/src/include/storage/procarray.h
+++ b/src/include/storage/procarray.h
@@ -77,14 +77,15 @@ extern VirtualTransactionId *GetCurrentVirtualXIDs(TransactionId limitXmin,
bool excludeXmin0, bool allDbs, int excludeVacuum,
int *nvxids);
extern VirtualTransactionId *GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid);
-extern pid_t CancelVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode);
-extern pid_t SignalVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode,
- bool conflictPending);
+
+extern bool SignalRecoveryConflict(PGPROC *proc, pid_t pid, RecoveryConflictReason reason);
+extern bool SignalRecoveryConflictWithVirtualXID(VirtualTransactionId vxid, RecoveryConflictReason reason);
+extern void SignalRecoveryConflictWithDatabase(Oid databaseid, RecoveryConflictReason reason);
+
extern bool MinimumActiveBackends(int min);
extern int CountDBBackends(Oid databaseid);
extern int CountDBConnections(Oid databaseid);
-extern void CancelDBBackends(Oid databaseid, ProcSignalReason sigmode, bool conflictPending);
extern int CountUserBackends(Oid roleid);
extern bool CountOtherDBBackends(Oid databaseId,
int *nbackends, int *nprepared);
diff --git a/src/include/storage/proclist.h b/src/include/storage/proclist.h
index 965609145e4..9caf109a845 100644
--- a/src/include/storage/proclist.h
+++ b/src/include/storage/proclist.h
@@ -204,8 +204,8 @@ proclist_pop_head_node_offset(proclist_head *list, size_t node_offset)
* node with proclist_delete(list, iter.cur, node_offset).
*/
#define proclist_foreach_modify(iter, lhead, link_member) \
- for (AssertVariableIsOfTypeMacro(iter, proclist_mutable_iter), \
- AssertVariableIsOfTypeMacro(lhead, proclist_head *), \
+ for (StaticAssertVariableIsOfTypeMacro(iter, proclist_mutable_iter), \
+ StaticAssertVariableIsOfTypeMacro(lhead, proclist_head *), \
(iter).cur = (lhead)->head, \
(iter).next = (iter).cur == INVALID_PROC_NUMBER ? INVALID_PROC_NUMBER : \
proclist_node_get((iter).cur, \
diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h
index e52b8eb7697..348fba53a93 100644
--- a/src/include/storage/procsignal.h
+++ b/src/include/storage/procsignal.h
@@ -36,20 +36,12 @@ typedef enum
PROCSIG_BARRIER, /* global barrier interrupt */
PROCSIG_LOG_MEMORY_CONTEXT, /* ask backend to log the memory contexts */
PROCSIG_PARALLEL_APPLY_MESSAGE, /* Message from parallel apply workers */
-
- /* Recovery conflict reasons */
- PROCSIG_RECOVERY_CONFLICT_FIRST,
- PROCSIG_RECOVERY_CONFLICT_DATABASE = PROCSIG_RECOVERY_CONFLICT_FIRST,
- PROCSIG_RECOVERY_CONFLICT_TABLESPACE,
- PROCSIG_RECOVERY_CONFLICT_LOCK,
- PROCSIG_RECOVERY_CONFLICT_SNAPSHOT,
- PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT,
- PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
- PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK,
- PROCSIG_RECOVERY_CONFLICT_LAST = PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK,
+ PROCSIG_RECOVERY_CONFLICT, /* backend is blocking recovery, check
+ * PGPROC->pendingRecoveryConflicts for the
+ * reason */
} ProcSignalReason;
-#define NUM_PROCSIGNALS (PROCSIG_RECOVERY_CONFLICT_LAST + 1)
+#define NUM_PROCSIGNALS (PROCSIG_RECOVERY_CONFLICT + 1)
typedef enum
{
diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h
index 2522cae0c31..3d9070e79d4 100644
--- a/src/include/storage/s_lock.h
+++ b/src/include/storage/s_lock.h
@@ -119,6 +119,10 @@
* gcc from thinking it can cache the values of shared-memory fields
* across the asm code. Add "cc" if your asm code changes the condition
* code register, and also list any temp registers the code uses.
+ *
+ * If you need branch target labels within the asm block, include "%="
+ * in the label names to make them distinct across multiple asm blocks
+ * within a source file.
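+ * For example, a label written as "TAS%=_out" expands to a name that is
+ * unique to each asm instance, such as "TAS42_out".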
*----------
*/
@@ -147,11 +151,11 @@ tas(volatile slock_t *lock)
* leave it alone.
*/
__asm__ __volatile__(
- " cmpb $0,%1 \n"
- " jne 1f \n"
- " lock \n"
- " xchgb %0,%1 \n"
- "1: \n"
+ " cmpb $0,%1 \n"
+ " jne TAS%=_out \n"
+ " lock \n"
+ " xchgb %0,%1 \n"
+ "TAS%=_out: \n"
: "+q"(_res), "+m"(*lock)
: /* no inputs */
: "memory", "cc");
@@ -421,17 +425,17 @@ tas(volatile slock_t *lock)
__asm__ __volatile__(
" lwarx %0,0,%3,1 \n"
" cmpwi %0,0 \n"
-" bne 1f \n"
+" bne TAS%=_fail \n"
" addi %0,%0,1 \n"
" stwcx. %0,0,%3 \n"
-" beq 2f \n"
-"1: \n"
+" beq TAS%=_ok \n"
+"TAS%=_fail: \n"
" li %1,1 \n"
-" b 3f \n"
-"2: \n"
+" b TAS%=_out \n"
+"TAS%=_ok: \n"
" lwsync \n"
" li %1,0 \n"
-"3: \n"
+"TAS%=_out: \n"
: "=&b"(_t), "=r"(_res), "+m"(*lock)
: "r"(lock)
: "memory", "cc");
diff --git a/src/include/storage/shmem.h b/src/include/storage/shmem.h
index e71a51dfe84..89d45287c17 100644
--- a/src/include/storage/shmem.h
+++ b/src/include/storage/shmem.h
@@ -29,8 +29,7 @@
extern PGDLLIMPORT slock_t *ShmemLock;
typedef struct PGShmemHeader PGShmemHeader; /* avoid including
* storage/pg_shmem.h here */
-extern void InitShmemAccess(PGShmemHeader *seghdr);
-extern void InitShmemAllocation(void);
+extern void InitShmemAllocator(PGShmemHeader *seghdr);
extern void *ShmemAlloc(Size size);
extern void *ShmemAllocNoError(Size size);
extern bool ShmemAddrIsValid(const void *addr);
diff --git a/src/include/storage/standby.h b/src/include/storage/standby.h
index 7b10932635a..c63a4f2cc6a 100644
--- a/src/include/storage/standby.h
+++ b/src/include/storage/standby.h
@@ -16,7 +16,6 @@
#include "datatype/timestamp.h"
#include "storage/lock.h"
-#include "storage/procsignal.h"
#include "storage/relfilelocator.h"
#include "storage/standbydefs.h"
@@ -25,6 +24,45 @@ extern PGDLLIMPORT int max_standby_archive_delay;
extern PGDLLIMPORT int max_standby_streaming_delay;
extern PGDLLIMPORT bool log_recovery_conflict_waits;
+/* Recovery conflict reasons */
+typedef enum
+{
+ /* Backend is connected to a database that is being dropped */
+ RECOVERY_CONFLICT_DATABASE,
+
+ /* Backend is using a tablespace that is being dropped */
+ RECOVERY_CONFLICT_TABLESPACE,
+
+ /* Backend is holding a lock that is blocking recovery */
+ RECOVERY_CONFLICT_LOCK,
+
+ /* Backend is holding a snapshot that is blocking recovery */
+ RECOVERY_CONFLICT_SNAPSHOT,
+
+ /* Backend is using a logical replication slot that must be invalidated */
+ RECOVERY_CONFLICT_LOGICALSLOT,
+
+ /* Backend is holding a pin on a buffer that is blocking recovery */
+ RECOVERY_CONFLICT_BUFFERPIN,
+
+ /*
+	 * The backend is asked to check for deadlocks. The startup process does
+	 * not perform the check itself because, if a deadlock is found, we want
+	 * to kill one of the other backends rather than the startup process.
+ */
+ RECOVERY_CONFLICT_STARTUP_DEADLOCK,
+
+ /*
+	 * Like RECOVERY_CONFLICT_STARTUP_DEADLOCK, but the suspected deadlock
+	 * involves a buffer pin held by some other backend. That needs special
+	 * checking because the normal deadlock detector doesn't track buffer
+	 * pins.
+ */
+ RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK,
+} RecoveryConflictReason;
+
+#define NUM_RECOVERY_CONFLICT_REASONS (RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK + 1)
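+
+/*
+ * Each reason is reported as a bit in PGPROC->pendingRecoveryConflicts, so
+ * the number of reasons must not exceed the width of that bitmask (32).
+ */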
+
extern void InitRecoveryTransactionEnvironment(void);
extern void ShutdownRecoveryTransactionEnvironment(void);
@@ -43,7 +81,7 @@ extern void CheckRecoveryConflictDeadlock(void);
extern void StandbyDeadLockHandler(void);
extern void StandbyTimeoutHandler(void);
extern void StandbyLockTimeoutHandler(void);
-extern void LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start,
+extern void LogRecoveryConflict(RecoveryConflictReason reason, TimestampTz wait_start,
TimestampTz now, VirtualTransactionId *wait_list,
bool still_waiting);
diff --git a/src/include/tcop/tcopprot.h b/src/include/tcop/tcopprot.h
index 54ddee875ed..5bc5bcfb20d 100644
--- a/src/include/tcop/tcopprot.h
+++ b/src/include/tcop/tcopprot.h
@@ -74,7 +74,7 @@ extern void die(SIGNAL_ARGS);
pg_noreturn extern void quickdie(SIGNAL_ARGS);
extern void StatementCancelHandler(SIGNAL_ARGS);
pg_noreturn extern void FloatExceptionHandler(SIGNAL_ARGS);
-extern void HandleRecoveryConflictInterrupt(ProcSignalReason reason);
+extern void HandleRecoveryConflictInterrupt(void);
extern void ProcessClientReadInterrupt(bool blocked);
extern void ProcessClientWriteInterrupt(bool blocked);
diff --git a/src/include/tsearch/ts_locale.h b/src/include/tsearch/ts_locale.h
index cea417a91b5..6e2d67ee4a5 100644
--- a/src/include/tsearch/ts_locale.h
+++ b/src/include/tsearch/ts_locale.h
@@ -37,10 +37,34 @@ typedef struct
/* The second argument of t_iseq() must be a plain ASCII character */
#define t_iseq(x,c) (TOUCHAR(x) == (unsigned char) (c))
-#define COPYCHAR(d,s) memcpy(d, s, pg_mblen(s))
+/* Copy multibyte character of known byte length, return byte length. */
+static inline int
+ts_copychar_with_len(void *dest, const void *src, int length)
+{
+ memcpy(dest, src, length);
+ return length;
+}
+
+/* Copy multibyte character from null-terminated string, return byte length. */
+static inline int
+ts_copychar_cstr(void *dest, const void *src)
+{
+ return ts_copychar_with_len(dest, src, pg_mblen_cstr((const char *) src));
+}
+
+/* Historical macro for the above. */
+#define COPYCHAR ts_copychar_cstr
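+
+/*
+ * Example (illustrative): copy the character at "src" into "dst" and
+ * advance past it:
+ *
+ *   src += ts_copychar_cstr(dst, src);
+ */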
+
+#define GENERATE_T_ISCLASS_DECL(character_class) \
+extern int t_is##character_class##_with_len(const char *ptr, int len); \
+extern int t_is##character_class##_cstr(const char *ptr); \
+extern int t_is##character_class##_unbounded(const char *ptr); \
+\
+/* deprecated */ \
+extern int t_is##character_class(const char *ptr);
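+
+/*
+ * For example, GENERATE_T_ISCLASS_DECL(alpha) declares t_isalpha_with_len(),
+ * t_isalpha_cstr(), t_isalpha_unbounded(), and the deprecated t_isalpha().
+ */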
-extern int t_isalpha(const char *ptr);
-extern int t_isalnum(const char *ptr);
+GENERATE_T_ISCLASS_DECL(alnum);
+GENERATE_T_ISCLASS_DECL(alpha);
extern bool tsearch_readline_begin(tsearch_readline_state *stp,
const char *filename);
diff --git a/src/include/tsearch/ts_utils.h b/src/include/tsearch/ts_utils.h
index b0d1dbab6da..3eb0770f9c2 100644
--- a/src/include/tsearch/ts_utils.h
+++ b/src/include/tsearch/ts_utils.h
@@ -40,14 +40,12 @@ extern bool gettoken_tsvector(TSVectorParseState state,
extern void close_tsvector_parser(TSVectorParseState state);
/* phrase operator begins with '<' */
-#define ISOPERATOR(x) \
- ( pg_mblen(x) == 1 && ( *(x) == '!' || \
- *(x) == '&' || \
- *(x) == '|' || \
- *(x) == '(' || \
- *(x) == ')' || \
- *(x) == '<' \
- ) )
+#define ISOPERATOR(x) (*(x) == '!' || \
+ *(x) == '&' || \
+ *(x) == '|' || \
+ *(x) == '(' || \
+ *(x) == ')' || \
+ *(x) == '<')
/* parse_tsquery */
diff --git a/src/include/utils/.gitignore b/src/include/utils/.gitignore
index 30f921429c6..ff6f61cd7ee 100644
--- a/src/include/utils/.gitignore
+++ b/src/include/utils/.gitignore
@@ -4,4 +4,6 @@
/probes.h
/errcodes.h
/header-stamp
+/pgstat_wait_event.c
+/wait_event_funcs_data.c
/wait_event_types.h
diff --git a/src/include/utils/arrayaccess.h b/src/include/utils/arrayaccess.h
index abb8659de02..a325ae52574 100644
--- a/src/include/utils/arrayaccess.h
+++ b/src/include/utils/arrayaccess.h
@@ -22,8 +22,8 @@
* Functions for iterating through elements of a flat or expanded array.
* These require a state struct "array_iter iter".
*
- * Use "array_iter_setup(&iter, arrayptr);" to prepare to iterate, and
- * "datumvar = array_iter_next(&iter, &isnullvar, index, ...);" to fetch
+ * Use "array_iter_setup(&iter, arrayptr, ...);" to prepare to iterate,
+ * and "datumvar = array_iter_next(&iter, &isnullvar, index);" to fetch
* the next element into datumvar/isnullvar.
* "index" must be the zero-origin element number; we make caller provide
* this since caller is generally counting the elements anyway. Despite
@@ -42,11 +42,17 @@ typedef struct array_iter
char *dataptr; /* Current spot in the data area */
bits8 *bitmapptr; /* Current byte of the nulls bitmap, or NULL */
int bitmask; /* mask for current bit in nulls bitmap */
+
+ /* Fields used in both cases: data about array's element type */
+ int elmlen;
+ bool elmbyval;
+ uint8 elmalignby;
} array_iter;
static inline void
-array_iter_setup(array_iter *it, AnyArrayType *a)
+array_iter_setup(array_iter *it, AnyArrayType *a,
+ int elmlen, bool elmbyval, char elmalign)
{
if (VARATT_IS_EXPANDED_HEADER(a))
{
@@ -75,11 +81,13 @@ array_iter_setup(array_iter *it, AnyArrayType *a)
it->bitmapptr = ARR_NULLBITMAP((ArrayType *) a);
}
it->bitmask = 1;
+ it->elmlen = elmlen;
+ it->elmbyval = elmbyval;
+ it->elmalignby = typalign_to_alignby(elmalign);
}
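+
+/*
+ * Example (illustrative): iterate over the elements of "arr", given element
+ * type data previously looked up with get_typlenbyvalalign():
+ *
+ *   array_iter iter;
+ *
+ *   array_iter_setup(&iter, arr, elmlen, elmbyval, elmalign);
+ *   for (int i = 0; i < nitems; i++)
+ *   {
+ *       bool  isnull;
+ *       Datum d = array_iter_next(&iter, &isnull, i);
+ *
+ *       ...
+ *   }
+ */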
static inline Datum
-array_iter_next(array_iter *it, bool *isnull, int i,
- int elmlen, bool elmbyval, char elmalign)
+array_iter_next(array_iter *it, bool *isnull, int i)
{
Datum ret;
@@ -98,10 +106,11 @@ array_iter_next(array_iter *it, bool *isnull, int i,
else
{
*isnull = false;
- ret = fetch_att(it->dataptr, elmbyval, elmlen);
- it->dataptr = att_addlength_pointer(it->dataptr, elmlen,
+ ret = fetch_att(it->dataptr, it->elmbyval, it->elmlen);
+ it->dataptr = att_addlength_pointer(it->dataptr, it->elmlen,
it->dataptr);
- it->dataptr = (char *) att_align_nominal(it->dataptr, elmalign);
+ it->dataptr = (char *) att_nominal_alignby(it->dataptr,
+ it->elmalignby);
}
it->bitmask <<= 1;
if (it->bitmask == 0x100)
diff --git a/src/include/utils/backend_status.h b/src/include/utils/backend_status.h
index 781e48c0c10..ddd06304e97 100644
--- a/src/include/utils/backend_status.h
+++ b/src/include/utils/backend_status.h
@@ -331,7 +331,6 @@ extern const char *pgstat_get_crashed_backend_activity(int pid, char *buffer,
int buflen);
extern int64 pgstat_get_my_query_id(void);
extern int64 pgstat_get_my_plan_id(void);
-extern BackendType pgstat_get_backend_type_by_proc_number(ProcNumber procNumber);
/* ----------
diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h
index cf57819ebdc..5dcd788ff80 100644
--- a/src/include/utils/builtins.h
+++ b/src/include/utils/builtins.h
@@ -68,6 +68,7 @@ extern char *pg_ultostr(char *str, uint32 value);
/* oid.c */
extern oidvector *buildoidvector(const Oid *oids, int n);
+extern void check_valid_oidvector(const oidvector *oidArray);
extern Oid oidparse(Node *node);
extern int oid_cmp(const void *p1, const void *p2);
diff --git a/src/include/utils/freepage.h b/src/include/utils/freepage.h
index 8c0e0edd791..2681fd6d5ea 100644
--- a/src/include/utils/freepage.h
+++ b/src/include/utils/freepage.h
@@ -65,7 +65,7 @@ struct FreePageManager
/* Macros to convert between page numbers (expressed as Size) and pointers. */
#define fpm_page_to_pointer(base, page) \
- (AssertVariableIsOfTypeMacro(page, Size), \
+ (StaticAssertVariableIsOfTypeMacro(page, Size), \
(base) + FPM_PAGE_SIZE * (page))
#define fpm_pointer_to_page(base, ptr) \
(((Size) (((char *) (ptr)) - (base))) / FPM_PAGE_SIZE)
diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
index bf39878c43e..8acbdba7ff5 100644
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -295,7 +295,7 @@ extern PGDLLIMPORT bool log_duration;
extern PGDLLIMPORT int log_parameter_max_length;
extern PGDLLIMPORT int log_parameter_max_length_on_error;
extern PGDLLIMPORT int log_min_error_statement;
-extern PGDLLIMPORT int log_min_messages;
+extern PGDLLIMPORT int log_min_messages[];
extern PGDLLIMPORT int client_min_messages;
extern PGDLLIMPORT int log_min_duration_sample;
extern PGDLLIMPORT int log_min_duration_statement;
@@ -329,6 +329,8 @@ extern PGDLLIMPORT bool trace_sort;
extern PGDLLIMPORT bool optimize_bounded_sort;
#endif
+extern PGDLLIMPORT const char *const log_min_messages_process_types[];
+
/*
* Declarations for options for enum values
*
@@ -344,6 +346,7 @@ extern PGDLLIMPORT const struct config_enum_entry archive_mode_options[];
extern PGDLLIMPORT const struct config_enum_entry dynamic_shared_memory_options[];
extern PGDLLIMPORT const struct config_enum_entry io_method_options[];
extern PGDLLIMPORT const struct config_enum_entry recovery_target_action_options[];
+extern PGDLLIMPORT const struct config_enum_entry server_message_level_options[];
extern PGDLLIMPORT const struct config_enum_entry wal_level_options[];
extern PGDLLIMPORT const struct config_enum_entry wal_sync_method_options[];
diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h
index b6ecb0e769f..9c90670d9b8 100644
--- a/src/include/utils/guc_hooks.h
+++ b/src/include/utils/guc_hooks.h
@@ -177,5 +177,7 @@ extern void assign_wal_sync_method(int new_wal_sync_method, void *extra);
extern bool check_synchronized_standby_slots(char **newval, void **extra,
GucSource source);
extern void assign_synchronized_standby_slots(const char *newval, void *extra);
+extern bool check_log_min_messages(char **newval, void **extra, GucSource source);
+extern void assign_log_min_messages(const char *newval, void *extra);
#endif /* GUC_HOOKS_H */
diff --git a/src/include/utils/meson.build b/src/include/utils/meson.build
index 318a6aec0d0..fd3a2352df5 100644
--- a/src/include/utils/meson.build
+++ b/src/include/utils/meson.build
@@ -79,8 +79,6 @@ generated_backend_headers += fmgrtab_target[1]
# autoconf generates the file there, ensure we get a conflict
generated_sources_ac += {
- 'src/backend/utils': fmgrtab_output + ['errcodes.h', 'probes.h', 'fmgr-stamp'],
+ 'src/backend/utils': fmgrtab_output + ['errcodes.h', 'wait_event_types.h', 'probes.h', 'fmgr-stamp'],
'src/include/utils': ['header-stamp'],
}
-
-generated_sources_ac += {'src/backend/utils/activity': ['wait_event_types.h']}
diff --git a/src/include/utils/relptr.h b/src/include/utils/relptr.h
index aeb17fa24a5..94975f2f237 100644
--- a/src/include/utils/relptr.h
+++ b/src/include/utils/relptr.h
@@ -40,12 +40,12 @@
#ifdef HAVE_TYPEOF
#define relptr_access(base, rp) \
- (AssertVariableIsOfTypeMacro(base, char *), \
+ (StaticAssertVariableIsOfTypeMacro(base, char *), \
(typeof((rp).relptr_type)) ((rp).relptr_off == 0 ? NULL : \
(base) + (rp).relptr_off - 1))
#else
#define relptr_access(base, rp) \
- (AssertVariableIsOfTypeMacro(base, char *), \
+ (StaticAssertVariableIsOfTypeMacro(base, char *), \
(void *) ((rp).relptr_off == 0 ? NULL : (base) + (rp).relptr_off - 1))
#endif
@@ -70,12 +70,12 @@ relptr_store_eval(char *base, char *val)
#ifdef HAVE_TYPEOF
#define relptr_store(base, rp, val) \
- (AssertVariableIsOfTypeMacro(base, char *), \
- AssertVariableIsOfTypeMacro(val, typeof((rp).relptr_type)), \
+ (StaticAssertVariableIsOfTypeMacro(base, char *), \
+ StaticAssertVariableIsOfTypeMacro(val, typeof((rp).relptr_type)), \
(rp).relptr_off = relptr_store_eval((base), (char *) (val)))
#else
#define relptr_store(base, rp, val) \
- (AssertVariableIsOfTypeMacro(base, char *), \
+ (StaticAssertVariableIsOfTypeMacro(base, char *), \
(rp).relptr_off = relptr_store_eval((base), (char *) (val)))
#endif
diff --git a/src/interfaces/libpq/fe-protocol3.c b/src/interfaces/libpq/fe-protocol3.c
index 103428033ef..90bbb2eba1f 100644
--- a/src/interfaces/libpq/fe-protocol3.c
+++ b/src/interfaces/libpq/fe-protocol3.c
@@ -1451,7 +1451,19 @@ pqGetNegotiateProtocolVersion3(PGconn *conn)
if (pqGetInt(&num, 4, conn) != 0)
goto eof;
- /* Check the protocol version */
+ /*
+ * Check the protocol version.
+ *
+ * PG_PROTOCOL_GREASE is intentionally unsupported and reserved. It's
+ * higher than any real version, so check for that first, to get the most
+ * specific error message. Then check the upper and lower bounds.
+ */
+ if (their_version == PG_PROTOCOL_GREASE)
+ {
+ libpq_append_conn_error(conn, "received invalid protocol negotiation message: server requested \"grease\" protocol version 3.9999");
+ goto failure;
+ }
+
if (their_version > conn->pversion)
{
libpq_append_conn_error(conn, "received invalid protocol negotiation message: server requested downgrade to a higher-numbered version");
diff --git a/src/pl/plpython/plpy_typeio.c b/src/pl/plpython/plpy_typeio.c
index 1f69109b081..44055de6aeb 100644
--- a/src/pl/plpython/plpy_typeio.c
+++ b/src/pl/plpython/plpy_typeio.c
@@ -735,6 +735,7 @@ PLyList_FromArray_recurse(PLyDatumToOb *elm, int *dims, int ndim, int dim,
char *dataptr = *dataptr_p;
bits8 *bitmap = *bitmap_p;
int bitmask = *bitmask_p;
+ uint8 typalignby = typalign_to_alignby(elm->typalign);
for (i = 0; i < dims[dim]; i++)
{
@@ -751,7 +752,7 @@ PLyList_FromArray_recurse(PLyDatumToOb *elm, int *dims, int ndim, int dim,
itemvalue = fetch_att(dataptr, elm->typbyval, elm->typlen);
PyList_SetItem(list, i, elm->func(elm, itemvalue));
dataptr = att_addlength_pointer(dataptr, elm->typlen, dataptr);
- dataptr = (char *) att_align_nominal(dataptr, elm->typalign);
+ dataptr = (char *) att_nominal_alignby(dataptr, typalignby);
}
/* advance bitmap pointer if any */
diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile
index 44c7163c1cd..e8c31ec8e74 100644
--- a/src/test/modules/Makefile
+++ b/src/test/modules/Makefile
@@ -38,6 +38,7 @@ SUBDIRS = \
test_oat_hooks \
test_parser \
test_pg_dump \
+ test_plan_advice \
test_predtest \
test_radixtree \
test_rbtree \
diff --git a/src/test/modules/meson.build b/src/test/modules/meson.build
index 2634a519935..6998a226fa7 100644
--- a/src/test/modules/meson.build
+++ b/src/test/modules/meson.build
@@ -39,6 +39,7 @@ subdir('test_misc')
subdir('test_oat_hooks')
subdir('test_parser')
subdir('test_pg_dump')
+subdir('test_plan_advice')
subdir('test_predtest')
subdir('test_radixtree')
subdir('test_rbtree')
diff --git a/src/test/modules/test_plan_advice/Makefile b/src/test/modules/test_plan_advice/Makefile
new file mode 100644
index 00000000000..be026ce34bf
--- /dev/null
+++ b/src/test/modules/test_plan_advice/Makefile
@@ -0,0 +1,28 @@
+# src/test/modules/test_plan_advice/Makefile
+
+PGFILEDESC = "test_plan_advice - test whether generated plan advice works"
+
+MODULE_big = test_plan_advice
+OBJS = \
+ $(WIN32RES) \
+ test_plan_advice.o
+
+EXTRA_INSTALL = contrib/pg_plan_advice
+
+TAP_TESTS = 1
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/test_plan_advice
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
+
+override CPPFLAGS += -I$(top_srcdir)/contrib/pg_plan_advice
+
+REGRESS_SHLIB=$(abs_top_builddir)/src/test/regress/regress$(DLSUFFIX)
+export REGRESS_SHLIB
diff --git a/src/test/modules/test_plan_advice/meson.build b/src/test/modules/test_plan_advice/meson.build
new file mode 100644
index 00000000000..afde420baed
--- /dev/null
+++ b/src/test/modules/test_plan_advice/meson.build
@@ -0,0 +1,29 @@
+# Copyright (c) 2022-2026, PostgreSQL Global Development Group
+
+test_plan_advice_sources = files(
+ 'test_plan_advice.c',
+)
+
+if host_system == 'windows'
+ test_plan_advice_sources += rc_lib_gen.process(win32ver_rc, extra_args: [
+ '--NAME', 'test_plan_advice',
+ '--FILEDESC', 'test_plan_advice - test whether generated plan advice works',])
+endif
+
+test_plan_advice = shared_module('test_plan_advice',
+ test_plan_advice_sources,
+ include_directories: pg_plan_advice_inc,
+ kwargs: pg_test_mod_args,
+)
+test_install_libs += test_plan_advice
+
+tests += {
+ 'name': 'test_plan_advice',
+ 'sd': meson.current_source_dir(),
+ 'bd': meson.current_build_dir(),
+ 'tap': {
+ 'tests': [
+ 't/001_replan_regress.pl',
+ ],
+ },
+}
diff --git a/src/test/modules/test_plan_advice/t/001_replan_regress.pl b/src/test/modules/test_plan_advice/t/001_replan_regress.pl
new file mode 100644
index 00000000000..13b1a225700
--- /dev/null
+++ b/src/test/modules/test_plan_advice/t/001_replan_regress.pl
@@ -0,0 +1,64 @@
+# Copyright (c) 2021-2025, PostgreSQL Global Development Group
+
+# Run the core regression tests under pg_plan_advice to check for problems.
+use strict;
+use warnings FATAL => 'all';
+
+use Cwd qw(abs_path);
+use File::Basename qw(dirname);
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# Initialize the primary node
+my $node = PostgreSQL::Test::Cluster->new('main');
+$node->init();
+
+# Set up our desired configuration.
+$node->append_conf('postgresql.conf', <<EOF);
+shared_preload_libraries = 'pg_plan_advice, test_plan_advice'
+EOF
+
+$node->start;
+
+my $srcdir = abs_path("../../../..");
+
+# --dlpath is needed to be able to find the location of regress.so
+# and any libraries the regression tests require.
+my $dlpath = dirname($ENV{REGRESS_SHLIB});
+
+# --outputdir points to the directory where the output files are placed.
+my $outputdir = $PostgreSQL::Test::Utils::tmp_check;
+
+# --inputdir points to the path of the input files.
+my $inputdir = "$srcdir/src/test/regress";
+
+# Run the tests.
+my $rc =
+ system($ENV{PG_REGRESS} . " "
+ . "--bindir= "
+ . "--dlpath=\"$dlpath\" "
+ . "--host=" . $node->host . " "
+ . "--port=" . $node->port . " "
+ . "--schedule=$srcdir/src/test/regress/parallel_schedule "
+ . "--max-concurrent-tests=20 "
+ . "--inputdir=\"$inputdir\" "
+ . "--outputdir=\"$outputdir\"");
+
+# Dump out the regression diffs file, if there is one
+if ($rc != 0)
+{
+ my $diffs = "$outputdir/regression.diffs";
+ if (-e $diffs)
+ {
+ print "=== dumping $diffs ===\n";
+ print slurp_file($diffs);
+ print "=== EOF ===\n";
+ }
+}
+
+# Report results
+is($rc, 0, 'regression tests pass');
+
+done_testing();
diff --git a/src/test/modules/test_plan_advice/test_plan_advice.c b/src/test/modules/test_plan_advice/test_plan_advice.c
new file mode 100644
index 00000000000..996675dc386
--- /dev/null
+++ b/src/test/modules/test_plan_advice/test_plan_advice.c
@@ -0,0 +1,143 @@
+/*-------------------------------------------------------------------------
+ *
+ * test_plan_advice.c
+ * Test pg_plan_advice by planning every query with generated advice.
+ *
+ * With this module loaded, every query ends up being planned twice. The
+ * first time, we generate plan advice, which we then feed back to
+ * pg_plan_advice as the supplied advice; the query is planned a second
+ * time using that advice. This hopefully lets us detect cases where the
+ * generated advice is incorrect, fails to apply, or unexpectedly changes
+ * the plan.
+ *
+ * Copyright (c) 2016-2024, PostgreSQL Global Development Group
+ *
+ * src/test/modules/test_plan_advice/test_plan_advice.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/xact.h"
+#include "fmgr.h"
+#include "optimizer/optimizer.h"
+#include "pg_plan_advice.h"
+#include "utils/guc.h"
+
+PG_MODULE_MAGIC;
+
+static bool in_recursion = false;
+
+static char *test_plan_advice_advisor(PlannerGlobal *glob,
+ Query *parse,
+ const char *query_string,
+ int cursorOptions,
+ ExplainState *es);
+static DefElem *find_defelem_by_defname(List *deflist, char *defname);
+
+/*
+ * Initialize this module.
+ */
+void
+_PG_init(void)
+{
+ void *(*add_advisor_fn) (pg_plan_advice_advisor_hook hook);
+
+ /*
+ * Ask pg_plan_advice to get advice strings from test_plan_advice_advisor
+ */
+ add_advisor_fn =
+ load_external_function("pg_plan_advice", "pg_plan_advice_add_advisor",
+ true, NULL);
+
+ (*add_advisor_fn) (test_plan_advice_advisor);
+}
+
+/*
+ * Re-plan the given query and return the generated advice string as the
+ * supplied advice.
+ */
+static char *
+test_plan_advice_advisor(PlannerGlobal *glob, Query *parse,
+ const char *query_string, int cursorOptions,
+ ExplainState *es)
+{
+ PlannedStmt *pstmt;
+ int save_nestlevel = 0;
+ DefElem *pgpa_item;
+ DefElem *advice_string_item;
+
+ /*
+ * Since this function is called from the planner and triggers planning,
+ * we need a recursion guard.
+ */
+ if (in_recursion)
+ return NULL;
+
+ PG_TRY();
+ {
+ in_recursion = true;
+
+ /*
+ * Planning can trigger expression evaluation, which can result in
+ * sending NOTICE messages or other output to the client. To avoid
+ * that, we set client_min_messages = ERROR in the hopes of getting
+ * the same output with and without this module.
+ *
+ * We also need to set pg_plan_advice.always_store_advice_details so
+ * that pg_plan_advice will generate an advice string, since the whole
+ * point of this function is to get access to that.
+ */
+ save_nestlevel = NewGUCNestLevel();
+ set_config_option("client_min_messages", "error",
+ PGC_SUSET, PGC_S_SESSION,
+ GUC_ACTION_SAVE, true, 0, false);
+ set_config_option("pg_plan_advice.always_store_advice_details", "true",
+ PGC_SUSET, PGC_S_SESSION,
+ GUC_ACTION_SAVE, true, 0, false);
+
+ /*
+ * Replan. We must copy the Query, because the planner modifies it.
+ * (As noted elsewhere, that's unfortunate; perhaps it will be fixed
+ * some day.)
+ */
+ pstmt = planner(copyObject(parse), query_string, cursorOptions,
+ glob->boundParams, es);
+ }
+ PG_FINALLY();
+ {
+ in_recursion = false;
+ }
+ PG_END_TRY();
+
+ /* Roll back any GUC changes */
+ if (save_nestlevel > 0)
+ AtEOXact_GUC(false, save_nestlevel);
+
+ /* Extract and return the advice string */
+ pgpa_item = find_defelem_by_defname(pstmt->extension_state,
+ "pg_plan_advice");
+ if (pgpa_item == NULL)
+ elog(ERROR, "extension state for pg_plan_advice not found");
+ advice_string_item = find_defelem_by_defname((List *) pgpa_item->arg,
+ "advice_string");
+ if (advice_string_item == NULL)
+ elog(ERROR,
+ "advice string for pg_plan_advice not found in extension state");
+ return strVal(advice_string_item->arg);
+}
+
+/*
+ * Search a list of DefElem objects for a given defname.
+ */
+static DefElem *
+find_defelem_by_defname(List *deflist, char *defname)
+{
+ foreach_node(DefElem, item, deflist)
+ {
+ if (strcmp(item->defname, defname) == 0)
+ return item;
+ }
+
+ return NULL;
+}
diff --git a/src/test/modules/test_regex/test_regex.c b/src/test/modules/test_regex/test_regex.c
index 070464a341e..4e97cde65a6 100644
--- a/src/test/modules/test_regex/test_regex.c
+++ b/src/test/modules/test_regex/test_regex.c
@@ -411,7 +411,8 @@ parse_test_flags(test_re_flags *flags, text *opts)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("invalid regular expression test option: \"%.*s\"",
- pg_mblen(opt_p + i), opt_p + i)));
+ pg_mblen_range(opt_p + i, opt_p + opt_len),
+ opt_p + i)));
break;
}
}
diff --git a/src/test/modules/test_shm_mq/setup.c b/src/test/modules/test_shm_mq/setup.c
index ba2fd746d73..579e5933d28 100644
--- a/src/test/modules/test_shm_mq/setup.c
+++ b/src/test/modules/test_shm_mq/setup.c
@@ -228,6 +228,7 @@ setup_background_workers(int nworkers, dsm_segment *seg)
/* Register the workers. */
for (i = 0; i < nworkers; ++i)
{
+ snprintf(worker.bgw_name, BGW_MAXLEN, "test_shm_mq worker %d", i + 1);
if (!RegisterDynamicBackgroundWorker(&worker, &wstate->handle[i]))
ereport(ERROR,
(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
diff --git a/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm b/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm
index 5bd41a278dd..c6ff2dbde4c 100644
--- a/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm
+++ b/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm
@@ -155,11 +155,11 @@ sub wait_connect
#
# See query() for details about why/how the banner is used.
my $banner = "background_psql: ready";
- my $banner_match = qr/(^|\n)$banner\r?\n/;
- $self->{stdin} .= "\\echo $banner\n\\warn $banner\n";
+ my $banner_match = qr/$banner\r?\n/;
+ $self->{stdin} .= "\\echo '$banner'\n\\warn '$banner'\n";
$self->{run}->pump()
until ($self->{stdout} =~ /$banner_match/
- && $self->{stderr} =~ /$banner\r?\n/)
+ && $self->{stderr} =~ /$banner_match/)
|| $self->{timeout}->is_expired;
note "connect output:\n",
@@ -264,22 +264,17 @@ sub query
# stderr (or vice versa), even if psql printed them in the opposite
# order. We therefore wait on both.
#
- # We need to match for the newline, because we try to remove it below, and
- # it's possible to consume just the input *without* the newline. In
- # interactive psql we emit \r\n, so we need to allow for that. Also need
- # to be careful that we don't e.g. match the echoed \echo command, rather
- # than its output.
+ # In interactive psql we emit \r\n, so we need to allow for that.
+ # Also, include quotes around the banner string in the \echo and \warn
+ # commands, not because the string needs quoting but so that $banner_match
+ # can't match readline's echoing of these commands.
my $banner = "background_psql: QUERY_SEPARATOR $query_cnt:";
- my $banner_match = qr/(^|\n)$banner\r?\n/;
- $self->{stdin} .= "$query\n;\n\\echo $banner\n\\warn $banner\n";
- pump_until(
- $self->{run}, $self->{timeout},
- \$self->{stdout}, qr/$banner_match/);
- pump_until(
- $self->{run}, $self->{timeout},
- \$self->{stderr}, qr/$banner_match/);
-
- die "psql query timed out" if $self->{timeout}->is_expired;
+ my $banner_match = qr/$banner\r?\n/;
+ $self->{stdin} .= "$query\n;\n\\echo '$banner'\n\\warn '$banner'\n";
+ $self->{run}->pump()
+ until ($self->{stdout} =~ /$banner_match/
+ && $self->{stderr} =~ /$banner_match/)
+ || $self->{timeout}->is_expired;
note "results query $query_cnt:\n",
explain {
@@ -287,9 +282,12 @@ sub query
stderr => $self->{stderr},
} unless !$params{verbose};
- # Remove banner from stdout and stderr, our caller doesn't care. The
- # first newline is optional, as there would not be one if consuming an
- # empty query result.
+ die "psql query timed out" if $self->{timeout}->is_expired;
+
+ # Remove banner from stdout and stderr, our caller doesn't want it.
+ # Also remove the query output's trailing newline, if present (there
+ # would not be one if consuming an empty query result).
+ $banner_match = qr/\r?\n?$banner\r?\n/;
$output = $self->{stdout};
$output =~ s/$banner_match//;
$self->{stderr} =~ s/$banner_match//;
diff --git a/src/test/regress/expected/arrays.out b/src/test/regress/expected/arrays.out
index e1ab6dc278a..66439d427a3 100644
--- a/src/test/regress/expected/arrays.out
+++ b/src/test/regress/expected/arrays.out
@@ -1737,6 +1737,11 @@ select '[-2147483648:-2147483647]={1,2}'::int[];
(1 row)
-- all of the above should be accepted
+-- some day we might allow these cases, but for now they're errors:
+select array[]::oidvector;
+ERROR: array is not a valid oidvector
+select array[]::int2vector;
+ERROR: array is not a valid int2vector
-- tests for array aggregates
CREATE TEMP TABLE arraggtest ( f1 INT[], f2 TEXT[][], f3 FLOAT[]);
INSERT INTO arraggtest (f1, f2, f3) VALUES
diff --git a/src/test/regress/expected/constraints.out b/src/test/regress/expected/constraints.out
index 1bbf59cca02..ebc892a2a42 100644
--- a/src/test/regress/expected/constraints.out
+++ b/src/test/regress/expected/constraints.out
@@ -846,8 +846,12 @@ CREATE TABLE notnull_tbl1 (a INTEGER NOT NULL NOT NULL);
Not-null constraints:
"notnull_tbl1_a_not_null" NOT NULL "a"
--- no-op
+-- specifying an existing constraint is a no-op
+ALTER TABLE notnull_tbl1 ADD CONSTRAINT notnull_tbl1_a_not_null NOT NULL a;
+-- but using a different constraint name is not allowed
ALTER TABLE notnull_tbl1 ADD CONSTRAINT nn NOT NULL a;
+ERROR: cannot create not-null constraint "nn" on column "a" of table "notnull_tbl1"
+DETAIL: A not-null constraint named "notnull_tbl1_a_not_null" already exists for this column.
\d+ notnull_tbl1
Table "public.notnull_tbl1"
Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
diff --git a/src/test/regress/expected/copyencoding.out b/src/test/regress/expected/copyencoding.out
index cfa2ed6df00..76ea0e7cf04 100644
--- a/src/test/regress/expected/copyencoding.out
+++ b/src/test/regress/expected/copyencoding.out
@@ -17,6 +17,13 @@ CREATE TABLE copy_encoding_tab (t text);
COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv, ENCODING 'UTF8');
-- Read UTF8 data as LATIN1: no error
COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv, ENCODING 'LATIN1');
+-- Non-server encodings have distinct code paths.
+\set fname :abs_builddir '/results/copyencoding_gb18030.csv'
+COPY (SELECT E'\u3042,') TO :'fname' WITH (FORMAT csv, ENCODING 'GB18030');
+COPY copy_encoding_tab FROM :'fname' WITH (FORMAT csv, ENCODING 'GB18030');
+\set fname :abs_builddir '/results/copyencoding_gb18030.data'
+COPY (SELECT E'\u3042,') TO :'fname' WITH (FORMAT text, ENCODING 'GB18030');
+COPY copy_encoding_tab FROM :'fname' WITH (FORMAT text, ENCODING 'GB18030');
-- Use client_encoding
SET client_encoding TO UTF8;
-- U+3042 HIRAGANA LETTER A
diff --git a/src/test/regress/expected/encoding.out b/src/test/regress/expected/encoding.out
new file mode 100644
index 00000000000..ea1f38cff41
--- /dev/null
+++ b/src/test/regress/expected/encoding.out
@@ -0,0 +1,401 @@
+/* skip test if not UTF8 server encoding */
+SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+\getenv libdir PG_LIBDIR
+\getenv dlsuffix PG_DLSUFFIX
+\set regresslib :libdir '/regress' :dlsuffix
+CREATE FUNCTION test_bytea_to_text(bytea) RETURNS text
+ AS :'regresslib' LANGUAGE C STRICT;
+CREATE FUNCTION test_text_to_bytea(text) RETURNS bytea
+ AS :'regresslib' LANGUAGE C STRICT;
+CREATE FUNCTION test_mblen_func(text, text, text, int) RETURNS int
+ AS :'regresslib' LANGUAGE C STRICT;
+CREATE FUNCTION test_text_to_wchars(text, text) RETURNS int[]
+ AS :'regresslib' LANGUAGE C STRICT;
+CREATE FUNCTION test_wchars_to_text(text, int[]) RETURNS text
+ AS :'regresslib' LANGUAGE C STRICT;
+CREATE FUNCTION test_valid_server_encoding(text) RETURNS boolean
+ AS :'regresslib' LANGUAGE C STRICT;
+CREATE TABLE regress_encoding(good text, truncated text, with_nul text, truncated_with_nul text);
+INSERT INTO regress_encoding
+VALUES ('café',
+ 'caf' || test_bytea_to_text('\xc3'),
+ 'café' || test_bytea_to_text('\x00') || 'dcba',
+ 'caf' || test_bytea_to_text('\xc300') || 'dcba');
+SELECT good, truncated, with_nul FROM regress_encoding;
+ good | truncated | with_nul
+------+-----------+----------
+ café | caf | café
+(1 row)
+
+SELECT length(good) FROM regress_encoding;
+ length
+--------
+ 4
+(1 row)
+
+SELECT substring(good, 3, 1) FROM regress_encoding;
+ substring
+-----------
+ f
+(1 row)
+
+SELECT substring(good, 4, 1) FROM regress_encoding;
+ substring
+-----------
+ é
+(1 row)
+
+SELECT regexp_replace(good, '^caf(.)$', '\1') FROM regress_encoding;
+ regexp_replace
+----------------
+ é
+(1 row)
+
+SELECT reverse(good) FROM regress_encoding;
+ reverse
+---------
+ éfac
+(1 row)
+
+-- invalid short mb character = error
+SELECT length(truncated) FROM regress_encoding;
+ERROR: invalid byte sequence for encoding "UTF8": 0xc3
+SELECT substring(truncated, 1, 1) FROM regress_encoding;
+ERROR: invalid byte sequence for encoding "UTF8": 0xc3
+SELECT reverse(truncated) FROM regress_encoding;
+ERROR: invalid byte sequence for encoding "UTF8": 0xc3
+-- invalid short mb character = silently dropped
+SELECT regexp_replace(truncated, '^caf(.)$', '\1') FROM regress_encoding;
+ regexp_replace
+----------------
+ caf
+(1 row)
+
+-- PostgreSQL doesn't allow strings to contain NUL. If a corrupted string
+-- contains NUL at a character boundary position, some functions treat it as
+-- a character while others treat it as a terminator; which behavior applies
+-- is an implementation detail.
+-- NUL = terminator
+SELECT length(with_nul) FROM regress_encoding;
+ length
+--------
+ 4
+(1 row)
+
+SELECT substring(with_nul, 3, 1) FROM regress_encoding;
+ substring
+-----------
+ f
+(1 row)
+
+SELECT substring(with_nul, 4, 1) FROM regress_encoding;
+ substring
+-----------
+ é
+(1 row)
+
+SELECT substring(with_nul, 5, 1) FROM regress_encoding;
+ substring
+-----------
+
+(1 row)
+
+SELECT convert_to(substring(with_nul, 5, 1), 'UTF8') FROM regress_encoding;
+ convert_to
+------------
+ \x
+(1 row)
+
+SELECT regexp_replace(with_nul, '^caf(.)$', '\1') FROM regress_encoding;
+ regexp_replace
+----------------
+ é
+(1 row)
+
+-- NUL = character
+SELECT with_nul, reverse(with_nul), reverse(reverse(with_nul)) FROM regress_encoding;
+ with_nul | reverse | reverse
+----------+---------+---------
+ café | abcd | café
+(1 row)
+
+-- If a corrupted string contains NUL in the tail bytes of a multibyte
+-- character (invalid in all encodings), it is considered part of the
+-- character for length purposes. An error will only be raised in code paths
+-- that convert or verify encodings.
+SELECT length(truncated_with_nul) FROM regress_encoding;
+ length
+--------
+ 8
+(1 row)
+
+SELECT substring(truncated_with_nul, 3, 1) FROM regress_encoding;
+ substring
+-----------
+ f
+(1 row)
+
+SELECT substring(truncated_with_nul, 4, 1) FROM regress_encoding;
+ substring
+-----------
+
+(1 row)
+
+SELECT convert_to(substring(truncated_with_nul, 4, 1), 'UTF8') FROM regress_encoding;
+ERROR: invalid byte sequence for encoding "UTF8": 0xc3 0x00
+SELECT substring(truncated_with_nul, 5, 1) FROM regress_encoding;
+ substring
+-----------
+ d
+(1 row)
+
+SELECT regexp_replace(truncated_with_nul, '^caf(.)dcba$', '\1') = test_bytea_to_text('\xc300') FROM regress_encoding;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT reverse(truncated_with_nul) FROM regress_encoding;
+ reverse
+---------
+ abcd
+(1 row)
+
+-- unbounded: sequence would overrun the string!
+SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated, 3)
+FROM regress_encoding;
+ test_mblen_func
+-----------------
+ 2
+(1 row)
+
+-- condition detected when using the length/range variants
+SELECT test_mblen_func('pg_mblen_with_len', 'UTF8', truncated, 3)
+FROM regress_encoding;
+ERROR: invalid byte sequence for encoding "UTF8": 0xc3
+SELECT test_mblen_func('pg_mblen_range', 'UTF8', truncated, 3)
+FROM regress_encoding;
+ERROR: invalid byte sequence for encoding "UTF8": 0xc3
+-- unbounded: sequence would overrun the string, if the terminator were really
+-- the end of it
+SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated_with_nul, 3)
+FROM regress_encoding;
+ test_mblen_func
+-----------------
+ 2
+(1 row)
+
+SELECT test_mblen_func('pg_encoding_mblen', 'GB18030', truncated_with_nul, 3)
+FROM regress_encoding;
+ test_mblen_func
+-----------------
+ 2
+(1 row)
+
+-- condition detected when using the cstr variants
+SELECT test_mblen_func('pg_mblen_cstr', 'UTF8', truncated_with_nul, 3)
+FROM regress_encoding;
+ERROR: invalid byte sequence for encoding "UTF8": 0xc3
+DROP TABLE regress_encoding;
+-- mb<->wchar conversions
+CREATE FUNCTION test_encoding(encoding text, description text, input bytea)
+RETURNS VOID LANGUAGE plpgsql AS
+$$
+DECLARE
+ prefix text;
+ len int;
+ wchars int[];
+ round_trip bytea;
+ result text;
+BEGIN
+ prefix := rpad(encoding || ' ' || description || ':', 28);
+
+	-- XXX could also test validation and length functions, and include
+	-- client-only encodings in these test cases
+
+ IF test_valid_server_encoding(encoding) THEN
+ wchars := test_text_to_wchars(encoding, test_bytea_to_text(input));
+ round_trip = test_text_to_bytea(test_wchars_to_text(encoding, wchars));
+ if input = round_trip then
+ result := 'OK';
+ elsif length(input) > length(round_trip) and round_trip = substr(input, 1, length(round_trip)) then
+ result := 'truncated';
+ else
+ result := 'failed';
+ end if;
+ RAISE NOTICE '% % -> % -> % = %', prefix, input, wchars, round_trip, result;
+ END IF;
+END;
+$$;
+-- No validation is done on the encoding itself, just on the length (to
+-- avoid overruns), so some of the byte sequences below are bogus. They
+-- cover all code branches; only server encodings are included for now.
+CREATE TABLE encoding_tests (encoding text, description text, input bytea);
+INSERT INTO encoding_tests VALUES
+ -- LATIN1, other single-byte encodings
+ ('LATIN1', 'ASCII', 'a'),
+ ('LATIN1', 'extended', '\xe9'),
+ -- EUC_JP, EUC_JIS_2004, EUC_KR (for the purposes of wchar conversion):
+ -- 2 8e (CS2, not used by EUC_KR but arbitrarily considered to have EUC_JP length)
+ -- 3 8f (CS3, not used by EUC_KR but arbitrarily considered to have EUC_JP length)
+ -- 2 80..ff (CS1)
+ ('EUC_JP', 'ASCII', 'a'),
+ ('EUC_JP', 'CS1, short', '\x80'),
+ ('EUC_JP', 'CS1', '\x8002'),
+ ('EUC_JP', 'CS2, short', '\x8e'),
+ ('EUC_JP', 'CS2', '\x8e02'),
+ ('EUC_JP', 'CS3, short', '\x8f'),
+ ('EUC_JP', 'CS3, short', '\x8f02'),
+ ('EUC_JP', 'CS3', '\x8f0203'),
+ -- EUC_CN
+ -- 3 8e (CS2, not used but arbitrarily considered to have length 3)
+ -- 3 8f (CS3, not used but arbitrarily considered to have length 3)
+ -- 2 80..ff (CS1)
+ ('EUC_CN', 'ASCII', 'a'),
+ ('EUC_CN', 'CS1, short', '\x80'),
+ ('EUC_CN', 'CS1', '\x8002'),
+ ('EUC_CN', 'CS2, short', '\x8e'),
+ ('EUC_CN', 'CS2, short', '\x8e02'),
+ ('EUC_CN', 'CS2', '\x8e0203'),
+ ('EUC_CN', 'CS3, short', '\x8f'),
+ ('EUC_CN', 'CS3, short', '\x8f02'),
+ ('EUC_CN', 'CS3', '\x8f0203'),
+ -- EUC_TW:
+ -- 4 8e (CS2)
+ -- 3 8f (CS3, not used but arbitrarily considered to have length 3)
+ -- 2 80..ff (CS1)
+ ('EUC_TW', 'ASCII', 'a'),
+ ('EUC_TW', 'CS1, short', '\x80'),
+ ('EUC_TW', 'CS1', '\x8002'),
+ ('EUC_TW', 'CS2, short', '\x8e'),
+ ('EUC_TW', 'CS2, short', '\x8e02'),
+ ('EUC_TW', 'CS2, short', '\x8e0203'),
+ ('EUC_TW', 'CS2', '\x8e020304'),
+ ('EUC_TW', 'CS3, short', '\x8f'),
+ ('EUC_TW', 'CS3, short', '\x8f02'),
+ ('EUC_TW', 'CS3', '\x8f0203'),
+ -- UTF8
+ -- 2 c0..df
+ -- 3 e0..ef
+ -- 4 f0..f7 (but maximum real codepoint U+10ffff has f4)
+ -- 5 f8..fb (not supported)
+ -- 6 fc..fd (not supported)
+ ('UTF8', 'ASCII', 'a'),
+ ('UTF8', '2 byte, short', '\xdf'),
+ ('UTF8', '2 byte', '\xdf82'),
+ ('UTF8', '3 byte, short', '\xef'),
+ ('UTF8', '3 byte, short', '\xef82'),
+ ('UTF8', '3 byte', '\xef8283'),
+ ('UTF8', '4 byte, short', '\xf7'),
+ ('UTF8', '4 byte, short', '\xf782'),
+ ('UTF8', '4 byte, short', '\xf78283'),
+ ('UTF8', '4 byte', '\xf7828384'),
+ ('UTF8', '5 byte, unsupported', '\xfb'),
+ ('UTF8', '5 byte, unsupported', '\xfb82'),
+ ('UTF8', '5 byte, unsupported', '\xfb8283'),
+ ('UTF8', '5 byte, unsupported', '\xfb828384'),
+ ('UTF8', '5 byte, unsupported', '\xfb82838485'),
+ ('UTF8', '6 byte, unsupported', '\xfd'),
+ ('UTF8', '6 byte, unsupported', '\xfd82'),
+ ('UTF8', '6 byte, unsupported', '\xfd8283'),
+ ('UTF8', '6 byte, unsupported', '\xfd828384'),
+ ('UTF8', '6 byte, unsupported', '\xfd82838485'),
+ ('UTF8', '6 byte, unsupported', '\xfd8283848586'),
+ -- MULE_INTERNAL
+ -- 2 81..8d LC1
+ -- 3 90..99 LC2
+ ('MULE_INTERNAL', 'ASCII', 'a'),
+ ('MULE_INTERNAL', 'LC1, short', '\x81'),
+ ('MULE_INTERNAL', 'LC1', '\x8182'),
+ ('MULE_INTERNAL', 'LC2, short', '\x90'),
+ ('MULE_INTERNAL', 'LC2, short', '\x9082'),
+ ('MULE_INTERNAL', 'LC2', '\x908283');
+SELECT COUNT(test_encoding(encoding, description, input)) > 0
+FROM encoding_tests;
+NOTICE: LATIN1 ASCII: \x61 -> {97} -> \x61 = OK
+NOTICE: LATIN1 extended: \xe9 -> {233} -> \xe9 = OK
+NOTICE: EUC_JP ASCII: \x61 -> {97} -> \x61 = OK
+NOTICE: EUC_JP CS1, short: \x80 -> {} -> \x = truncated
+NOTICE: EUC_JP CS1: \x8002 -> {32770} -> \x8002 = OK
+NOTICE: EUC_JP CS2, short: \x8e -> {} -> \x = truncated
+NOTICE: EUC_JP CS2: \x8e02 -> {36354} -> \x8e02 = OK
+NOTICE: EUC_JP CS3, short: \x8f -> {} -> \x = truncated
+NOTICE: EUC_JP CS3, short: \x8f02 -> {} -> \x = truncated
+NOTICE: EUC_JP CS3: \x8f0203 -> {9372163} -> \x8f0203 = OK
+NOTICE: EUC_CN ASCII: \x61 -> {97} -> \x61 = OK
+NOTICE: EUC_CN CS1, short: \x80 -> {} -> \x = truncated
+NOTICE: EUC_CN CS1: \x8002 -> {32770} -> \x8002 = OK
+NOTICE: EUC_CN CS2, short: \x8e -> {} -> \x = truncated
+NOTICE: EUC_CN CS2, short: \x8e02 -> {} -> \x = truncated
+NOTICE: EUC_CN CS2: \x8e0203 -> {9306627} -> \x8e0203 = OK
+NOTICE: EUC_CN CS3, short: \x8f -> {} -> \x = truncated
+NOTICE: EUC_CN CS3, short: \x8f02 -> {} -> \x = truncated
+NOTICE: EUC_CN CS3: \x8f0203 -> {9372163} -> \x8f0203 = OK
+NOTICE: EUC_TW ASCII: \x61 -> {97} -> \x61 = OK
+NOTICE: EUC_TW CS1, short: \x80 -> {} -> \x = truncated
+NOTICE: EUC_TW CS1: \x8002 -> {32770} -> \x8002 = OK
+NOTICE: EUC_TW CS2, short: \x8e -> {} -> \x = truncated
+NOTICE: EUC_TW CS2, short: \x8e02 -> {} -> \x = truncated
+NOTICE: EUC_TW CS2, short: \x8e0203 -> {} -> \x = truncated
+NOTICE: EUC_TW CS2: \x8e020304 -> {-1912470780} -> \x8e020304 = OK
+NOTICE: EUC_TW CS3, short: \x8f -> {} -> \x = truncated
+NOTICE: EUC_TW CS3, short: \x8f02 -> {} -> \x = truncated
+NOTICE: EUC_TW CS3: \x8f0203 -> {9372163} -> \x8f0203 = OK
+NOTICE: UTF8 ASCII: \x61 -> {97} -> \x61 = OK
+NOTICE: UTF8 2 byte, short: \xdf -> {} -> \x = truncated
+NOTICE: UTF8 2 byte: \xdf82 -> {1986} -> \xdf82 = OK
+NOTICE: UTF8 3 byte, short: \xef -> {} -> \x = truncated
+NOTICE: UTF8 3 byte, short: \xef82 -> {} -> \x = truncated
+NOTICE: UTF8 3 byte: \xef8283 -> {61571} -> \xef8283 = OK
+NOTICE: UTF8 4 byte, short: \xf7 -> {} -> \x = truncated
+NOTICE: UTF8 4 byte, short: \xf782 -> {} -> \x = truncated
+NOTICE: UTF8 4 byte, short: \xf78283 -> {} -> \x = truncated
+NOTICE: UTF8 4 byte: \xf7828384 -> {1843396} -> \xf7828384 = OK
+NOTICE: UTF8 5 byte, unsupported: \xfb -> {251} -> \xc3bb = failed
+NOTICE: UTF8 5 byte, unsupported: \xfb82 -> {251,130} -> \xc3bbc282 = failed
+NOTICE: UTF8 5 byte, unsupported: \xfb8283 -> {251,130,131} -> \xc3bbc282c283 = failed
+NOTICE: UTF8 5 byte, unsupported: \xfb828384 -> {251,130,131,132} -> \xc3bbc282c283c284 = failed
+NOTICE: UTF8 5 byte, unsupported: \xfb82838485 -> {251,130,131,132,133} -> \xc3bbc282c283c284c285 = failed
+NOTICE: UTF8 6 byte, unsupported: \xfd -> {253} -> \xc3bd = failed
+NOTICE: UTF8 6 byte, unsupported: \xfd82 -> {253,130} -> \xc3bdc282 = failed
+NOTICE: UTF8 6 byte, unsupported: \xfd8283 -> {253,130,131} -> \xc3bdc282c283 = failed
+NOTICE: UTF8 6 byte, unsupported: \xfd828384 -> {253,130,131,132} -> \xc3bdc282c283c284 = failed
+NOTICE: UTF8 6 byte, unsupported: \xfd82838485 -> {253,130,131,132,133} -> \xc3bdc282c283c284c285 = failed
+NOTICE: UTF8 6 byte, unsupported: \xfd8283848586 -> {253,130,131,132,133,134} -> \xc3bdc282c283c284c285c286 = failed
+NOTICE: MULE_INTERNAL ASCII: \x61 -> {97} -> \x61 = OK
+NOTICE: MULE_INTERNAL LC1, short: \x81 -> {} -> \x = truncated
+NOTICE: MULE_INTERNAL LC1: \x8182 -> {8454274} -> \x8182 = OK
+NOTICE: MULE_INTERNAL LC2, short: \x90 -> {} -> \x = truncated
+NOTICE: MULE_INTERNAL LC2, short: \x9082 -> {} -> \x = truncated
+NOTICE: MULE_INTERNAL LC2: \x908283 -> {9470595} -> \x908283 = OK
+ ?column?
+----------
+ t
+(1 row)
+
+DROP TABLE encoding_tests;
+DROP FUNCTION test_encoding;
+DROP FUNCTION test_text_to_wchars;
+DROP FUNCTION test_mblen_func;
+DROP FUNCTION test_bytea_to_text;
+DROP FUNCTION test_text_to_bytea;
+-- substring slow path: multi-byte escape char vs. multi-byte pattern char.
+SELECT SUBSTRING('a' SIMILAR U&'\00AC' ESCAPE U&'\00A7');
+ substring
+-----------
+
+(1 row)
+
+-- Levenshtein distance metric: exercise character length cache.
+SELECT U&"real\00A7_name" FROM (select 1) AS x(real_name);
+ERROR: column "real§_name" does not exist
+LINE 1: SELECT U&"real\00A7_name" FROM (select 1) AS x(real_name);
+ ^
+HINT: Perhaps you meant to reference the column "x.real_name".
+-- JSON errcontext: truncate long data.
+SELECT repeat(U&'\00A7', 30)::json;
+ERROR: invalid input syntax for type json
+DETAIL: Token "§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§" is invalid.
+CONTEXT: JSON data, line 1: ...§§§§§§§§§§§§§§§§§§§§§§§§
diff --git a/src/test/regress/expected/encoding_1.out b/src/test/regress/expected/encoding_1.out
new file mode 100644
index 00000000000..a5b02090901
--- /dev/null
+++ b/src/test/regress/expected/encoding_1.out
@@ -0,0 +1,4 @@
+/* skip test if not UTF8 server encoding */
+SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
+\if :skip_test
+\quit
diff --git a/src/test/regress/expected/euc_kr.out b/src/test/regress/expected/euc_kr.out
new file mode 100644
index 00000000000..7a61c89a43a
--- /dev/null
+++ b/src/test/regress/expected/euc_kr.out
@@ -0,0 +1,16 @@
+-- This test is about EUC_KR encoding, chosen as perhaps the most prevalent
+-- non-UTF8, multibyte encoding as of 2026-01. Since UTF8 can represent all
+-- of EUC_KR, also run the test in UTF8.
+SELECT getdatabaseencoding() NOT IN ('EUC_KR', 'UTF8') AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+-- Exercise is_multibyte_char_in_char (non-UTF8) slow path.
+SELECT POSITION(
+ convert_from('\xbcf6c7d0', 'EUC_KR') IN
+ convert_from('\xb0fac7d02c20bcf6c7d02c20b1e2bcfa2c20bbee', 'EUC_KR'));
+ position
+----------
+ 5
+(1 row)
+
diff --git a/src/test/regress/expected/euc_kr_1.out b/src/test/regress/expected/euc_kr_1.out
new file mode 100644
index 00000000000..faaac5d6355
--- /dev/null
+++ b/src/test/regress/expected/euc_kr_1.out
@@ -0,0 +1,6 @@
+-- This test is about EUC_KR encoding, chosen as perhaps the most prevalent
+-- non-UTF8, multibyte encoding as of 2026-01. Since UTF8 can represent all
+-- of EUC_KR, also run the test in UTF8.
+SELECT getdatabaseencoding() NOT IN ('EUC_KR', 'UTF8') AS skip_test \gset
+\if :skip_test
+\quit
diff --git a/src/test/regress/expected/guc.out b/src/test/regress/expected/guc.out
index d6fb879f500..3fa2562f231 100644
--- a/src/test/regress/expected/guc.out
+++ b/src/test/regress/expected/guc.out
@@ -711,6 +711,63 @@ select current_schemas(false);
reset search_path;
--
+-- Test parsing of log_min_messages
+--
+SET log_min_messages TO foo; -- fail
+ERROR: invalid value for parameter "log_min_messages": "foo"
+DETAIL: Unrecognized log level: "foo".
+SET log_min_messages TO fatal;
+SHOW log_min_messages;
+ log_min_messages
+------------------
+ fatal
+(1 row)
+
+SET log_min_messages TO 'fatal';
+SHOW log_min_messages;
+ log_min_messages
+------------------
+ fatal
+(1 row)
+
+SET log_min_messages TO 'checkpointer:debug2, autovacuum:debug1'; -- fail
+ERROR: invalid value for parameter "log_min_messages": "checkpointer:debug2, autovacuum:debug1"
+DETAIL: Default log level was not defined.
+SET log_min_messages TO 'debug1, backend:error, fatal'; -- fail
+ERROR: invalid value for parameter "log_min_messages": "debug1, backend:error, fatal"
+DETAIL: Redundant specification of default log level.
+SET log_min_messages TO 'backend:error, debug1, backend:warning'; -- fail
+ERROR: invalid value for parameter "log_min_messages": "backend:error, debug1, backend:warning"
+DETAIL: Redundant log level specification for process type "backend".
+SET log_min_messages TO 'backend:error, foo:fatal, archiver:debug1'; -- fail
+ERROR: invalid value for parameter "log_min_messages": "backend:error, foo:fatal, archiver:debug1"
+DETAIL: Unrecognized process type "foo".
+SET log_min_messages TO 'backend:error, checkpointer:bar, archiver:debug1'; -- fail
+ERROR: invalid value for parameter "log_min_messages": "backend:error, checkpointer:bar, archiver:debug1"
+DETAIL: Unrecognized log level for process type "checkpointer": "bar".
+SET log_min_messages TO 'backend:error, checkpointer:debug3, fatal, archiver:debug2, autovacuum:debug1, walsender:debug3';
+SHOW log_min_messages;
+ log_min_messages
+-------------------------------------------------------------------------------------------------
+ fatal, archiver:debug2, autovacuum:debug1, backend:error, checkpointer:debug3, walsender:debug3
+(1 row)
+
+SET log_min_messages TO 'warning, autovacuum:debug1';
+SHOW log_min_messages;
+ log_min_messages
+----------------------------
+ warning, autovacuum:debug1
+(1 row)
+
+SET log_min_messages TO 'autovacuum:debug1, warning';
+SHOW log_min_messages;
+ log_min_messages
+----------------------------
+ warning, autovacuum:debug1
+(1 row)
+
+RESET log_min_messages;
+--
-- Tests for function-local GUC settings
--
set work_mem = '3MB';
diff --git a/src/test/regress/expected/incremental_sort.out b/src/test/regress/expected/incremental_sort.out
index fdec5b9ba52..29090dca1ba 100644
--- a/src/test/regress/expected/incremental_sort.out
+++ b/src/test/regress/expected/incremental_sort.out
@@ -1450,21 +1450,23 @@ explain (costs off) select a,b,sum(c) from t group by 1,2 order by 1,2,3 limit 1
set enable_incremental_sort = on;
explain (costs off) select a,b,sum(c) from t group by 1,2 order by 1,2,3 limit 1;
- QUERY PLAN
-----------------------------------------------------------------------
+ QUERY PLAN
+----------------------------------------------------------------------------
Limit
-> Incremental Sort
Sort Key: a, b, (sum(c))
Presorted Key: a, b
- -> GroupAggregate
+ -> Finalize GroupAggregate
Group Key: a, b
-> Gather Merge
Workers Planned: 2
- -> Incremental Sort
- Sort Key: a, b
- Presorted Key: a
- -> Parallel Index Scan using t_a_idx on t
-(12 rows)
+ -> Partial GroupAggregate
+ Group Key: a, b
+ -> Incremental Sort
+ Sort Key: a, b
+ Presorted Key: a
+ -> Parallel Index Scan using t_a_idx on t
+(14 rows)
-- Incremental sort vs. set operations with varno 0
set enable_hashagg to off;
diff --git a/src/test/regress/expected/join_hash.out b/src/test/regress/expected/join_hash.out
index 4749f6ed70d..bc7cc76467f 100644
--- a/src/test/regress/expected/join_hash.out
+++ b/src/test/regress/expected/join_hash.out
@@ -76,8 +76,8 @@ insert into extremely_skewed
update pg_class
set reltuples = 2, relpages = pg_relation_size('extremely_skewed') / 8192
where relname = 'extremely_skewed';
--- Make a relation with a couple of enormous tuples.
-create table wide as select generate_series(1, 2) as id, rpad('', 320000, 'x') as t;
+-- Make a relation with several enormous tuples.
+create table wide as select generate_series(1, 3) as id, rpad('', 320000, 'x') as t;
alter table wide set (parallel_workers = 2);
-- The "optimal" case: the hash table fits in memory; we plan for 1
-- batch, we stick to that number, and peak memory usage stays within
@@ -922,7 +922,7 @@ set work_mem = '128kB';
set hash_mem_multiplier = 1.0;
explain (costs off)
select length(max(s.t))
- from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id);
+ from wide left join (select id, coalesce(t, '') || '' as t from wide where id < 3) s using (id);
QUERY PLAN
----------------------------------------------------------------
Finalize Aggregate
@@ -934,10 +934,11 @@ explain (costs off)
-> Parallel Seq Scan on wide
-> Parallel Hash
-> Parallel Seq Scan on wide wide_1
-(9 rows)
+ Filter: (id < 3)
+(10 rows)
select length(max(s.t))
-from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id);
+from wide left join (select id, coalesce(t, '') || '' as t from wide where id < 3) s using (id);
length
--------
320000
@@ -947,7 +948,7 @@ select final > 1 as multibatch
from hash_join_batches(
$$
select length(max(s.t))
- from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id);
+ from wide left join (select id, coalesce(t, '') || '' as t from wide where id < 3) s using (id);
$$);
multibatch
------------
diff --git a/src/test/regress/expected/predicate.out b/src/test/regress/expected/predicate.out
index 8ff1172008e..feae77cb840 100644
--- a/src/test/regress/expected/predicate.out
+++ b/src/test/regress/expected/predicate.out
@@ -632,3 +632,325 @@ SELECT * FROM pred_tab WHERE (a::oid) IS NULL;
(3 rows)
DROP TABLE pred_tab;
+--
+-- Test optimization of IS [NOT] DISTINCT FROM
+--
+CREATE TYPE dist_row_t AS (a int, b int);
+CREATE TABLE dist_tab (id int, val_nn int NOT NULL, val_null int, row_nn dist_row_t NOT NULL);
+INSERT INTO dist_tab VALUES (1, 10, 10, ROW(1, 1));
+INSERT INTO dist_tab VALUES (2, 20, NULL, ROW(2, 2));
+INSERT INTO dist_tab VALUES (3, 30, 30, ROW(1, NULL));
+CREATE INDEX dist_tab_nn_idx ON dist_tab (val_nn);
+ANALYZE dist_tab;
+-- Ensure that the predicate folds to constant TRUE
+EXPLAIN(COSTS OFF)
+SELECT id FROM dist_tab WHERE val_nn IS DISTINCT FROM NULL::INT;
+ QUERY PLAN
+----------------------
+ Seq Scan on dist_tab
+(1 row)
+
+SELECT id FROM dist_tab WHERE val_nn IS DISTINCT FROM NULL::INT;
+ id
+----
+ 1
+ 2
+ 3
+(3 rows)
+
+-- Ensure that the predicate folds to constant FALSE
+EXPLAIN(COSTS OFF)
+SELECT id FROM dist_tab WHERE val_nn IS NOT DISTINCT FROM NULL::INT;
+ QUERY PLAN
+------------------------------
+ Result
+ Replaces: Scan on dist_tab
+ One-Time Filter: false
+(3 rows)
+
+SELECT id FROM dist_tab WHERE val_nn IS NOT DISTINCT FROM NULL::INT;
+ id
+----
+(0 rows)
+
+-- Ensure that the predicate is converted to an inequality operator
+EXPLAIN (COSTS OFF)
+SELECT id FROM dist_tab WHERE val_nn IS DISTINCT FROM 10;
+ QUERY PLAN
+--------------------------
+ Seq Scan on dist_tab
+ Filter: (val_nn <> 10)
+(2 rows)
+
+SELECT id FROM dist_tab WHERE val_nn IS DISTINCT FROM 10;
+ id
+----
+ 2
+ 3
+(2 rows)
+
+-- Ensure that the predicate is converted to an equality operator, and thus can
+-- use index scan
+SET enable_seqscan TO off;
+EXPLAIN (COSTS OFF)
+SELECT id FROM dist_tab WHERE val_nn IS NOT DISTINCT FROM 10;
+ QUERY PLAN
+----------------------------------------------
+ Index Scan using dist_tab_nn_idx on dist_tab
+ Index Cond: (val_nn = 10)
+(2 rows)
+
+SELECT id FROM dist_tab WHERE val_nn IS NOT DISTINCT FROM 10;
+ id
+----
+ 1
+(1 row)
+
+RESET enable_seqscan;
+-- Ensure that the predicate is preserved as "IS DISTINCT FROM"
+EXPLAIN (COSTS OFF)
+SELECT id FROM dist_tab WHERE val_null IS DISTINCT FROM 20;
+ QUERY PLAN
+------------------------------------------
+ Seq Scan on dist_tab
+ Filter: (val_null IS DISTINCT FROM 20)
+(2 rows)
+
+SELECT id FROM dist_tab WHERE val_null IS DISTINCT FROM 20;
+ id
+----
+ 1
+ 2
+ 3
+(3 rows)
+
+-- Safety check for rowtypes
+-- Ensure that the predicate is converted to an inequality operator
+EXPLAIN (COSTS OFF)
+SELECT id FROM dist_tab WHERE row_nn IS DISTINCT FROM ROW(1, 5)::dist_row_t;
+ QUERY PLAN
+-------------------------------------------
+ Seq Scan on dist_tab
+ Filter: (row_nn <> '(1,5)'::dist_row_t)
+(2 rows)
+
+-- ... and that all 3 rows are returned
+SELECT id FROM dist_tab WHERE row_nn IS DISTINCT FROM ROW(1, 5)::dist_row_t;
+ id
+----
+ 1
+ 2
+ 3
+(3 rows)
+
+-- Ensure that the predicate is converted to an equality operator, and thus
+-- mergejoinable or hashjoinable
+SET enable_nestloop TO off;
+EXPLAIN (COSTS OFF)
+SELECT * FROM dist_tab t1 JOIN dist_tab t2 ON t1.val_nn IS NOT DISTINCT FROM t2.val_nn;
+ QUERY PLAN
+--------------------------------------
+ Hash Join
+ Hash Cond: (t1.val_nn = t2.val_nn)
+ -> Seq Scan on dist_tab t1
+ -> Hash
+ -> Seq Scan on dist_tab t2
+(5 rows)
+
+SELECT * FROM dist_tab t1 JOIN dist_tab t2 ON t1.val_nn IS NOT DISTINCT FROM t2.val_nn;
+ id | val_nn | val_null | row_nn | id | val_nn | val_null | row_nn
+----+--------+----------+--------+----+--------+----------+--------
+ 1 | 10 | 10 | (1,1) | 1 | 10 | 10 | (1,1)
+ 2 | 20 | | (2,2) | 2 | 20 | | (2,2)
+ 3 | 30 | 30 | (1,) | 3 | 30 | 30 | (1,)
+(3 rows)
+
+RESET enable_nestloop;
+-- Ensure that the predicate is converted to IS NOT NULL
+EXPLAIN (COSTS OFF)
+SELECT id FROM dist_tab WHERE val_null IS DISTINCT FROM NULL::INT;
+ QUERY PLAN
+----------------------------------
+ Seq Scan on dist_tab
+ Filter: (val_null IS NOT NULL)
+(2 rows)
+
+SELECT id FROM dist_tab WHERE val_null IS DISTINCT FROM NULL::INT;
+ id
+----
+ 1
+ 3
+(2 rows)
+
+-- Ensure that the predicate is converted to IS NULL
+EXPLAIN (COSTS OFF)
+SELECT id FROM dist_tab WHERE val_null IS NOT DISTINCT FROM NULL::INT;
+ QUERY PLAN
+------------------------------
+ Seq Scan on dist_tab
+ Filter: (val_null IS NULL)
+(2 rows)
+
+SELECT id FROM dist_tab WHERE val_null IS NOT DISTINCT FROM NULL::INT;
+ id
+----
+ 2
+(1 row)
+
+-- Safety check for rowtypes
+-- The predicate is converted to IS NOT NULL, and get_rule_expr prints it as IS
+-- DISTINCT FROM because argisrow is false, indicating that we're applying a
+-- scalar test
+EXPLAIN (COSTS OFF)
+SELECT id FROM dist_tab WHERE (val_null, val_null) IS DISTINCT FROM NULL::RECORD;
+ QUERY PLAN
+-----------------------------------------------------------
+ Seq Scan on dist_tab
+ Filter: (ROW(val_null, val_null) IS DISTINCT FROM NULL)
+(2 rows)
+
+SELECT id FROM dist_tab WHERE (val_null, val_null) IS DISTINCT FROM NULL::RECORD;
+ id
+----
+ 1
+ 2
+ 3
+(3 rows)
+
+-- The predicate is converted to IS NULL, and get_rule_expr prints it as IS NOT
+-- DISTINCT FROM because argisrow is false, indicating that we're applying a
+-- scalar test
+EXPLAIN (COSTS OFF)
+SELECT id FROM dist_tab WHERE (val_null, val_null) IS NOT DISTINCT FROM NULL::RECORD;
+ QUERY PLAN
+---------------------------------------------------------------
+ Seq Scan on dist_tab
+ Filter: (ROW(val_null, val_null) IS NOT DISTINCT FROM NULL)
+(2 rows)
+
+SELECT id FROM dist_tab WHERE (val_null, val_null) IS NOT DISTINCT FROM NULL::RECORD;
+ id
+----
+(0 rows)
+
+DROP TABLE dist_tab;
+DROP TYPE dist_row_t;
+--
+-- Test optimization of BooleanTest (IS [NOT] TRUE/FALSE/UNKNOWN) on
+-- non-nullable input
+--
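+-- For non-nullable x, "x IS UNKNOWN" folds to constant FALSE and "x IS NOT
+-- UNKNOWN" to constant TRUE; "x IS TRUE" / "x IS NOT FALSE" fold to just x,
+-- while "x IS FALSE" / "x IS NOT TRUE" fold to NOT x.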
+CREATE TABLE bool_tab (id int, flag_nn boolean NOT NULL, flag_null boolean);
+INSERT INTO bool_tab VALUES (1, true, true);
+INSERT INTO bool_tab VALUES (2, false, NULL);
+CREATE INDEX bool_tab_nn_idx ON bool_tab (flag_nn);
+ANALYZE bool_tab;
+-- Ensure that the predicate folds to constant FALSE
+EXPLAIN (COSTS OFF)
+SELECT id FROM bool_tab WHERE flag_nn IS UNKNOWN;
+ QUERY PLAN
+------------------------------
+ Result
+ Replaces: Scan on bool_tab
+ One-Time Filter: false
+(3 rows)
+
+SELECT id FROM bool_tab WHERE flag_nn IS UNKNOWN;
+ id
+----
+(0 rows)
+
+-- Ensure that the predicate folds to constant TRUE
+EXPLAIN (COSTS OFF)
+SELECT id FROM bool_tab WHERE flag_nn IS NOT UNKNOWN;
+ QUERY PLAN
+----------------------
+ Seq Scan on bool_tab
+(1 row)
+
+SELECT id FROM bool_tab WHERE flag_nn IS NOT UNKNOWN;
+ id
+----
+ 1
+ 2
+(2 rows)
+
+-- Ensure that the predicate folds to flag_nn
+EXPLAIN (COSTS OFF)
+SELECT id FROM bool_tab WHERE flag_nn IS TRUE;
+ QUERY PLAN
+----------------------
+ Seq Scan on bool_tab
+ Filter: flag_nn
+(2 rows)
+
+SELECT id FROM bool_tab WHERE flag_nn IS TRUE;
+ id
+----
+ 1
+(1 row)
+
+-- Ensure that the predicate folds to flag_nn, and thus can use index scan
+SET enable_seqscan TO off;
+EXPLAIN (COSTS OFF)
+SELECT id FROM bool_tab WHERE flag_nn IS NOT FALSE;
+ QUERY PLAN
+----------------------------------------------
+ Index Scan using bool_tab_nn_idx on bool_tab
+ Index Cond: (flag_nn = true)
+(2 rows)
+
+SELECT id FROM bool_tab WHERE flag_nn IS NOT FALSE;
+ id
+----
+ 1
+(1 row)
+
+RESET enable_seqscan;
+-- Ensure that the predicate folds to not flag_nn
+EXPLAIN (COSTS OFF)
+SELECT id FROM bool_tab WHERE flag_nn IS FALSE;
+ QUERY PLAN
+-------------------------
+ Seq Scan on bool_tab
+ Filter: (NOT flag_nn)
+(2 rows)
+
+SELECT id FROM bool_tab WHERE flag_nn IS FALSE;
+ id
+----
+ 2
+(1 row)
+
+-- Ensure that the predicate folds to not flag_nn, and thus can use index scan
+SET enable_seqscan TO off;
+EXPLAIN (COSTS OFF)
+SELECT id FROM bool_tab WHERE flag_nn IS NOT TRUE;
+ QUERY PLAN
+----------------------------------------------
+ Index Scan using bool_tab_nn_idx on bool_tab
+ Index Cond: (flag_nn = false)
+(2 rows)
+
+SELECT id FROM bool_tab WHERE flag_nn IS NOT TRUE;
+ id
+----
+ 2
+(1 row)
+
+RESET enable_seqscan;
+-- Ensure that the predicate is preserved as a BooleanTest
+EXPLAIN (COSTS OFF)
+SELECT id FROM bool_tab WHERE flag_null IS UNKNOWN;
+ QUERY PLAN
+----------------------------------
+ Seq Scan on bool_tab
+ Filter: (flag_null IS UNKNOWN)
+(2 rows)
+
+SELECT id FROM bool_tab WHERE flag_null IS UNKNOWN;
+ id
+----
+ 2
+(1 row)
+
+DROP TABLE bool_tab;
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index f4ee2bd7459..f9bc213e5a1 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -2696,7 +2696,31 @@ pg_stats_ext_exprs| SELECT cn.nspname AS schemaname,
WHEN ((stat.a).stakind4 = 5) THEN (stat.a).stanumbers4
WHEN ((stat.a).stakind5 = 5) THEN (stat.a).stanumbers5
ELSE NULL::real[]
- END AS elem_count_histogram
+ END AS elem_count_histogram,
+ CASE
+ WHEN ((stat.a).stakind1 = 6) THEN (stat.a).stavalues1
+ WHEN ((stat.a).stakind2 = 6) THEN (stat.a).stavalues2
+ WHEN ((stat.a).stakind3 = 6) THEN (stat.a).stavalues3
+ WHEN ((stat.a).stakind4 = 6) THEN (stat.a).stavalues4
+ WHEN ((stat.a).stakind5 = 6) THEN (stat.a).stavalues5
+ ELSE NULL::anyarray
+ END AS range_length_histogram,
+ CASE
+ WHEN ((stat.a).stakind1 = 6) THEN (stat.a).stanumbers1[1]
+ WHEN ((stat.a).stakind2 = 6) THEN (stat.a).stanumbers2[1]
+ WHEN ((stat.a).stakind3 = 6) THEN (stat.a).stanumbers3[1]
+ WHEN ((stat.a).stakind4 = 6) THEN (stat.a).stanumbers4[1]
+ WHEN ((stat.a).stakind5 = 6) THEN (stat.a).stanumbers5[1]
+ ELSE NULL::real
+ END AS range_empty_frac,
+ CASE
+ WHEN ((stat.a).stakind1 = 7) THEN (stat.a).stavalues1
+ WHEN ((stat.a).stakind2 = 7) THEN (stat.a).stavalues2
+ WHEN ((stat.a).stakind3 = 7) THEN (stat.a).stavalues3
+ WHEN ((stat.a).stakind4 = 7) THEN (stat.a).stavalues4
+ WHEN ((stat.a).stakind5 = 7) THEN (stat.a).stavalues5
+ ELSE NULL::anyarray
+ END AS range_bounds_histogram
FROM (((((pg_statistic_ext s
JOIN pg_class c ON ((c.oid = s.stxrelid)))
LEFT JOIN pg_statistic_ext_data sd ON ((s.oid = sd.stxoid)))
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
index b2a06579135..cb8856ac50f 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -3628,3 +3628,30 @@ SELECT * FROM check_estimated_rows('SELECT * FROM sb_2 WHERE numeric_lt(y, 1.0)'
-- Tidy up
DROP TABLE sb_1, sb_2 CASCADE;
+-- Check statistics generated for range type and expressions.
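+-- For ranges, ANALYZE stores a length histogram plus the fraction of empty
+-- ranges (stakind 6) and a bounds histogram (stakind 7); pg_stats_ext_exprs
+-- exposes these as range_length_histogram, range_empty_frac and
+-- range_bounds_histogram.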
+CREATE TABLE stats_ext_tbl_range(name text, irange int4range);
+INSERT INTO stats_ext_tbl_range VALUES
+ ('red', '[1,7)'::int4range),
+ ('blue', '[2,8]'::int4range),
+ ('green', '[3,9)'::int4range);
+CREATE STATISTICS stats_ext_range (mcv)
+ ON irange, (irange + '[4,10)'::int4range)
+ FROM stats_ext_tbl_range;
+ANALYZE stats_ext_tbl_range;
+SELECT attnames, most_common_vals
+ FROM pg_stats_ext
+ WHERE statistics_name = 'stats_ext_range';
+ attnames | most_common_vals
+----------+------------------------------------------------------------
+ {irange} | {{"[1,7)","[1,10)"},{"[2,9)","[2,10)"},{"[3,9)","[3,10)"}}
+(1 row)
+
+SELECT range_length_histogram, range_empty_frac, range_bounds_histogram
+ FROM pg_stats_ext_exprs
+ WHERE statistics_name = 'stats_ext_range';
+ range_length_histogram | range_empty_frac | range_bounds_histogram
+------------------------+------------------+------------------------------
+ {7,8,9} | 0 | {"[1,10)","[2,10)","[3,10)"}
+(1 row)
+
+DROP TABLE stats_ext_tbl_range;
diff --git a/src/test/regress/expected/stats_import.out b/src/test/regress/expected/stats_import.out
index 37131f9ceab..d6cc701500e 100644
--- a/src/test/regress/expected/stats_import.out
+++ b/src/test/regress/expected/stats_import.out
@@ -1481,7 +1481,7 @@ SELECT pg_clear_extended_stats(schemaname => 'stats_import',
statistics_schemaname => 'stats_import',
statistics_name => 'ext_stats_not_exist',
inherited => false);
-WARNING: could not find extended statistics object "stats_import"."ext_stats_not_exist"
+WARNING: could not find extended statistics object "stats_import.ext_stats_not_exist"
pg_clear_extended_stats
-------------------------
@@ -1493,7 +1493,7 @@ SELECT pg_clear_extended_stats(schemaname => 'stats_import',
statistics_schemaname => 'stats_import',
statistics_name => 'test_stat_clone',
inherited => false);
-WARNING: could not clear extended statistics object "stats_import"."test_stat_clone": incorrect relation "stats_import"."test" specified
+WARNING: could not clear extended statistics object "stats_import.test_stat_clone": incorrect relation "stats_import.test" specified
pg_clear_extended_stats
-------------------------
@@ -1678,7 +1678,7 @@ SELECT pg_catalog.pg_restore_extended_stats(
'statistics_schemaname', 'stats_import',
'statistics_name', 'ext_stats_not_exist',
'inherited', false);
-WARNING: could not find extended statistics object "stats_import"."ext_stats_not_exist"
+WARNING: could not find extended statistics object "stats_import.ext_stats_not_exist"
pg_restore_extended_stats
---------------------------
f
@@ -1691,7 +1691,7 @@ SELECT pg_catalog.pg_restore_extended_stats(
'statistics_schemaname', 'stats_import',
'statistics_name', 'test_stat_clone',
'inherited', false);
-WARNING: could not restore extended statistics object "stats_import"."test_stat_clone": incorrect relation "stats_import"."test" specified
+WARNING: could not restore extended statistics object "stats_import.test_stat_clone": incorrect relation "stats_import.test" specified
pg_restore_extended_stats
---------------------------
f
@@ -1762,7 +1762,7 @@ SELECT pg_catalog.pg_restore_extended_stats(
'inherited', false,
'n_distinct', '[{"attributes" : [1,3], "ndistinct" : 4}]'::pg_ndistinct);
WARNING: cannot specify parameter "n_distinct"
-HINT: Extended statistics object "stats_import"."test_stat_dependencies" does not support statistics of this type.
+HINT: Extended statistics object "stats_import.test_stat_dependencies" does not support statistics of this type.
pg_restore_extended_stats
---------------------------
f
@@ -1778,7 +1778,7 @@ SELECT pg_catalog.pg_restore_extended_stats(
'dependencies', '[{"attributes": [2], "dependency": 3, "degree": 1.000000},
{"attributes": [3], "dependency": 2, "degree": 1.000000}]'::pg_dependencies);
WARNING: cannot specify parameter "dependencies"
-HINT: Extended statistics object "stats_import"."test_stat_ndistinct" does not support statistics of this type.
+HINT: Extended statistics object "stats_import.test_stat_ndistinct" does not support statistics of this type.
pg_restore_extended_stats
---------------------------
f
@@ -1966,7 +1966,7 @@ SELECT pg_catalog.pg_restore_extended_stats(
'most_common_freqs', '{0.25,0.25,0.25,0.25}'::double precision[],
'most_common_base_freqs', '{0.0625,0.0625,0.0625,0.0625}'::double precision[]);
WARNING: cannot specify parameters "most_common_vals", "most_common_freqs" or "most_common_base_freqs"
-HINT: Extended statistics object "stats_import"."test_stat_dependencies" does not support statistics of this type.
+HINT: Extended statistics object "stats_import.test_stat_dependencies" does not support statistics of this type.
pg_restore_extended_stats
---------------------------
f
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
index 021d57f66bb..549e9b2d7be 100644
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -28,7 +28,7 @@ test: strings md5 numerology point lseg line box path polygon circle date time t
# geometry depends on point, lseg, line, box, path, polygon, circle
# horology depends on date, time, timetz, timestamp, timestamptz, interval
# ----------
-test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database stats_import pg_ndistinct pg_dependencies oid8
+test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc database stats_import pg_ndistinct pg_dependencies oid8 encoding euc_kr
# ----------
# Load huge amounts of data
diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c
index ce5f5f9eb19..bea858f03c1 100644
--- a/src/test/regress/regress.c
+++ b/src/test/regress/regress.c
@@ -1115,6 +1115,145 @@ test_enc_conversion(PG_FUNCTION_ARGS)
PG_RETURN_DATUM(HeapTupleGetDatum(tuple));
}
+/* Convert bytea to text without validation for corruption tests from SQL. */
+PG_FUNCTION_INFO_V1(test_bytea_to_text);
+Datum
+test_bytea_to_text(PG_FUNCTION_ARGS)
+{
+ PG_RETURN_TEXT_P(PG_GETARG_BYTEA_PP(0));
+}
+
+/* And the reverse. */
+PG_FUNCTION_INFO_V1(test_text_to_bytea);
+Datum
+test_text_to_bytea(PG_FUNCTION_ARGS)
+{
+ PG_RETURN_BYTEA_P(PG_GETARG_TEXT_PP(0));
+}
+
+/* Corruption tests in C. */
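+/*
+ * The first argument selects the pg_mblen variant to exercise, the second
+ * names an encoding (used only by pg_encoding_mblen), and the last is the
+ * byte offset at which to measure a character; e.g. (a call from SQL, once
+ * the function has been created there)
+ *
+ *   SELECT test_mblen_func('pg_mblen_with_len', 'UTF8', 'café', 3);
+ *
+ * measures the multibyte character starting at byte 3, bounded by the
+ * remaining length of the string.
+ */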
+PG_FUNCTION_INFO_V1(test_mblen_func);
+Datum
+test_mblen_func(PG_FUNCTION_ARGS)
+{
+ const char *func = text_to_cstring(PG_GETARG_BYTEA_PP(0));
+ const char *encoding = text_to_cstring(PG_GETARG_BYTEA_PP(1));
+ text *string = PG_GETARG_BYTEA_PP(2);
+ int offset = PG_GETARG_INT32(3);
+ const char *data = VARDATA_ANY(string);
+ size_t size = VARSIZE_ANY_EXHDR(string);
+ int result = 0;
+
+ if (strcmp(func, "pg_mblen_unbounded") == 0)
+ result = pg_mblen_unbounded(data + offset);
+ else if (strcmp(func, "pg_mblen_cstr") == 0)
+ result = pg_mblen_cstr(data + offset);
+ else if (strcmp(func, "pg_mblen_with_len") == 0)
+ result = pg_mblen_with_len(data + offset, size - offset);
+ else if (strcmp(func, "pg_mblen_range") == 0)
+ result = pg_mblen_range(data + offset, data + size);
+ else if (strcmp(func, "pg_encoding_mblen") == 0)
+ result = pg_encoding_mblen(pg_char_to_encoding(encoding), data + offset);
+ else
+ elog(ERROR, "unknown function");
+
+ PG_RETURN_INT32(result);
+}
+
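+/*
+ * Convert a string in the named encoding to an array of its wchar codes,
+ * exposing pg_encoding_mb2wchar_with_len() to SQL.
+ */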
+PG_FUNCTION_INFO_V1(test_text_to_wchars);
+Datum
+test_text_to_wchars(PG_FUNCTION_ARGS)
+{
+ const char *encoding_name = text_to_cstring(PG_GETARG_BYTEA_PP(0));
+ text *string = PG_GETARG_TEXT_PP(1);
+ const char *data = VARDATA_ANY(string);
+ size_t size = VARSIZE_ANY_EXHDR(string);
+ pg_wchar *wchars = palloc(sizeof(pg_wchar) * (size + 1));
+ Datum *datums;
+ int wlen;
+ int encoding;
+
+ encoding = pg_char_to_encoding(encoding_name);
+ if (encoding < 0)
+ elog(ERROR, "unknown encoding name: %s", encoding_name);
+
+ if (size > 0)
+ {
+ datums = palloc(sizeof(Datum) * size);
+ wlen = pg_encoding_mb2wchar_with_len(encoding,
+ data,
+ wchars,
+ size);
+ Assert(wlen >= 0);
+ Assert(wlen <= size);
+ Assert(wchars[wlen] == 0);
+
+ for (int i = 0; i < wlen; ++i)
+ datums[i] = UInt32GetDatum(wchars[i]);
+ }
+ else
+ {
+ datums = NULL;
+ wlen = 0;
+ }
+
+ PG_RETURN_ARRAYTYPE_P(construct_array_builtin(datums, wlen, INT4OID));
+}
+
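+/*
+ * The reverse: rebuild a string in the named encoding from an array of
+ * wchar codes, exposing pg_encoding_wchar2mb_with_len() to SQL.
+ */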
+PG_FUNCTION_INFO_V1(test_wchars_to_text);
+Datum
+test_wchars_to_text(PG_FUNCTION_ARGS)
+{
+ const char *encoding_name = text_to_cstring(PG_GETARG_BYTEA_PP(0));
+ ArrayType *array = PG_GETARG_ARRAYTYPE_P(1);
+ Datum *datums;
+ bool *nulls;
+ char *mb;
+ text *result;
+ int wlen;
+ int bytes;
+ int encoding;
+
+ encoding = pg_char_to_encoding(encoding_name);
+ if (encoding < 0)
+ elog(ERROR, "unknown encoding name: %s", encoding_name);
+
+ deconstruct_array_builtin(array, INT4OID, &datums, &nulls, &wlen);
+
+ if (wlen > 0)
+ {
+ pg_wchar *wchars = palloc(sizeof(pg_wchar) * wlen);
+
+ for (int i = 0; i < wlen; ++i)
+ {
+ if (nulls[i])
+ elog(ERROR, "unexpected NULL in array");
+ wchars[i] = DatumGetInt32(datums[i]);
+ }
+
+ mb = palloc(pg_encoding_max_length(encoding) * wlen + 1);
+ bytes = pg_encoding_wchar2mb_with_len(encoding, wchars, mb, wlen);
+ }
+ else
+ {
+ mb = "";
+ bytes = 0;
+ }
+
+ result = palloc(bytes + VARHDRSZ);
+ SET_VARSIZE(result, bytes + VARHDRSZ);
+ memcpy(VARDATA(result), mb, bytes);
+
+ PG_RETURN_TEXT_P(result);
+}
+
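+/* Does the name identify a valid server-side encoding? */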
+PG_FUNCTION_INFO_V1(test_valid_server_encoding);
+Datum
+test_valid_server_encoding(PG_FUNCTION_ARGS)
+{
+ PG_RETURN_BOOL(pg_valid_server_encoding(text_to_cstring(PG_GETARG_TEXT_PP(0))) >= 0);
+}
+
/* Provide SQL access to IsBinaryCoercible() */
PG_FUNCTION_INFO_V1(binary_coercible);
Datum
diff --git a/src/test/regress/sql/arrays.sql b/src/test/regress/sql/arrays.sql
index 450389831a0..82837af7c4a 100644
--- a/src/test/regress/sql/arrays.sql
+++ b/src/test/regress/sql/arrays.sql
@@ -528,6 +528,10 @@ select '[2147483646:2147483646]={1}'::int[];
select '[-2147483648:-2147483647]={1,2}'::int[];
-- all of the above should be accepted
+-- some day we might allow these cases, but for now they're errors:
+select array[]::oidvector;
+select array[]::int2vector;
+
-- tests for array aggregates
CREATE TEMP TABLE arraggtest ( f1 INT[], f2 TEXT[][], f3 FLOAT[]);
diff --git a/src/test/regress/sql/constraints.sql b/src/test/regress/sql/constraints.sql
index 733a1dbccfe..1e9989698b6 100644
--- a/src/test/regress/sql/constraints.sql
+++ b/src/test/regress/sql/constraints.sql
@@ -623,7 +623,9 @@ DROP TABLE deferred_excl;
-- verify constraints created for NOT NULL clauses
CREATE TABLE notnull_tbl1 (a INTEGER NOT NULL NOT NULL);
\d+ notnull_tbl1
--- no-op
+-- specifying an existing constraint is a no-op
+ALTER TABLE notnull_tbl1 ADD CONSTRAINT notnull_tbl1_a_not_null NOT NULL a;
+-- but using a different constraint name is not allowed
ALTER TABLE notnull_tbl1 ADD CONSTRAINT nn NOT NULL a;
\d+ notnull_tbl1
-- duplicate name
diff --git a/src/test/regress/sql/copyencoding.sql b/src/test/regress/sql/copyencoding.sql
index 4e96a4d6505..64718245b94 100644
--- a/src/test/regress/sql/copyencoding.sql
+++ b/src/test/regress/sql/copyencoding.sql
@@ -23,6 +23,13 @@ CREATE TABLE copy_encoding_tab (t text);
COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv, ENCODING 'UTF8');
-- Read UTF8 data as LATIN1: no error
COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv, ENCODING 'LATIN1');
+-- Non-server encodings have distinct code paths.
+\set fname :abs_builddir '/results/copyencoding_gb18030.csv'
+COPY (SELECT E'\u3042,') TO :'fname' WITH (FORMAT csv, ENCODING 'GB18030');
+COPY copy_encoding_tab FROM :'fname' WITH (FORMAT csv, ENCODING 'GB18030');
+\set fname :abs_builddir '/results/copyencoding_gb18030.data'
+COPY (SELECT E'\u3042,') TO :'fname' WITH (FORMAT text, ENCODING 'GB18030');
+COPY copy_encoding_tab FROM :'fname' WITH (FORMAT text, ENCODING 'GB18030');
-- Use client_encoding
SET client_encoding TO UTF8;
diff --git a/src/test/regress/sql/encoding.sql b/src/test/regress/sql/encoding.sql
new file mode 100644
index 00000000000..b9543c0cb32
--- /dev/null
+++ b/src/test/regress/sql/encoding.sql
@@ -0,0 +1,228 @@
+/* skip test if not UTF8 server encoding */
+SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+
+\getenv libdir PG_LIBDIR
+\getenv dlsuffix PG_DLSUFFIX
+
+\set regresslib :libdir '/regress' :dlsuffix
+
+CREATE FUNCTION test_bytea_to_text(bytea) RETURNS text
+ AS :'regresslib' LANGUAGE C STRICT;
+CREATE FUNCTION test_text_to_bytea(text) RETURNS bytea
+ AS :'regresslib' LANGUAGE C STRICT;
+CREATE FUNCTION test_mblen_func(text, text, text, int) RETURNS int
+ AS :'regresslib' LANGUAGE C STRICT;
+CREATE FUNCTION test_text_to_wchars(text, text) RETURNS int[]
+ AS :'regresslib' LANGUAGE C STRICT;
+CREATE FUNCTION test_wchars_to_text(text, int[]) RETURNS text
+ AS :'regresslib' LANGUAGE C STRICT;
+CREATE FUNCTION test_valid_server_encoding(text) RETURNS boolean
+ AS :'regresslib' LANGUAGE C STRICT;
+
+
+CREATE TABLE regress_encoding(good text, truncated text, with_nul text, truncated_with_nul text);
+INSERT INTO regress_encoding
+VALUES ('café',
+ 'caf' || test_bytea_to_text('\xc3'),
+ 'café' || test_bytea_to_text('\x00') || 'dcba',
+ 'caf' || test_bytea_to_text('\xc300') || 'dcba');
+
+SELECT good, truncated, with_nul FROM regress_encoding;
+
+SELECT length(good) FROM regress_encoding;
+SELECT substring(good, 3, 1) FROM regress_encoding;
+SELECT substring(good, 4, 1) FROM regress_encoding;
+SELECT regexp_replace(good, '^caf(.)$', '\1') FROM regress_encoding;
+SELECT reverse(good) FROM regress_encoding;
+
+-- invalid short mb character = error
+SELECT length(truncated) FROM regress_encoding;
+SELECT substring(truncated, 1, 1) FROM regress_encoding;
+SELECT reverse(truncated) FROM regress_encoding;
+-- invalid short mb character = silently dropped
+SELECT regexp_replace(truncated, '^caf(.)$', '\1') FROM regress_encoding;
+
+-- PostgreSQL doesn't allow strings to contain NUL. If a corrupted string
+-- contains NUL at a character boundary position, some functions treat it as a
+-- character while others treat it as a terminator, as an implementation detail.
+
+-- NUL = terminator
+SELECT length(with_nul) FROM regress_encoding;
+SELECT substring(with_nul, 3, 1) FROM regress_encoding;
+SELECT substring(with_nul, 4, 1) FROM regress_encoding;
+SELECT substring(with_nul, 5, 1) FROM regress_encoding;
+SELECT convert_to(substring(with_nul, 5, 1), 'UTF8') FROM regress_encoding;
+SELECT regexp_replace(with_nul, '^caf(.)$', '\1') FROM regress_encoding;
+-- NUL = character
+SELECT with_nul, reverse(with_nul), reverse(reverse(with_nul)) FROM regress_encoding;
+
+-- If a corrupted string contains NUL in the tail bytes of a multibyte
+-- character (invalid in all encodings), it is considered part of the
+-- character for length purposes. An error is raised only in code paths
+-- that convert or verify encodings.
+
+SELECT length(truncated_with_nul) FROM regress_encoding;
+SELECT substring(truncated_with_nul, 3, 1) FROM regress_encoding;
+SELECT substring(truncated_with_nul, 4, 1) FROM regress_encoding;
+SELECT convert_to(substring(truncated_with_nul, 4, 1), 'UTF8') FROM regress_encoding;
+SELECT substring(truncated_with_nul, 5, 1) FROM regress_encoding;
+SELECT regexp_replace(truncated_with_nul, '^caf(.)dcba$', '\1') = test_bytea_to_text('\xc300') FROM regress_encoding;
+SELECT reverse(truncated_with_nul) FROM regress_encoding;
+
+-- unbounded: sequence would overrun the string!
+SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated, 3)
+FROM regress_encoding;
+
+-- condition detected when using the length/range variants
+SELECT test_mblen_func('pg_mblen_with_len', 'UTF8', truncated, 3)
+FROM regress_encoding;
+SELECT test_mblen_func('pg_mblen_range', 'UTF8', truncated, 3)
+FROM regress_encoding;
+
+-- unbounded: sequence would overrun the string, if the terminator were really
+-- the end of it
+SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated_with_nul, 3)
+FROM regress_encoding;
+SELECT test_mblen_func('pg_encoding_mblen', 'GB18030', truncated_with_nul, 3)
+FROM regress_encoding;
+
+-- condition detected when using the cstr variants
+SELECT test_mblen_func('pg_mblen_cstr', 'UTF8', truncated_with_nul, 3)
+FROM regress_encoding;
+
+DROP TABLE regress_encoding;
+
+-- mb<->wchar conversions
+CREATE FUNCTION test_encoding(encoding text, description text, input bytea)
+RETURNS VOID LANGUAGE plpgsql AS
+$$
+DECLARE
+ prefix text;
+ len int;
+ wchars int[];
+ round_trip bytea;
+ result text;
+BEGIN
+ prefix := rpad(encoding || ' ' || description || ':', 28);
+
+ -- XXX could also test validation and length functions, and include
+ -- client-only encodings in these test cases
+
+ IF test_valid_server_encoding(encoding) THEN
+ wchars := test_text_to_wchars(encoding, test_bytea_to_text(input));
+ round_trip := test_text_to_bytea(test_wchars_to_text(encoding, wchars));
+ IF input = round_trip THEN
+ result := 'OK';
+ ELSIF length(input) > length(round_trip) AND round_trip = substr(input, 1, length(round_trip)) THEN
+ result := 'truncated';
+ ELSE
+ result := 'failed';
+ END IF;
+ RAISE NOTICE '% % -> % -> % = %', prefix, input, wchars, round_trip, result;
+ END IF;
+END;
+$$;
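+-- Each call raises one NOTICE of the form
+--   <encoding> <description>: <input> -> <wchars> -> <round trip> = <status>
+-- where status is OK, truncated (the round trip is a prefix of the input),
+-- or failed.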
+-- The conversions validate only lengths (to avoid overruns), not the
+-- encoding itself, so some of the byte sequences below are bogus.  They
+-- cover all code branches, server encodings only for now.
+CREATE TABLE encoding_tests (encoding text, description text, input bytea);
+INSERT INTO encoding_tests VALUES
+ -- LATIN1, other single-byte encodings
+ ('LATIN1', 'ASCII', 'a'),
+ ('LATIN1', 'extended', '\xe9'),
+ -- EUC_JP, EUC_JIS_2004, EUC_KR (for the purposes of wchar conversion):
+ -- 2 8e (CS2, not used by EUC_KR but arbitrarily considered to have EUC_JP length)
+ -- 3 8f (CS3, not used by EUC_KR but arbitrarily considered to have EUC_JP length)
+ -- 2 80..ff (CS1)
+ ('EUC_JP', 'ASCII', 'a'),
+ ('EUC_JP', 'CS1, short', '\x80'),
+ ('EUC_JP', 'CS1', '\x8002'),
+ ('EUC_JP', 'CS2, short', '\x8e'),
+ ('EUC_JP', 'CS2', '\x8e02'),
+ ('EUC_JP', 'CS3, short', '\x8f'),
+ ('EUC_JP', 'CS3, short', '\x8f02'),
+ ('EUC_JP', 'CS3', '\x8f0203'),
+ -- EUC_CN
+ -- 3 8e (CS2, not used but arbitrarily considered to have length 3)
+ -- 3 8f (CS3, not used but arbitrarily considered to have length 3)
+ -- 2 80..ff (CS1)
+ ('EUC_CN', 'ASCII', 'a'),
+ ('EUC_CN', 'CS1, short', '\x80'),
+ ('EUC_CN', 'CS1', '\x8002'),
+ ('EUC_CN', 'CS2, short', '\x8e'),
+ ('EUC_CN', 'CS2, short', '\x8e02'),
+ ('EUC_CN', 'CS2', '\x8e0203'),
+ ('EUC_CN', 'CS3, short', '\x8f'),
+ ('EUC_CN', 'CS3, short', '\x8f02'),
+ ('EUC_CN', 'CS3', '\x8f0203'),
+ -- EUC_TW:
+ -- 4 8e (CS2)
+ -- 3 8f (CS3, not used but arbitrarily considered to have length 3)
+ -- 2 80..ff (CS1)
+ ('EUC_TW', 'ASCII', 'a'),
+ ('EUC_TW', 'CS1, short', '\x80'),
+ ('EUC_TW', 'CS1', '\x8002'),
+ ('EUC_TW', 'CS2, short', '\x8e'),
+ ('EUC_TW', 'CS2, short', '\x8e02'),
+ ('EUC_TW', 'CS2, short', '\x8e0203'),
+ ('EUC_TW', 'CS2', '\x8e020304'),
+ ('EUC_TW', 'CS3, short', '\x8f'),
+ ('EUC_TW', 'CS3, short', '\x8f02'),
+ ('EUC_TW', 'CS3', '\x8f0203'),
+ -- UTF8
+ -- 2 c0..df
+ -- 3 e0..ef
+ -- 4 f0..f7 (but maximum real codepoint U+10ffff has f4)
+ -- 5 f8..fb (not supported)
+ -- 6 fc..fd (not supported)
+ ('UTF8', 'ASCII', 'a'),
+ ('UTF8', '2 byte, short', '\xdf'),
+ ('UTF8', '2 byte', '\xdf82'),
+ ('UTF8', '3 byte, short', '\xef'),
+ ('UTF8', '3 byte, short', '\xef82'),
+ ('UTF8', '3 byte', '\xef8283'),
+ ('UTF8', '4 byte, short', '\xf7'),
+ ('UTF8', '4 byte, short', '\xf782'),
+ ('UTF8', '4 byte, short', '\xf78283'),
+ ('UTF8', '4 byte', '\xf7828384'),
+ ('UTF8', '5 byte, unsupported', '\xfb'),
+ ('UTF8', '5 byte, unsupported', '\xfb82'),
+ ('UTF8', '5 byte, unsupported', '\xfb8283'),
+ ('UTF8', '5 byte, unsupported', '\xfb828384'),
+ ('UTF8', '5 byte, unsupported', '\xfb82838485'),
+ ('UTF8', '6 byte, unsupported', '\xfd'),
+ ('UTF8', '6 byte, unsupported', '\xfd82'),
+ ('UTF8', '6 byte, unsupported', '\xfd8283'),
+ ('UTF8', '6 byte, unsupported', '\xfd828384'),
+ ('UTF8', '6 byte, unsupported', '\xfd82838485'),
+ ('UTF8', '6 byte, unsupported', '\xfd8283848586'),
+ -- MULE_INTERNAL
+ -- 2 81..8d LC1
+ -- 3 90..99 LC2
+ ('MULE_INTERNAL', 'ASCII', 'a'),
+ ('MULE_INTERNAL', 'LC1, short', '\x81'),
+ ('MULE_INTERNAL', 'LC1', '\x8182'),
+ ('MULE_INTERNAL', 'LC2, short', '\x90'),
+ ('MULE_INTERNAL', 'LC2, short', '\x9082'),
+ ('MULE_INTERNAL', 'LC2', '\x908283');
+
+SELECT COUNT(test_encoding(encoding, description, input)) > 0
+FROM encoding_tests;
+
+DROP TABLE encoding_tests;
+DROP FUNCTION test_encoding;
+DROP FUNCTION test_text_to_wchars;
+DROP FUNCTION test_mblen_func;
+DROP FUNCTION test_bytea_to_text;
+DROP FUNCTION test_text_to_bytea;
+
+
+-- substring slow path: multi-byte escape char vs. multi-byte pattern char.
+SELECT SUBSTRING('a' SIMILAR U&'\00AC' ESCAPE U&'\00A7');
+-- Levenshtein distance metric: exercise character length cache.
+SELECT U&"real\00A7_name" FROM (select 1) AS x(real_name);
+-- JSON errcontext: truncate long data.
+SELECT repeat(U&'\00A7', 30)::json;
diff --git a/src/test/regress/sql/euc_kr.sql b/src/test/regress/sql/euc_kr.sql
new file mode 100644
index 00000000000..1851b2a8c14
--- /dev/null
+++ b/src/test/regress/sql/euc_kr.sql
@@ -0,0 +1,12 @@
+-- This test is about EUC_KR encoding, chosen as perhaps the most prevalent
+-- non-UTF8, multibyte encoding as of 2026-01. Since UTF8 can represent all
+-- of EUC_KR, also run the test in UTF8.
+SELECT getdatabaseencoding() NOT IN ('EUC_KR', 'UTF8') AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+
+-- Exercise is_multibyte_char_in_char (non-UTF8) slow path.
+SELECT POSITION(
+ convert_from('\xbcf6c7d0', 'EUC_KR') IN
+ convert_from('\xb0fac7d02c20bcf6c7d02c20b1e2bcfa2c20bbee', 'EUC_KR'));
diff --git a/src/test/regress/sql/guc.sql b/src/test/regress/sql/guc.sql
index bafaf067e82..dfb843fd3ae 100644
--- a/src/test/regress/sql/guc.sql
+++ b/src/test/regress/sql/guc.sql
@@ -232,6 +232,28 @@ drop schema not_there_initially;
select current_schemas(false);
reset search_path;
+--
+-- Test parsing of log_min_messages
+--
+
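+-- The list form takes exactly one generic level plus optional
+-- "backend_type:level" entries; each backend type may appear at most once,
+-- and unknown backend types or levels are rejected.
+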
+SET log_min_messages TO foo; -- fail
+SET log_min_messages TO fatal;
+SHOW log_min_messages;
+SET log_min_messages TO 'fatal';
+SHOW log_min_messages;
+SET log_min_messages TO 'checkpointer:debug2, autovacuum:debug1'; -- fail
+SET log_min_messages TO 'debug1, backend:error, fatal'; -- fail
+SET log_min_messages TO 'backend:error, debug1, backend:warning'; -- fail
+SET log_min_messages TO 'backend:error, foo:fatal, archiver:debug1'; -- fail
+SET log_min_messages TO 'backend:error, checkpointer:bar, archiver:debug1'; -- fail
+SET log_min_messages TO 'backend:error, checkpointer:debug3, fatal, archiver:debug2, autovacuum:debug1, walsender:debug3';
+SHOW log_min_messages;
+SET log_min_messages TO 'warning, autovacuum:debug1';
+SHOW log_min_messages;
+SET log_min_messages TO 'autovacuum:debug1, warning';
+SHOW log_min_messages;
+RESET log_min_messages;
+
--
-- Tests for function-local GUC settings
--
diff --git a/src/test/regress/sql/join_hash.sql b/src/test/regress/sql/join_hash.sql
index 49d3fd61856..53db1754bb2 100644
--- a/src/test/regress/sql/join_hash.sql
+++ b/src/test/regress/sql/join_hash.sql
@@ -83,8 +83,8 @@ update pg_class
set reltuples = 2, relpages = pg_relation_size('extremely_skewed') / 8192
where relname = 'extremely_skewed';
--- Make a relation with a couple of enormous tuples.
-create table wide as select generate_series(1, 2) as id, rpad('', 320000, 'x') as t;
+-- Make a relation with several enormous tuples.
+create table wide as select generate_series(1, 3) as id, rpad('', 320000, 'x') as t;
alter table wide set (parallel_workers = 2);
-- The "optimal" case: the hash table fits in memory; we plan for 1
@@ -496,14 +496,14 @@ set work_mem = '128kB';
set hash_mem_multiplier = 1.0;
explain (costs off)
select length(max(s.t))
- from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id);
+ from wide left join (select id, coalesce(t, '') || '' as t from wide where id < 3) s using (id);
select length(max(s.t))
-from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id);
+from wide left join (select id, coalesce(t, '') || '' as t from wide where id < 3) s using (id);
select final > 1 as multibatch
from hash_join_batches(
$$
select length(max(s.t))
- from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id);
+ from wide left join (select id, coalesce(t, '') || '' as t from wide where id < 3) s using (id);
$$);
rollback to settings;
diff --git a/src/test/regress/sql/predicate.sql b/src/test/regress/sql/predicate.sql
index db72b11bb22..0f92bb52435 100644
--- a/src/test/regress/sql/predicate.sql
+++ b/src/test/regress/sql/predicate.sql
@@ -308,3 +308,143 @@ EXPLAIN (COSTS OFF)
SELECT * FROM pred_tab WHERE (a::oid) IS NULL;
DROP TABLE pred_tab;
+
+--
+-- Test optimization of IS [NOT] DISTINCT FROM
+--
+
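+-- With a left-hand input known not null, "x IS [NOT] DISTINCT FROM c" can
+-- be simplified: a NULL constant folds the predicate to constant TRUE (or
+-- FALSE), and a non-NULL constant reduces it to plain <> (or =), which is
+-- indexable and merge/hash-joinable.
+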
+CREATE TYPE dist_row_t AS (a int, b int);
+CREATE TABLE dist_tab (id int, val_nn int NOT NULL, val_null int, row_nn dist_row_t NOT NULL);
+
+INSERT INTO dist_tab VALUES (1, 10, 10, ROW(1, 1));
+INSERT INTO dist_tab VALUES (2, 20, NULL, ROW(2, 2));
+INSERT INTO dist_tab VALUES (3, 30, 30, ROW(1, NULL));
+
+CREATE INDEX dist_tab_nn_idx ON dist_tab (val_nn);
+
+ANALYZE dist_tab;
+
+-- Ensure that the predicate folds to constant TRUE
+EXPLAIN(COSTS OFF)
+SELECT id FROM dist_tab WHERE val_nn IS DISTINCT FROM NULL::INT;
+SELECT id FROM dist_tab WHERE val_nn IS DISTINCT FROM NULL::INT;
+
+-- Ensure that the predicate folds to constant FALSE
+EXPLAIN(COSTS OFF)
+SELECT id FROM dist_tab WHERE val_nn IS NOT DISTINCT FROM NULL::INT;
+SELECT id FROM dist_tab WHERE val_nn IS NOT DISTINCT FROM NULL::INT;
+
+-- Ensure that the predicate is converted to an inequality operator
+EXPLAIN (COSTS OFF)
+SELECT id FROM dist_tab WHERE val_nn IS DISTINCT FROM 10;
+SELECT id FROM dist_tab WHERE val_nn IS DISTINCT FROM 10;
+
+-- Ensure that the predicate is converted to an equality operator, and thus can
+-- use index scan
+SET enable_seqscan TO off;
+EXPLAIN (COSTS OFF)
+SELECT id FROM dist_tab WHERE val_nn IS NOT DISTINCT FROM 10;
+SELECT id FROM dist_tab WHERE val_nn IS NOT DISTINCT FROM 10;
+RESET enable_seqscan;
+
+-- Ensure that the predicate is preserved as "IS DISTINCT FROM"
+EXPLAIN (COSTS OFF)
+SELECT id FROM dist_tab WHERE val_null IS DISTINCT FROM 20;
+SELECT id FROM dist_tab WHERE val_null IS DISTINCT FROM 20;
+
+-- Safety check for rowtypes
+-- Ensure that the predicate is converted to an inequality operator
+EXPLAIN (COSTS OFF)
+SELECT id FROM dist_tab WHERE row_nn IS DISTINCT FROM ROW(1, 5)::dist_row_t;
+-- ... and that all 3 rows are returned
+SELECT id FROM dist_tab WHERE row_nn IS DISTINCT FROM ROW(1, 5)::dist_row_t;
+
+-- Ensure that the predicate is converted to an equality operator, and thus
+-- mergejoinable or hashjoinable
+SET enable_nestloop TO off;
+EXPLAIN (COSTS OFF)
+SELECT * FROM dist_tab t1 JOIN dist_tab t2 ON t1.val_nn IS NOT DISTINCT FROM t2.val_nn;
+SELECT * FROM dist_tab t1 JOIN dist_tab t2 ON t1.val_nn IS NOT DISTINCT FROM t2.val_nn;
+RESET enable_nestloop;
+
+-- Ensure that the predicate is converted to IS NOT NULL
+EXPLAIN (COSTS OFF)
+SELECT id FROM dist_tab WHERE val_null IS DISTINCT FROM NULL::INT;
+SELECT id FROM dist_tab WHERE val_null IS DISTINCT FROM NULL::INT;
+
+-- Ensure that the predicate is converted to IS NULL
+EXPLAIN (COSTS OFF)
+SELECT id FROM dist_tab WHERE val_null IS NOT DISTINCT FROM NULL::INT;
+SELECT id FROM dist_tab WHERE val_null IS NOT DISTINCT FROM NULL::INT;
+
+-- Safety check for rowtypes
+-- The predicate is converted to IS NOT NULL, and get_rule_expr prints it as IS
+-- DISTINCT FROM because argisrow is false, indicating that we're applying a
+-- scalar test
+EXPLAIN (COSTS OFF)
+SELECT id FROM dist_tab WHERE (val_null, val_null) IS DISTINCT FROM NULL::RECORD;
+SELECT id FROM dist_tab WHERE (val_null, val_null) IS DISTINCT FROM NULL::RECORD;
+
+-- The predicate is converted to IS NULL, and get_rule_expr prints it as IS NOT
+-- DISTINCT FROM because argisrow is false, indicating that we're applying a
+-- scalar test
+EXPLAIN (COSTS OFF)
+SELECT id FROM dist_tab WHERE (val_null, val_null) IS NOT DISTINCT FROM NULL::RECORD;
+SELECT id FROM dist_tab WHERE (val_null, val_null) IS NOT DISTINCT FROM NULL::RECORD;
+
+DROP TABLE dist_tab;
+DROP TYPE dist_row_t;
+
+--
+-- Test optimization of BooleanTest (IS [NOT] TRUE/FALSE/UNKNOWN) on
+-- non-nullable input
+--
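+-- For non-nullable x, "x IS UNKNOWN" folds to constant FALSE and "x IS NOT
+-- UNKNOWN" to constant TRUE; "x IS TRUE" / "x IS NOT FALSE" fold to just x,
+-- while "x IS FALSE" / "x IS NOT TRUE" fold to NOT x.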
+CREATE TABLE bool_tab (id int, flag_nn boolean NOT NULL, flag_null boolean);
+
+INSERT INTO bool_tab VALUES (1, true, true);
+INSERT INTO bool_tab VALUES (2, false, NULL);
+
+CREATE INDEX bool_tab_nn_idx ON bool_tab (flag_nn);
+
+ANALYZE bool_tab;
+
+-- Ensure that the predicate folds to constant FALSE
+EXPLAIN (COSTS OFF)
+SELECT id FROM bool_tab WHERE flag_nn IS UNKNOWN;
+SELECT id FROM bool_tab WHERE flag_nn IS UNKNOWN;
+
+-- Ensure that the predicate folds to constant TRUE
+EXPLAIN (COSTS OFF)
+SELECT id FROM bool_tab WHERE flag_nn IS NOT UNKNOWN;
+SELECT id FROM bool_tab WHERE flag_nn IS NOT UNKNOWN;
+
+-- Ensure that the predicate folds to flag_nn
+EXPLAIN (COSTS OFF)
+SELECT id FROM bool_tab WHERE flag_nn IS TRUE;
+SELECT id FROM bool_tab WHERE flag_nn IS TRUE;
+
+-- Ensure that the predicate folds to flag_nn, and thus can use index scan
+SET enable_seqscan TO off;
+EXPLAIN (COSTS OFF)
+SELECT id FROM bool_tab WHERE flag_nn IS NOT FALSE;
+SELECT id FROM bool_tab WHERE flag_nn IS NOT FALSE;
+RESET enable_seqscan;
+
+-- Ensure that the predicate folds to not flag_nn
+EXPLAIN (COSTS OFF)
+SELECT id FROM bool_tab WHERE flag_nn IS FALSE;
+SELECT id FROM bool_tab WHERE flag_nn IS FALSE;
+
+-- Ensure that the predicate folds to not flag_nn, and thus can use index scan
+SET enable_seqscan TO off;
+EXPLAIN (COSTS OFF)
+SELECT id FROM bool_tab WHERE flag_nn IS NOT TRUE;
+SELECT id FROM bool_tab WHERE flag_nn IS NOT TRUE;
+RESET enable_seqscan;
+
+-- Ensure that the predicate is preserved as a BooleanTest
+EXPLAIN (COSTS OFF)
+SELECT id FROM bool_tab WHERE flag_null IS UNKNOWN;
+SELECT id FROM bool_tab WHERE flag_null IS UNKNOWN;
+
+DROP TABLE bool_tab;
diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql
index 76ee9d29c08..9dcce3440c8 100644
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -1866,3 +1866,21 @@ SELECT * FROM check_estimated_rows('SELECT * FROM sb_2 WHERE numeric_lt(y, 1.0)'
-- Tidy up
DROP TABLE sb_1, sb_2 CASCADE;
+
+-- Check statistics generated for range type and expressions.
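+-- For ranges, ANALYZE stores a length histogram plus the fraction of empty
+-- ranges (stakind 6) and a bounds histogram (stakind 7); pg_stats_ext_exprs
+-- exposes these as range_length_histogram, range_empty_frac and
+-- range_bounds_histogram.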
+CREATE TABLE stats_ext_tbl_range(name text, irange int4range);
+INSERT INTO stats_ext_tbl_range VALUES
+ ('red', '[1,7)'::int4range),
+ ('blue', '[2,8]'::int4range),
+ ('green', '[3,9)'::int4range);
+CREATE STATISTICS stats_ext_range (mcv)
+ ON irange, (irange + '[4,10)'::int4range)
+ FROM stats_ext_tbl_range;
+ANALYZE stats_ext_tbl_range;
+SELECT attnames, most_common_vals
+ FROM pg_stats_ext
+ WHERE statistics_name = 'stats_ext_range';
+SELECT range_length_histogram, range_empty_frac, range_bounds_histogram
+ FROM pg_stats_ext_exprs
+ WHERE statistics_name = 'stats_ext_range';
+DROP TABLE stats_ext_tbl_range;
diff --git a/src/test/subscription/t/001_rep_changes.pl b/src/test/subscription/t/001_rep_changes.pl
index d7e62e4d488..7d41715ed81 100644
--- a/src/test/subscription/t/001_rep_changes.pl
+++ b/src/test/subscription/t/001_rep_changes.pl
@@ -353,7 +353,8 @@
# Note that the current location of the log file is not grabbed immediately
# after reloading the configuration, but after sending one SQL command to
# the node so as we are sure that the reloading has taken effect.
-my $log_location = -s $node_subscriber->logfile;
+my $log_location_pub = -s $node_publisher->logfile;
+my $log_location_sub = -s $node_subscriber->logfile;
$node_publisher->safe_psql('postgres',
"UPDATE tab_full_pk SET b = 'quux' WHERE a = 1");
@@ -363,7 +364,7 @@
$node_publisher->wait_for_catchup('tap_sub');
-my $logfile = slurp_file($node_subscriber->logfile, $log_location);
+my $logfile = slurp_file($node_subscriber->logfile, $log_location_sub);
like(
$logfile,
qr/conflict detected on relation "public.tab_full_pk": conflict=update_missing.*\n.*DETAIL:.* Could not find the row to be updated: remote row \(1, quux\), replica identity \(a\)=\(1\)/m,
@@ -445,11 +446,12 @@
#
# First, confirm that no such QUERY STATISTICS message appears before enabling
# log_statement_stats.
-$logfile = slurp_file($node_publisher->logfile, $log_location);
+$logfile = slurp_file($node_publisher->logfile, $log_location_pub);
unlike(
$logfile,
qr/QUERY STATISTICS/,
'log_statement_stats has not been enabled yet');
+$log_location_pub = -s $node_publisher->logfile;
# check that change of connection string and/or publication list causes
# restart of subscription workers. We check the state along with
@@ -476,7 +478,7 @@
# Check that the expected QUERY STATISTICS message appears,
# which shows that log_statement_stats=on from the CONNECTION string
# was correctly passed through to and honored by the walsender.
-$logfile = slurp_file($node_publisher->logfile, $log_location);
+$logfile = slurp_file($node_publisher->logfile, $log_location_pub);
like(
$logfile,
qr/QUERY STATISTICS/,
@@ -538,13 +540,13 @@
# Note that the current location of the log file is not grabbed immediately
# after reloading the configuration, but after sending one SQL command to
# the node so that we are sure that the reloading has taken effect.
-$log_location = -s $node_publisher->logfile;
+$log_location_pub = -s $node_publisher->logfile;
$node_publisher->safe_psql('postgres', "INSERT INTO tab_notrep VALUES (11)");
$node_publisher->wait_for_catchup('tap_sub');
-$logfile = slurp_file($node_publisher->logfile, $log_location);
+$logfile = slurp_file($node_publisher->logfile, $log_location_pub);
like(
$logfile,
qr/skipped replication of an empty transaction with XID/,
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 34374df0d67..d5b85001752 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -125,6 +125,7 @@ AnlIndexData
AnyArrayType
Append
AppendPath
+AppendPathInput
AppendRelInfo
AppendState
ApplyErrorCallbackArg
@@ -710,6 +711,7 @@ EachState
Edge
EditableObjectType
ElementsState
+ElidedNode
EnableTimeoutParams
EndDataPtrType
EndDirectModify_function
@@ -803,6 +805,7 @@ ExtensibleNodeMethods
ExtensionControlFile
ExtensionInfo
ExtensionLocation
+ExtensionSiblingCache
ExtensionVersionInfo
FDWCollateState
FD_SET
@@ -2488,6 +2491,7 @@ RecordCacheArrayEntry
RecordCacheEntry
RecordCompareData
RecordIOData
+RecoveryConflictReason
RecoveryLockEntry
RecoveryLockXidEntry
RecoveryPauseState
@@ -2804,6 +2808,7 @@ SharedTypmodTableEntry
Sharedsort
ShellTypeInfo
ShippableCacheEntry
ShippableCacheKey
+ShmemAllocatorData
ShmemIndexEnt
ShutdownForeignScan_function
@@ -2928,6 +2933,7 @@ SubLink
SubLinkType
SubOpts
SubPlan
+SubPlanRTInfo
SubPlanState
SubRelInfo
SubRemoveRels
@@ -3736,6 +3742,7 @@ gistxlogPageReuse
gistxlogPageSplit
gistxlogPageUpdate
grouping_sets_data
+growable_trgm_array
gseg_picksplit_item
gss_OID_set
gss_buffer_desc
@@ -3974,6 +3981,45 @@ pg_uuid_t
pg_wchar
pg_wchar_tbl
pgp_armor_headers_state
+pgpa_advice_item
+pgpa_advice_tag_type
+pgpa_advice_target
+pgpa_collected_advice
+pgpa_identifier
+pgpa_index_target
+pgpa_index_type
+pgpa_itm_type
+pgpa_jo_outcome
+pgpa_join_class
+pgpa_join_member
+pgpa_join_state
+pgpa_join_strategy
+pgpa_join_unroller
+pgpa_local_advice
+pgpa_local_advice_chunk
+pgpa_output_context
+pgpa_plan_walker_context
+pgpa_planner_state
+pgpa_qf_type
+pgpa_query_feature
+pgpa_ri_checker
+pgpa_ri_checker_key
+pgpa_scan
+pgpa_scan_strategy
+pgpa_shared_advice
+pgpa_shared_advice_chunk
+pgpa_shared_state
+pgpa_sj_unique_rel
+pgpa_target_type
+pgpa_trove
+pgpa_trove_entry
+pgpa_trove_entry_element
+pgpa_trove_entry_hash
+pgpa_trove_entry_key
+pgpa_trove_lookup_type
+pgpa_trove_result
+pgpa_trove_slice
+pgpa_unrolled_join
pgsocket
pgsql_thing_t
pgssEntry