From 2e72144ddb7fcbf21ad6eb5a928204a6aeadaa5c Mon Sep 17 00:00:00 2001 From: Marty Kulma <18468315+martykulma@users.noreply.github.com> Date: Tue, 31 Mar 2026 09:35:47 -0400 Subject: [PATCH 1/3] PG doc: add note on statistics and snapshotting in PG considerations --- doc/user/data/postgres_source_details.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/doc/user/data/postgres_source_details.yml b/doc/user/data/postgres_source_details.yml index ad84519263622..b9bfa14c4d2d0 100644 --- a/doc/user/data/postgres_source_details.yml +++ b/doc/user/data/postgres_source_details.yml @@ -212,6 +212,21 @@ DELETE FROM t; ``` +- name: postgres-snapshot-behavior + content: | + The PostgreSQL source performs parallel snapshot of tables, distributing rows among workers + using ranges of + [`CTID`](https://www.postgresql.org/docs/current/ddl-system-columns.html#DDL-SYSTEM-COLUMNS-CTID). + Materialize uses [estimates](https://www.postgresql.org/docs/current/row-estimation-examples.html) + for the amount of data and rows that will be read. Missing or stale statistics will result in + an uneven distribution of work, decreasing the potential speed gains. In addition, statistics on + snapshot completion will be incorrect in the console. + + It is recommended to ensure that statistics are current before creating the source in + Materialize. This can be accomplished manually via PostgreSQL's + [`ANALYZE`](https://www.postgresql.org/docs/current/sql-analyze.html) command. 
+ + - name: postgres-considerations content: | ### Schema changes @@ -279,3 +294,8 @@ ### Modifying an existing source {{% include-headless "/headless/alter-source-snapshot-blocking-behavior" %}} + + ### Snapshotting + + {{% include-from-yaml data="postgres_source_details" + name="postgres-snapshot-behavior" %}} From 35c9ce585043c7ef0d97405b24f2aec78ae10f08 Mon Sep 17 00:00:00 2001 From: Marty Kulma <18468315+martykulma@users.noreply.github.com> Date: Tue, 31 Mar 2026 11:16:34 -0400 Subject: [PATCH 2/3] Reword from Kay --- doc/user/data/postgres_source_details.yml | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/doc/user/data/postgres_source_details.yml b/doc/user/data/postgres_source_details.yml index b9bfa14c4d2d0..a585eea60e716 100644 --- a/doc/user/data/postgres_source_details.yml +++ b/doc/user/data/postgres_source_details.yml @@ -214,18 +214,17 @@ - name: postgres-snapshot-behavior content: | - The PostgreSQL source performs parallel snapshot of tables, distributing rows among workers - using ranges of + The PostgreSQL source performs parallel snapshotting of tables by distributing rows among + workers using ranges of [`CTID`](https://www.postgresql.org/docs/current/ddl-system-columns.html#DDL-SYSTEM-COLUMNS-CTID). - Materialize uses [estimates](https://www.postgresql.org/docs/current/row-estimation-examples.html) - for the amount of data and rows that will be read. Missing or stale statistics will result in - an uneven distribution of work, decreasing the potential speed gains. In addition, statistics on - snapshot completion will be incorrect in the console. - - It is recommended to ensure that statistics are current before creating the source in - Materialize. This can be accomplished manually via PostgreSQL's - [`ANALYZE`](https://www.postgresql.org/docs/current/sql-analyze.html) command. 
- + Materialize uses + [PostgreSQL statistics to estimate](https://www.postgresql.org/docs/current/row-estimation-examples.html) + the amount of data and number of rows to read. Missing or stale statistics can result in uneven + work distribution, reducing snapshot performance. They can also cause incorrect snapshot + progress reporting in the Console. + + To avoid this situation, before creating the source in Materialize, ensure statistics are up to + date by running the PostgreSQL `ANALYZE` command. - name: postgres-considerations content: | From 8b94b79c23a73e97d1e54983945dc333be81a8ac Mon Sep 17 00:00:00 2001 From: Marty Kulma <18468315+martykulma@users.noreply.github.com> Date: Tue, 31 Mar 2026 20:20:58 -0400 Subject: [PATCH 3/3] Additional tip --- .../examples/ingest_data/postgres/create_source_cloud.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/user/data/examples/ingest_data/postgres/create_source_cloud.yml b/doc/user/data/examples/ingest_data/postgres/create_source_cloud.yml index 6f9e346a3436a..73ddf53810c30 100644 --- a/doc/user/data/examples/ingest_data/postgres/create_source_cloud.yml +++ b/doc/user/data/examples/ingest_data/postgres/create_source_cloud.yml @@ -76,6 +76,13 @@ - name: "ingest-data-step" description: | + {{< tip >}} + When snapshotting, Materialize uses PostgreSQL statistics to estimate the amount of data and + number of rows to read. Before creating the source in Materialize, ensure that the PostgreSQL + statistics are up to date by running PostgreSQL `ANALYZE`. See + [Snapshotting considerations](#snapshotting) for more information. + {{< /tip >}} + {{< tabs >}} {{< tab "Legacy Syntax" >}} #### Legacy syntax