From f4a33ecc80ae189e41779ac30f192afe5949f0ae Mon Sep 17 00:00:00 2001 From: Leijurv Date: Sun, 31 May 2026 18:39:21 -0700 Subject: [PATCH 1/3] grouped linemerge --- CMakeLists.txt | 1 + flex-config/gen/grouped-linemerge.lua | 106 ++++++++ src/gen/gen-create.cpp | 5 + src/gen/gen-grouped-linemerge.cpp | 358 ++++++++++++++++++++++++++ src/gen/gen-grouped-linemerge.hpp | 67 +++++ tests/CMakeLists.txt | 9 +- tests/test-gen-grouped-linemerge.cpp | 328 +++++++++++++++++++++++ 7 files changed, 872 insertions(+), 2 deletions(-) create mode 100644 flex-config/gen/grouped-linemerge.lua create mode 100644 src/gen/gen-grouped-linemerge.cpp create mode 100644 src/gen/gen-grouped-linemerge.hpp create mode 100644 tests/test-gen-grouped-linemerge.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index ea277eeb2..ee70c7335 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -264,6 +264,7 @@ else() src/gen/gen-base.cpp src/gen/gen-create.cpp src/gen/gen-discrete-isolation.cpp + src/gen/gen-grouped-linemerge.cpp src/gen/gen-rivers.cpp src/gen/gen-tile-builtup.cpp src/gen/gen-tile-raster.cpp diff --git a/flex-config/gen/grouped-linemerge.lua b/flex-config/gen/grouped-linemerge.lua new file mode 100644 index 000000000..7d772355e --- /dev/null +++ b/flex-config/gen/grouped-linemerge.lua @@ -0,0 +1,106 @@ +-- This config example file is released into the Public Domain. +-- +-- This Lua config demonstrates the 'grouped-linemerge' generalization. It +-- merges connected lines that share the same set of grouping columns into +-- single (multi-)lines, the equivalent of +-- +-- SELECT cols..., (ST_Dump(ST_LineMerge(ST_Collect(geom)))).geom +-- FROM roads GROUP BY cols... +-- +-- but done globally and maintained incrementally on updates. A typical use is +-- merging road segments that render identically (same name/ref/highway/layer) +-- so that labels and route shields are placed on the whole road instead of on +-- each individual OSM way, without the artifacts you get when merging only +-- within a tile. +-- +-- NOTE THAT THE GENERALIZATION SUPPORT IS EXPERIMENTAL AND MIGHT CHANGE +-- WITHOUT NOTICE! +-- +-- Workflow: +-- * Import as usual: osm2pgsql -O flex -S grouped-linemerge.lua DATA.osm.pbf +-- * Build the merged table: osm2pgsql-gen -S grouped-linemerge.lua +-- * Apply an update: osm2pgsql -a -O flex -S grouped-linemerge.lua CHANGES.osc.gz +-- * Update the merged table: osm2pgsql-gen -a -S grouped-linemerge.lua + +-- An expire output records which tiles changed during an update. The +-- grouped-linemerge generalization uses it only as a seed for "where did line +-- geometry change" - it then walks each affected connected component out from +-- there and re-merges it. Use a high maxzoom so the seed regions are small. +local exp_roads = osm2pgsql.define_expire_output({ + maxzoom = 18, + table = 'exp_roads', +}) + +-- The source table with the original road segments (one row per OSM way). +local roads = osm2pgsql.define_table({ + name = 'roads', + ids = { type = 'way', id_column = 'way_id' }, + columns = { + { column = 'name', type = 'text' }, + { column = 'ref', type = 'text' }, + { column = 'highway', type = 'text' }, + { column = 'layer', type = 'int' }, + -- Attach the expire output to the geometry so that any change to a + -- road's geometry (add/modify/delete) expires the tiles it covers. + { column = 'geom', type = 'linestring', not_null = true, + expire = { { output = exp_roads } } }, + } +}) + +-- The destination table with the merged roads. Its columns are exactly the +-- grouping columns plus the geometry. It has no OSM id column (it is derived +-- data maintained by osm2pgsql-gen, not by the normal update process); the +-- warning osm2pgsql prints about that is expected. +osm2pgsql.define_table({ + name = 'roads_merged', + columns = { + { column = 'name', type = 'text' }, + { column = 'ref', type = 'text' }, + { column = 'highway', type = 'text' }, + { column = 'layer', type = 'int' }, + { column = 'geom', type = 'linestring', not_null = true }, + } +}) + +function osm2pgsql.process_way(object) + local highway = object.tags.highway + if not highway then + return + end + roads:insert({ + name = object.tags.name, + ref = object.tags.ref, + highway = highway, + layer = tonumber(object.tags.layer), + geom = object:as_linestring(), + }) +end + +function osm2pgsql.process_gen() + osm2pgsql.run_gen('grouped-linemerge', { + name = 'roads', -- name (for logging) + debug = false, -- set to true for more detailed debug output + src_table = 'roads', -- input table with the line segments + dest_table = 'roads_merged', -- output table for the merged lines + geom_column = 'geom', -- geometry column (same in src and dest) + + -- Lines are merged when ALL of these columns are equal (NULLs compare + -- equal). Pass them as a comma-separated list. + group_by_columns = 'name, ref, highway, layer', + + -- Optional pre-filter (SQL boolean expression on the source columns). + -- Lines not matching are completely excluded from the generalization. + -- Here we only merge roads that carry a label or a shield. + where = 'name IS NOT NULL OR ref IS NOT NULL', + + -- In append mode, where to read the expired tiles from, and the zoom + -- level they were captured at (must match the expire output's maxzoom). + expire_list = 'exp_roads', + zoom = 18, + + -- Create functional endpoint indexes on the src/dest tables in create + -- mode. These make the incremental component walk fast. Set to false + -- if you manage the indexes yourself. + create_indexes = true, + }) +end diff --git a/src/gen/gen-create.cpp b/src/gen/gen-create.cpp index 55200b593..37b28de48 100644 --- a/src/gen/gen-create.cpp +++ b/src/gen/gen-create.cpp @@ -12,6 +12,7 @@ #include "format.hpp" #include "gen-base.hpp" #include "gen-discrete-isolation.hpp" +#include "gen-grouped-linemerge.hpp" #include "gen-rivers.hpp" #include "gen-tile-builtup.hpp" #include "gen-tile-raster.hpp" @@ -31,6 +32,10 @@ std::unique_ptr create_generalizer(std::string const &strategy, if (strategy == "discrete-isolation") { return std::make_unique(connection, append, params); } + if (strategy == "grouped-linemerge") { + return std::make_unique(connection, append, + params); + } if (strategy == "raster-union") { return std::make_unique(connection, append, params); diff --git a/src/gen/gen-grouped-linemerge.cpp b/src/gen/gen-grouped-linemerge.cpp new file mode 100644 index 000000000..435955e0e --- /dev/null +++ b/src/gen/gen-grouped-linemerge.cpp @@ -0,0 +1,358 @@ +/** + * SPDX-License-Identifier: GPL-2.0-or-later + * + * This file is part of osm2pgsql (https://osm2pgsql.org/). + * + * Copyright (C) 2006-2026 by the osm2pgsql developer community. + * For a full list of authors see the git log. + */ + +#include "gen-grouped-linemerge.hpp" + +#include "format.hpp" +#include "logging.hpp" +#include "params.hpp" +#include "pgsql.hpp" + +#include + +#include +#include + +namespace { + +std::string trim(std::string const &str) +{ + auto const begin = str.find_first_not_of(" \t\n\r"); + if (begin == std::string::npos) { + return {}; + } + auto const end = str.find_last_not_of(" \t\n\r"); + return str.substr(begin, end - begin + 1); +} + +} // anonymous namespace + +gen_grouped_linemerge_t::gen_grouped_linemerge_t(pg_conn_t *connection, + bool append, params_t *params) +: gen_base_t(connection, append, params), m_timer_merge(add_timer("merge")), + m_timer_walk(add_timer("walk")), m_timer_delete(add_timer("delete")) +{ + check_src_dest_table_params_exist(); + + // Parse the comma-separated list of grouping columns into the SQL + // fragments we need: + // * group_cols - quoted list "a", "b" (SELECT/GROUP BY/INSERT) + // * group_cols_l - l."a", l."b" (read from source alias) + // * group_cols_l_gk - l."a" AS "gk_a", ... (frontier of the walk) + // * group_cols_gk - "a" AS "gk_a", ... (seed of the walk) + // * group_join - l."a" IS NOT DISTINCT FROM n."gk_a" AND ... + // The walk's frontier table carries the grouping values under "gk_"- + // prefixed names so that an (optional) user 'where' filter, which uses the + // unqualified source column names, is never ambiguous against the frontier. + // IS NOT DISTINCT FROM makes NULLs (e.g. an unnamed road) compare equal, + // matching the GROUP BY semantics. + auto const group_by = get_params().get_string("group_by_columns", ""); + if (group_by.empty()) { + throw fmt_error("Missing 'group_by_columns' parameter in" + " generalizer{}.", + context()); + } + + std::string group_cols; + std::string group_cols_l; + std::string group_cols_l_gk; + std::string group_cols_gk; + std::string group_join; // l (source) vs n (reached node, gk_ prefixed) + std::string group_join_dn; // d (dest) vs n (reached node, gk_ prefixed) + bool first = true; + for (auto const &raw : osmium::split_string(group_by, ',')) { + auto const col = trim(raw); + if (col.empty()) { + continue; + } + check_identifier(col, "group_by_columns"); + if (!first) { + group_cols += ", "; + group_cols_l += ", "; + group_cols_l_gk += ", "; + group_cols_gk += ", "; + group_join += " AND "; + group_join_dn += " AND "; + } + group_cols += fmt::format(R"("{}")", col); + group_cols_l += fmt::format(R"(l."{}")", col); + group_cols_l_gk += fmt::format(R"(l."{0}" AS "gk_{0}")", col); + group_cols_gk += fmt::format(R"("{0}" AS "gk_{0}")", col); + group_join += + fmt::format(R"(l."{0}" IS NOT DISTINCT FROM n."gk_{0}")", col); + group_join_dn += + fmt::format(R"(d."{0}" IS NOT DISTINCT FROM n."gk_{0}")", col); + first = false; + } + if (first) { + throw fmt_error("Parameter 'group_by_columns' is empty in" + " generalizer{}.", + context()); + } + + params->set("group_cols", group_cols); + params->set("group_cols_l", group_cols_l); + params->set("group_cols_l_gk", group_cols_l_gk); + params->set("group_cols_gk", group_cols_gk); + params->set("group_join", group_join); + params->set("group_join_dn", group_join_dn); + + // Optional pre-filter. Lines not matching this are completely excluded + // from the generalization (they never enter the destination table and are + // never walked). The filter is a SQL boolean expression on the source + // columns. We default to 'true' so it composes everywhere, and build a + // matching predicate for the (partial) endpoint indexes. + auto const filter = get_params().get_string("where", ""); + params->set("where", filter.empty() ? std::string{"true"} + : "(" + filter + ")"); + params->set("index_predicate", + filter.empty() ? std::string{} : "WHERE (" + filter + ")"); + + // Names for the functional endpoint indexes optionally created on the + // source table in create mode (and needed for fast walks in append mode). + auto const src_table = get_params().get_identifier("src_table"); + params->set("idx_startpt", src_table + "_glm_startpt"); + params->set("idx_endpt", src_table + "_glm_endpt"); + + // The append-mode delete looks up destination rows by their endpoint + // coordinates, so the destination gets matching functional indexes too. + auto const dest_table = get_params().get_identifier("dest_table"); + params->set("idx_dest_startpt", dest_table + "_glm_startpt"); + params->set("idx_dest_endpt", dest_table + "_glm_endpt"); + + if (append_mode()) { + if (!get_params().has("expire_list")) { + throw fmt_error("Missing 'expire_list' parameter in" + " generalizer{} (required in append mode).", + context()); + } + if (!get_params().has("zoom")) { + throw fmt_error("Missing 'zoom' parameter in generalizer{}" + " (required in append mode).", + context()); + } + auto const zoom = get_params().get_int64("zoom"); + if (zoom < 0 || zoom > 20) { + throw fmt_error("Invalid value '{}' for 'zoom' parameter in" + " generalizer{}.", + zoom, context()); + } + params->set("expire", qualified_name(get_params().get_identifier( + "schema"), + get_params().get_identifier( + "expire_list"))); + } +} + +void gen_grouped_linemerge_t::process() +{ + if (append_mode()) { + process_append(); + } else { + process_create(); + } +} + +void gen_grouped_linemerge_t::process_create() +{ + if (get_params().get_bool("create_indexes", true)) { + log_gen("Creating endpoint indexes on source table..."); + dbexec( + R"(CREATE INDEX IF NOT EXISTS "{idx_startpt}" ON {src} USING btree)" + R"( (ST_X(ST_StartPoint("{geom_column}")),)" + R"( ST_Y(ST_StartPoint("{geom_column}"))) {index_predicate})"); + dbexec( + R"(CREATE INDEX IF NOT EXISTS "{idx_endpt}" ON {src} USING btree)" + R"( (ST_X(ST_EndPoint("{geom_column}")),)" + R"( ST_Y(ST_EndPoint("{geom_column}"))) {index_predicate})"); + } + + log_gen("Merging lines by group..."); + timer(m_timer_merge).start(); + connection().exec("BEGIN"); + dbexec("TRUNCATE {dest}"); + auto const result = dbexec(R"( +INSERT INTO {dest} ({group_cols}, "{geom_column}") + SELECT {group_cols}, + (ST_Dump(ST_LineMerge(ST_Collect("{geom_column}" + ORDER BY "{geom_column}")))).geom + FROM {src} + WHERE {where} + GROUP BY {group_cols} +)"); + connection().exec("COMMIT"); + timer(m_timer_merge).stop(); + log_gen("Inserted {} merged linestrings.", result.affected_rows()); + + dbexec("ANALYZE {dest}"); + + if (get_params().get_bool("create_indexes", true)) { + log_gen("Creating endpoint indexes on destination table..."); + dbexec(R"(CREATE INDEX IF NOT EXISTS "{idx_dest_startpt}" ON {dest})" + R"( USING btree (ST_X(ST_StartPoint("{geom_column}")),)" + R"( ST_Y(ST_StartPoint("{geom_column}"))))"); + dbexec(R"(CREATE INDEX IF NOT EXISTS "{idx_dest_endpt}" ON {dest})" + R"( USING btree (ST_X(ST_EndPoint("{geom_column}")),)" + R"( ST_Y(ST_EndPoint("{geom_column}"))))"); + } +} + +void gen_grouped_linemerge_t::process_append() +{ + connection().exec("BEGIN"); + + // Step 1: Consume the expire list for our zoom level and turn the expired + // tiles into a set of envelopes describing the changed region(s). + dbexec(R"( +CREATE TEMP TABLE _glm_region ON COMMIT DROP AS + WITH expired AS ( + DELETE FROM {expire} WHERE zoom = {zoom} RETURNING x, y + ) + SELECT ST_TileEnvelope({zoom}, x, y) AS env FROM expired +)"); + + auto const region_count = dbexec("SELECT count(*) FROM _glm_region"); + if (std::strtoll(region_count.get_value(0, 0), nullptr, 10) == 0) { + log_gen("No expired tiles, nothing to do."); + connection().exec("COMMIT"); + return; + } + + dbexec("CREATE INDEX ON _glm_region USING gist (env)"); + dbexec("ANALYZE _glm_region"); + + // Step 2: Find the nodes (endpoint coordinates) of every connected + // component touched by the changed region. We seed from the lines that + // intersect the region and walk out along shared endpoints, staying within + // the same grouping key, until each connected component is fully explored. + // The walk is on exact endpoint coordinates (so the functional btree + // indexes can be used) and de-duplicates (group, x, y) tuples, which + // guarantees termination. The 'where' filter is applied everywhere a + // source row is read, so excluded lines neither seed nor extend a + // component. The node coordinates are carried under "gk_"-prefixed names + // so the unqualified 'where' filter is never ambiguous against them. + timer(m_timer_walk).start(); + dbexec(R"( +CREATE TEMP TABLE _glm_nodes ON COMMIT DROP AS +WITH RECURSIVE +seeds AS ( + SELECT {group_cols_l}, l."{geom_column}" + FROM {src} l + WHERE {where} + AND EXISTS ( + SELECT 1 FROM _glm_region r + WHERE l."{geom_column}" && r.env + AND ST_Intersects(l."{geom_column}", r.env) + ) +), +nodes AS ( + SELECT b.* FROM ( + SELECT {group_cols_gk}, + ST_X(ST_StartPoint("{geom_column}")) AS x, + ST_Y(ST_StartPoint("{geom_column}")) AS y + FROM seeds + UNION + SELECT {group_cols_gk}, + ST_X(ST_EndPoint("{geom_column}")), ST_Y(ST_EndPoint("{geom_column}")) + FROM seeds + ) b + UNION + SELECT {group_cols_l_gk}, + CASE WHEN ST_X(ST_StartPoint(l."{geom_column}")) = n.x + AND ST_Y(ST_StartPoint(l."{geom_column}")) = n.y + THEN ST_X(ST_EndPoint(l."{geom_column}")) + ELSE ST_X(ST_StartPoint(l."{geom_column}")) END, + CASE WHEN ST_X(ST_StartPoint(l."{geom_column}")) = n.x + AND ST_Y(ST_StartPoint(l."{geom_column}")) = n.y + THEN ST_Y(ST_EndPoint(l."{geom_column}")) + ELSE ST_Y(ST_StartPoint(l."{geom_column}")) END + FROM nodes n + JOIN {src} l + ON {group_join} + AND ( (ST_X(ST_StartPoint(l."{geom_column}")) = n.x + AND ST_Y(ST_StartPoint(l."{geom_column}")) = n.y) + OR (ST_X(ST_EndPoint(l."{geom_column}")) = n.x + AND ST_Y(ST_EndPoint(l."{geom_column}")) = n.y) ) + AND {where} +) +SELECT * FROM nodes +)"); + dbexec("ANALYZE _glm_nodes"); + + // Step 3: Collect the actual member lines of those components: every source + // line of the right group that touches a reached node at one of its + // endpoints. + auto const ways = dbexec(R"( +CREATE TEMP TABLE _glm_ways ON COMMIT DROP AS +SELECT DISTINCT ON (l.ctid) {group_cols_l}, l."{geom_column}" + FROM _glm_nodes n + JOIN {src} l + ON {group_join} + AND ( (ST_X(ST_StartPoint(l."{geom_column}")) = n.x + AND ST_Y(ST_StartPoint(l."{geom_column}")) = n.y) + OR (ST_X(ST_EndPoint(l."{geom_column}")) = n.x + AND ST_Y(ST_EndPoint(l."{geom_column}")) = n.y) ) + AND {where} + ORDER BY l.ctid +)"); + timer(m_timer_walk).stop(); + log_gen("Collected {} member lines in affected components.", + ways.affected_rows()); + + // Step 4: Delete the existing merged outputs of the touched components. + // A merged output is part of a touched component if and only if one of its + // endpoints coincides exactly with a reached node of the same group (the + // walk only connects lines that share an endpoint exactly, so this matches + // the walk's notion of connectivity and will not delete a different + // component that merely crosses one mid-segment). This must use exact + // endpoint equality, not ST_Intersects, to avoid deleting unrelated + // same-group lines that cross a component. A second pass over the changed + // region cleans up outputs of components that disappeared entirely (all + // their lines deleted), which leave no reached nodes behind; that pass is + // self-correcting because anything it removes either vanished or has a + // surviving line in the region and is regenerated below. + timer(m_timer_delete).start(); + auto deleted = dbexec(R"( +DELETE FROM {dest} d + USING _glm_nodes n + WHERE {group_join_dn} + AND ( (ST_X(ST_StartPoint(d."{geom_column}")) = n.x + AND ST_Y(ST_StartPoint(d."{geom_column}")) = n.y) + OR (ST_X(ST_EndPoint(d."{geom_column}")) = n.x + AND ST_Y(ST_EndPoint(d."{geom_column}")) = n.y) ) +)"); + auto const deleted_by_nodes = deleted.affected_rows(); + deleted = dbexec(R"( +DELETE FROM {dest} d + WHERE EXISTS ( + SELECT 1 FROM _glm_region r + WHERE d."{geom_column}" && r.env + AND ST_Intersects(d."{geom_column}", r.env) + ) +)"); + timer(m_timer_delete).stop(); + log_gen("Deleted {} stale merged linestrings ({} by node, {} by region).", + deleted_by_nodes + deleted.affected_rows(), deleted_by_nodes, + deleted.affected_rows()); + + // Step 5: Regenerate the affected components from scratch. + timer(m_timer_merge).start(); + auto const inserted = dbexec(R"( +INSERT INTO {dest} ({group_cols}, "{geom_column}") + SELECT {group_cols}, + (ST_Dump(ST_LineMerge(ST_Collect("{geom_column}" + ORDER BY "{geom_column}")))).geom + FROM _glm_ways + GROUP BY {group_cols} +)"); + timer(m_timer_merge).stop(); + log_gen("Inserted {} merged linestrings.", inserted.affected_rows()); + + connection().exec("COMMIT"); +} diff --git a/src/gen/gen-grouped-linemerge.hpp b/src/gen/gen-grouped-linemerge.hpp new file mode 100644 index 000000000..ce1c4993c --- /dev/null +++ b/src/gen/gen-grouped-linemerge.hpp @@ -0,0 +1,67 @@ +#ifndef OSM2PGSQL_GEN_GROUPED_LINEMERGE_HPP +#define OSM2PGSQL_GEN_GROUPED_LINEMERGE_HPP + +/** + * SPDX-License-Identifier: GPL-2.0-or-later + * + * This file is part of osm2pgsql (https://osm2pgsql.org/). + * + * Copyright (C) 2006-2026 by the osm2pgsql developer community. + * For a full list of authors see the git log. + */ + +#include "gen-base.hpp" + +#include +#include + +class params_t; +class pg_conn_t; + +/** + * Generalization strategy "grouped-linemerge". + * + * Globally merges connected LineStrings that share the same set of grouping + * column values, equivalent to + * + * INSERT INTO dest (cols..., geom) + * SELECT cols..., (ST_Dump(ST_LineMerge(ST_Collect(geom)))).geom + * FROM src GROUP BY cols...; + * + * Unlike the tile-based strategies this does NOT clip to tiles and the + * destination geometries are global merged lines, not tile-keyed rows. + * + * In append (update) mode the work is done incrementally and locally: the + * expire table (populated by osm2pgsql during the update) is used only as a + * seed for "where did line geometry change". For every changed region we + * walk the connected component(s) of matching lines out from the seed (via a + * recursive query), delete the merged outputs that overlap the region and + * regenerate them from scratch. This keeps each update bounded to the local + * connected component instead of re-merging the whole planet. + */ +class gen_grouped_linemerge_t : public gen_base_t +{ +public: + gen_grouped_linemerge_t(pg_conn_t *connection, bool append, + params_t *params); + + void process() override; + + std::string_view strategy() const noexcept override + { + return "grouped-linemerge"; + } + +private: + /// Build the whole dest table from scratch (create mode). + void process_create(); + + /// Incrementally update the dest table from the expire list (append mode). + void process_append(); + + std::size_t m_timer_merge; + std::size_t m_timer_walk; + std::size_t m_timer_delete; +}; + +#endif // OSM2PGSQL_GEN_GROUPED_LINEMERGE_HPP diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 93f022e7a..1f8e11482 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -5,9 +5,9 @@ # name - Name of test (source file without suffix) # labels - optional labels for the test function(set_test test_name) - cmake_parse_arguments(test_param "" "" "LABELS" ${ARGN}) + cmake_parse_arguments(test_param "" "" "LABELS;EXTRA_SOURCES" ${ARGN}) - add_executable(${test_name} ${test_name}.cpp) + add_executable(${test_name} ${test_name}.cpp ${test_param_EXTRA_SOURCES}) target_link_libraries(${test_name} osm2pgsql_lib catch_main_lib) add_test(NAME ${test_name} COMMAND ${test_name}) @@ -41,6 +41,11 @@ target_compile_features(catch_main_lib PUBLIC cxx_std_17) set_test(test-check-input LABELS NoDB) set_test(test-db-copy-mgr) set_test(test-db-copy-thread) +# The generalization strategies live in the osm2pgsql-gen executable, not in +# osm2pgsql_lib, so the test compiles the needed sources directly. +set_test(test-gen-grouped-linemerge EXTRA_SOURCES + "${osm2pgsql_SOURCE_DIR}/src/gen/gen-base.cpp" + "${osm2pgsql_SOURCE_DIR}/src/gen/gen-grouped-linemerge.cpp") set_test(test-expire-from-geometry LABELS NoDB) set_test(test-expire-tiles LABELS NoDB) set_test(test-flex-indexes LABELS NoDB) diff --git a/tests/test-gen-grouped-linemerge.cpp b/tests/test-gen-grouped-linemerge.cpp new file mode 100644 index 000000000..281bd7534 --- /dev/null +++ b/tests/test-gen-grouped-linemerge.cpp @@ -0,0 +1,328 @@ +/** + * SPDX-License-Identifier: GPL-2.0-or-later + * + * This file is part of osm2pgsql (https://osm2pgsql.org/). + * + * Copyright (C) 2006-2026 by the osm2pgsql developer community. + * For a full list of authors see the git log. + */ + +#include + +#include "common-pg.hpp" +#include "format.hpp" +#include "gen/gen-grouped-linemerge.hpp" +#include "params.hpp" +#include "pgsql.hpp" + +#include +#include +#include +#include +#include + +namespace { + +testing::pg::tempdb_t db; + +// Expire output zoom used throughout. At z18 a tile is ~150m, the grid below +// uses a 500 unit step, so an edge spans a couple of tiles. +constexpr int ZOOM = 18; + +// Run the grouped-linemerge strategy (create or append) against the test +// tables. conn_t is a pg_conn_t, so it can be passed straight to the strategy. +void run_gen(testing::pg::conn_t &conn, bool append, + char const *src_table = "glm_lines", + char const *dest_table = "glm_merged", + char const *group_by_columns = "grp", + char const *where = nullptr) +{ + params_t params; + params.set("schema", "public"); + params.set("src_table", src_table); + params.set("dest_table", dest_table); + params.set("geom_column", "geom"); + params.set("group_by_columns", group_by_columns); + if (where != nullptr) { + params.set("where", where); + } + params.set("expire_list", "glm_exp"); + params.set("zoom", static_cast(ZOOM)); + + gen_grouped_linemerge_t gen{&conn, append, ¶ms}; + gen.process(); +} + +void setup_tables(testing::pg::conn_t &conn) +{ + conn.exec("DROP TABLE IF EXISTS glm_lines, glm_merged, glm_exp, glm_ref" + " CASCADE"); + conn.exec("CREATE TABLE glm_lines" + " (grp text, geom geometry(LineString, 3857) NOT NULL)"); + conn.exec("CREATE INDEX ON glm_lines USING gist (geom)"); + conn.exec("CREATE TABLE glm_merged (grp text, geom geometry NOT NULL)"); + conn.exec("CREATE INDEX ON glm_merged USING gist (geom)"); + conn.exec("CREATE TABLE glm_exp (zoom int4 NOT NULL, x int4 NOT NULL," + " y int4 NOT NULL, PRIMARY KEY (zoom, x, y))"); +} + +void insert_edge(testing::pg::conn_t &conn, std::string const &grp, + std::string const &wkt) +{ + conn.exec(fmt::format("INSERT INTO glm_lines (grp, geom)" + " VALUES ('{}', ST_GeomFromText('{}', 3857))", + grp, wkt)); +} + +void delete_edge(testing::pg::conn_t &conn, std::string const &grp, + std::string const &wkt) +{ + conn.exec(fmt::format("DELETE FROM glm_lines WHERE grp = '{}'" + " AND ST_Equals(geom, ST_GeomFromText('{}', 3857))", + grp, wkt)); +} + +// Expire the tiles a geometry covers, exactly as osm2pgsql would during an +// update (the changed way's footprint). Hard-coded for z18. +void expire(testing::pg::conn_t &conn, std::string const &wkt) +{ + conn.exec(fmt::format( + "INSERT INTO glm_exp (zoom, x, y)" + " SELECT DISTINCT 18, gx, gy" + " FROM (SELECT ST_GeomFromText('{}', 3857) AS way) g," + " LATERAL (SELECT" + " floor((ST_XMin(g.way)+20037508.342789244)/40075016.685578488*262144)::int x0," + " floor((ST_XMax(g.way)+20037508.342789244)/40075016.685578488*262144)::int x1," + " floor((20037508.342789244-ST_YMax(g.way))/40075016.685578488*262144)::int y0," + " floor((20037508.342789244-ST_YMin(g.way))/40075016.685578488*262144)::int y1) b," + " generate_series(b.x0, b.x1) gx, generate_series(b.y0, b.y1) gy" + " WHERE ST_Intersects(ST_TileEnvelope(18, gx, gy), g.way)" + " ON CONFLICT DO NOTHING", + wkt)); +} + +// Whether the strategy's output equals what a from-scratch GROUP BY + +// ST_LineMerge would produce on the current source. Compares as a set +// (geometric equality, group-aware, NULL-safe) AND on row count, so leftover +// duplicates or missing pieces are both caught. +bool matches_reference(testing::pg::conn_t &conn) +{ + conn.exec("DROP TABLE IF EXISTS glm_ref"); + conn.exec("CREATE TABLE glm_ref AS" + " SELECT grp, (ST_Dump(ST_LineMerge(ST_Collect(geom)))).geom AS geom" + " FROM glm_lines GROUP BY grp"); + + int const ref = conn.result_as_int("SELECT count(*) FROM glm_ref"); + int const strat = conn.result_as_int("SELECT count(*) FROM glm_merged"); + int const strat_extra = conn.result_as_int( + "SELECT count(*) FROM glm_merged m WHERE NOT EXISTS (SELECT 1 FROM" + " glm_ref r WHERE r.grp IS NOT DISTINCT FROM m.grp" + " AND ST_Equals(r.geom, m.geom))"); + int const ref_extra = conn.result_as_int( + "SELECT count(*) FROM glm_ref r WHERE NOT EXISTS (SELECT 1 FROM" + " glm_merged m WHERE r.grp IS NOT DISTINCT FROM m.grp" + " AND ST_Equals(r.geom, m.geom))"); + + INFO("reference=" << ref << " strategy=" << strat + << " strategy_only=" << strat_extra + << " reference_only=" << ref_extra); + return ref == strat && strat_extra == 0 && ref_extra == 0; +} + +struct edge_t +{ + std::string wkt; + bool present = false; + std::string grp; +}; + +// All horizontal and vertical segments of a GW x GH grid (the candidate +// "ways"). Interior grid points become degree-3+ junctions when same-group +// edges meet there, which is exactly the case that makes ST_LineMerge split a +// connected component into several output lines. +std::vector build_grid_edges() +{ + constexpr int GW = 4; + constexpr int GH = 4; + constexpr int STEP = 500; + std::vector edges; + auto const seg = [](int x1, int y1, int x2, int y2) { + return fmt::format("LINESTRING({} {},{} {})", x1, y1, x2, y2); + }; + for (int j = 0; j < GH; ++j) { + for (int i = 0; i < GW - 1; ++i) { + edges.push_back({seg(i * STEP, j * STEP, (i + 1) * STEP, j * STEP)}); + } + } + for (int i = 0; i < GW; ++i) { + for (int j = 0; j < GH - 1; ++j) { + edges.push_back({seg(i * STEP, j * STEP, i * STEP, (j + 1) * STEP)}); + } + } + return edges; +} + +constexpr char const *const GROUPS[] = {"a", "b", "c"}; + +} // anonymous namespace + +TEST_CASE("grouped-linemerge: create merges connected same-group lines") +{ + auto conn = db.connect(); + setup_tables(conn); + + // "a": two connected segments; "b": one segment touching them but in a + // different group (must not merge); plus a disjoint "a" segment far away. + insert_edge(conn, "a", "LINESTRING(0 0,500 0)"); + insert_edge(conn, "a", "LINESTRING(500 0,1000 0)"); + insert_edge(conn, "b", "LINESTRING(500 0,500 500)"); + insert_edge(conn, "a", "LINESTRING(5000 0,5500 0)"); + + run_gen(conn, false); + + REQUIRE(matches_reference(conn)); + // "a" -> merged (0..1000) + disjoint (5000..5500) = 2 rows; "b" -> 1 row. + CHECK(conn.get_count("glm_merged") == 3); + CHECK(conn.get_count("glm_merged", "grp = 'a'") == 2); +} + +TEST_CASE("grouped-linemerge: incremental updates match full re-merge (fuzz)") +{ + auto conn = db.connect(); + + constexpr int ops_per_seed = 120; + + for (unsigned seed : {1U, 2U, 3U, 4U}) { + setup_tables(conn); + auto edges = build_grid_edges(); + std::mt19937 rng{seed}; + + // Random initial population, then a from-scratch build. + for (auto &e : edges) { + if (rng() % 2U == 0U) { + e.present = true; + e.grp = GROUPS[rng() % 3U]; + insert_edge(conn, e.grp, e.wkt); + } + } + run_gen(conn, false); + INFO("seed=" << seed << " phase=create"); + REQUIRE(matches_reference(conn)); + + // Random connect/disconnect operations, each followed by an + // incremental append that must reproduce the from-scratch result. + for (int op = 0; op < ops_per_seed; ++op) { + std::vector present; + std::vector absent; + for (std::size_t i = 0; i < edges.size(); ++i) { + (edges[i].present ? present : absent).push_back(i); + } + + std::string desc; + bool const do_add = + !absent.empty() && (present.empty() || (rng() % 2U == 0U)); + if (do_add) { + auto const idx = absent[rng() % absent.size()]; + auto &e = edges[idx]; + e.grp = GROUPS[rng() % 3U]; // may differ from last time: a retag + e.present = true; + insert_edge(conn, e.grp, e.wkt); + expire(conn, e.wkt); + desc = fmt::format("add slot={} grp={}", idx, e.grp); + } else if (!present.empty()) { + auto const idx = present[rng() % present.size()]; + auto &e = edges[idx]; + expire(conn, e.wkt); // expire the old footprint, then remove it + delete_edge(conn, e.grp, e.wkt); + e.present = false; + desc = fmt::format("del slot={} grp={}", idx, e.grp); + } else { + continue; + } + + run_gen(conn, true); + + INFO("seed=" << seed << " op=" << op << " " << desc); + REQUIRE(matches_reference(conn)); + } + } +} + +TEST_CASE("grouped-linemerge: multi-column grouping, NULL keys, where filter") +{ + auto conn = db.connect(); + conn.exec("DROP TABLE IF EXISTS glm2_lines, glm2_merged, glm_exp, glm2_ref" + " CASCADE"); + conn.exec("CREATE TABLE glm2_lines" + " (name text, ref text, geom geometry(LineString, 3857) NOT NULL)"); + conn.exec("CREATE INDEX ON glm2_lines USING gist (geom)"); + conn.exec("CREATE TABLE glm2_merged" + " (name text, ref text, geom geometry NOT NULL)"); + conn.exec("CREATE INDEX ON glm2_merged USING gist (geom)"); + conn.exec("CREATE TABLE glm_exp (zoom int4 NOT NULL, x int4 NOT NULL," + " y int4 NOT NULL, PRIMARY KEY (zoom, x, y))"); + + char const *const cols = "name, ref"; + char const *const filter = "(name IS NOT NULL OR ref IS NOT NULL)"; + + auto ins = [&](char const *name, char const *ref, std::string const &wkt) { + conn.exec(fmt::format( + "INSERT INTO glm2_lines (name, ref, geom) VALUES ({}, {}," + " ST_GeomFromText('{}', 3857))", + name ? fmt::format("'{}'", name) : "NULL", + ref ? fmt::format("'{}'", ref) : "NULL", wkt)); + }; + + // The reference applies the same filter and multi-column grouping; the + // match is group-aware (NULL-safe) and geometric. + auto matches = [&]() { + conn.exec("DROP TABLE IF EXISTS glm2_ref"); + conn.exec(fmt::format( + "CREATE TABLE glm2_ref AS SELECT {0}," + " (ST_Dump(ST_LineMerge(ST_Collect(geom)))).geom AS geom" + " FROM glm2_lines WHERE {1} GROUP BY {0}", + cols, filter)); + int const ref = conn.result_as_int("SELECT count(*) FROM glm2_ref"); + int const strat = conn.result_as_int("SELECT count(*) FROM glm2_merged"); + int const se = conn.result_as_int( + "SELECT count(*) FROM glm2_merged m WHERE NOT EXISTS (SELECT 1 FROM" + " glm2_ref r WHERE r.name IS NOT DISTINCT FROM m.name" + " AND r.ref IS NOT DISTINCT FROM m.ref AND ST_Equals(r.geom, m.geom))"); + int const re = conn.result_as_int( + "SELECT count(*) FROM glm2_ref r WHERE NOT EXISTS (SELECT 1 FROM" + " glm2_merged m WHERE r.name IS NOT DISTINCT FROM m.name" + " AND r.ref IS NOT DISTINCT FROM m.ref AND ST_Equals(r.geom, m.geom))"); + INFO("reference=" << ref << " strategy=" << strat << " strategy_only=" + << se << " reference_only=" << re); + return ref == strat && se == 0 && re == 0; + }; + + // 'Main St'/NULL: three connected segments -> one line. + ins("Main St", nullptr, "LINESTRING(0 0,500 0)"); + ins("Main St", nullptr, "LINESTRING(500 0,1000 0)"); + ins("Main St", nullptr, "LINESTRING(1000 0,1500 0)"); + // NULL/'I 5': two connected segments -> one line (NULL name groups). + ins(nullptr, "I 5", "LINESTRING(0 500,500 500)"); + ins(nullptr, "I 5", "LINESTRING(500 500,1000 500)"); + // NULL/NULL: excluded by the filter entirely. + ins(nullptr, nullptr, "LINESTRING(0 1000,500 1000)"); + // 'Main St'/'I 5': a distinct group (differs from 'Main St'/NULL on ref). + ins("Main St", "I 5", "LINESTRING(0 1500,500 1500)"); + + run_gen(conn, false, "glm2_lines", "glm2_merged", cols, filter); + INFO("phase=create"); + REQUIRE(matches()); + CHECK(conn.get_count("glm2_merged", "name IS NULL AND ref IS NULL") == 0); + CHECK(conn.get_count("glm2_merged", "name = 'Main St' AND ref IS NULL") == 1); + + // Incremental: remove the middle 'Main St'/NULL segment; the component must + // split into two, with everything else untouched. + expire(conn, "LINESTRING(500 0,1000 0)"); + conn.exec("DELETE FROM glm2_lines WHERE name = 'Main St' AND ref IS NULL" + " AND ST_Equals(geom, ST_GeomFromText('LINESTRING(500 0,1000 0)'," + " 3857))"); + run_gen(conn, true, "glm2_lines", "glm2_merged", cols, filter); + INFO("phase=append-shatter"); + REQUIRE(matches()); + CHECK(conn.get_count("glm2_merged", "name = 'Main St' AND ref IS NULL") == 2); +} From 370a0943e43a58c51c827c84826b24ccfa40e9b3 Mon Sep 17 00:00:00 2001 From: Leijurv Date: Mon, 1 Jun 2026 09:55:10 -0700 Subject: [PATCH 2/3] tidy --- tests/test-gen-grouped-linemerge.cpp | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/tests/test-gen-grouped-linemerge.cpp b/tests/test-gen-grouped-linemerge.cpp index 281bd7534..71d8e66f5 100644 --- a/tests/test-gen-grouped-linemerge.cpp +++ b/tests/test-gen-grouped-linemerge.cpp @@ -15,6 +15,7 @@ #include "params.hpp" #include "pgsql.hpp" +#include #include #include #include @@ -134,6 +135,8 @@ struct edge_t std::string wkt; bool present = false; std::string grp; + + explicit edge_t(std::string w) : wkt(std::move(w)) {} }; // All horizontal and vertical segments of a GW x GH grid (the candidate @@ -151,18 +154,18 @@ std::vector build_grid_edges() }; for (int j = 0; j < GH; ++j) { for (int i = 0; i < GW - 1; ++i) { - edges.push_back({seg(i * STEP, j * STEP, (i + 1) * STEP, j * STEP)}); + edges.emplace_back(seg(i * STEP, j * STEP, (i + 1) * STEP, j * STEP)); } } for (int i = 0; i < GW; ++i) { for (int j = 0; j < GH - 1; ++j) { - edges.push_back({seg(i * STEP, j * STEP, i * STEP, (j + 1) * STEP)}); + edges.emplace_back(seg(i * STEP, j * STEP, i * STEP, (j + 1) * STEP)); } } return edges; } -constexpr char const *const GROUPS[] = {"a", "b", "c"}; +constexpr std::array GROUPS = {"a", "b", "c"}; } // anonymous namespace @@ -190,9 +193,9 @@ TEST_CASE("grouped-linemerge: incremental updates match full re-merge (fuzz)") { auto conn = db.connect(); - constexpr int ops_per_seed = 120; + constexpr int OPS_PER_SEED = 120; - for (unsigned seed : {1U, 2U, 3U, 4U}) { + for (unsigned const seed : {1U, 2U, 3U, 4U}) { setup_tables(conn); auto edges = build_grid_edges(); std::mt19937 rng{seed}; @@ -201,7 +204,7 @@ TEST_CASE("grouped-linemerge: incremental updates match full re-merge (fuzz)") for (auto &e : edges) { if (rng() % 2U == 0U) { e.present = true; - e.grp = GROUPS[rng() % 3U]; + e.grp = GROUPS.at(rng() % 3U); insert_edge(conn, e.grp, e.wkt); } } @@ -211,7 +214,7 @@ TEST_CASE("grouped-linemerge: incremental updates match full re-merge (fuzz)") // Random connect/disconnect operations, each followed by an // incremental append that must reproduce the from-scratch result. - for (int op = 0; op < ops_per_seed; ++op) { + for (int op = 0; op < OPS_PER_SEED; ++op) { std::vector present; std::vector absent; for (std::size_t i = 0; i < edges.size(); ++i) { @@ -224,7 +227,7 @@ TEST_CASE("grouped-linemerge: incremental updates match full re-merge (fuzz)") if (do_add) { auto const idx = absent[rng() % absent.size()]; auto &e = edges[idx]; - e.grp = GROUPS[rng() % 3U]; // may differ from last time: a retag + e.grp = GROUPS.at(rng() % 3U); // may differ from last time: a retag e.present = true; insert_edge(conn, e.grp, e.wkt); expire(conn, e.wkt); From 9ca4b5e6c3f7ab87d9d708bf85c5ab04782335ab Mon Sep 17 00:00:00 2001 From: Leijurv Date: Mon, 1 Jun 2026 12:24:55 -0700 Subject: [PATCH 3/3] fix performance mistake in join order --- src/gen/gen-grouped-linemerge.cpp | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/gen/gen-grouped-linemerge.cpp b/src/gen/gen-grouped-linemerge.cpp index 435955e0e..eae37329a 100644 --- a/src/gen/gen-grouped-linemerge.cpp +++ b/src/gen/gen-grouped-linemerge.cpp @@ -224,7 +224,10 @@ CREATE TEMP TABLE _glm_region ON COMMIT DROP AS return; } - dbexec("CREATE INDEX ON _glm_region USING gist (env)"); + // No spatial index on _glm_region is needed: the seed lookup and the + // region delete both drive *from* this (small) table and probe the source + // / destination geometry indexes. ANALYZE so the planner knows it is small + // and drives the joins from it. dbexec("ANALYZE _glm_region"); // Step 2: Find the nodes (endpoint coordinates) of every connected @@ -242,14 +245,14 @@ CREATE TEMP TABLE _glm_region ON COMMIT DROP AS CREATE TEMP TABLE _glm_nodes ON COMMIT DROP AS WITH RECURSIVE seeds AS ( + -- Driven from the (small) region so the source geometry index is used to + -- find the few lines in the changed area. SELECT {group_cols_l}, l."{geom_column}" - FROM {src} l + FROM _glm_region r + JOIN {src} l + ON l."{geom_column}" && r.env + AND ST_Intersects(l."{geom_column}", r.env) WHERE {where} - AND EXISTS ( - SELECT 1 FROM _glm_region r - WHERE l."{geom_column}" && r.env - AND ST_Intersects(l."{geom_column}", r.env) - ) ), nodes AS ( SELECT b.* FROM ( @@ -330,11 +333,9 @@ DELETE FROM {dest} d auto const deleted_by_nodes = deleted.affected_rows(); deleted = dbexec(R"( DELETE FROM {dest} d - WHERE EXISTS ( - SELECT 1 FROM _glm_region r - WHERE d."{geom_column}" && r.env - AND ST_Intersects(d."{geom_column}", r.env) - ) + USING _glm_region r + WHERE d."{geom_column}" && r.env + AND ST_Intersects(d."{geom_column}", r.env) )"); timer(m_timer_delete).stop(); log_gen("Deleted {} stale merged linestrings ({} by node, {} by region).",