Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
74 commits
Select commit Hold shift + click to select a range
5ddd4d1
ocdbt: write_seg writes base scale only
akhileshh Apr 21, 2026
ff8caff
edits: forward SV-split bbox to publish_edit for async downsample
akhileshh Apr 21, 2026
d936782
downsample: async pubsub worker updates coarser mips
akhileshh Apr 21, 2026
8a269ea
deps: add tinybrain
akhileshh Apr 21, 2026
8abbf67
locks: add L2ChunkLock for spatial serialization of SV splits
akhileshh Apr 22, 2026
09ecbb0
cutting: return tagged result from run_multicut
akhileshh Apr 22, 2026
a76cdbb
sv split: extend root lock across multicut + SV split + commit
akhileshh Apr 22, 2026
7619c9c
sv split: drive lock + write scope from the SVs being split
akhileshh Apr 22, 2026
affa672
feat(ocdbt): pinned-at reads
akhileshh Apr 23, 2026
f216de7
refactor(sv-split): pure planner + consolidated persist
akhileshh Apr 23, 2026
984634d
feat(locks): IndefiniteL2ChunkLock + exception-path hold
akhileshh Apr 23, 2026
ff6f215
feat(repair): stuck_ops — cleanup-then-replay for SV-split ops
akhileshh Apr 23, 2026
a7fae00
deps: kvdbclient>=0.7.0
akhileshh Apr 23, 2026
4b7aa1d
refactor(sv-split): thread operation_ts + named structs
akhileshh Apr 23, 2026
f19eb4d
feat(stuck_ops): surface exception-path + verify cells before replay
akhileshh Apr 23, 2026
56923af
include kvstack spec in info
akhileshh Apr 23, 2026
7672ceb
feat(meta): require fork to be pre-created; assert fork_exists
akhileshh Apr 24, 2026
ed7f9f0
docs: scope-set stuck signal; pre-replay cell verification
akhileshh Apr 24, 2026
0eace5e
sv-split: drive bbox from user seeds, narrow cut set to bbox
akhileshh Apr 28, 2026
8eec8ca
docs: edge update after sv split
akhileshh Apr 28, 2026
4216ba6
sv discrepancy error message details
akhileshh May 7, 2026
d7fcd2e
update gitignore, add hybrid ws seg plan
akhileshh May 7, 2026
df4ac6a
unify ocdbt sv lookup for 2d and 3d clicks
akhileshh May 13, 2026
69b71df
fix(ocdbt): drop redundant `shape` from schema clone
akhileshh May 19, 2026
e684d50
ci/ingest: cache cloudbuild layers, explicit --populate-base flag
akhileshh May 19, 2026
9594b2c
ci: combine buildx create + build into one step
akhileshh May 19, 2026
70deb56
fix(ocdbt): wipe via underlying driver; create base before CG table
akhileshh May 19, 2026
2e62e06
feat(ocdbt): move populate to configurable parent-layer task with ato…
akhileshh May 20, 2026
9cef706
docs(ocdbt): clarify copy_ws_bbox_multiscale file-count invariant
akhileshh May 20, 2026
8de3ccb
feat(ingest): retry reuses stored config; populate runs async; richer…
akhileshh May 20, 2026
9ac370c
feat(ingest status): rich live panel + busy worker count + configurab…
akhileshh May 20, 2026
6dd4bfb
perf(ingest status): precompute redis keys; skip per-refresh rq alloc…
akhileshh May 20, 2026
603e282
fix(ingest): run ocdbt populate synchronously after add_parent_chunk
akhileshh May 20, 2026
ddea330
perf(ocdbt): stream copy_ws_bbox_multiscale via dst.write(src) — no i…
akhileshh May 20, 2026
e4462fd
refactor(ocdbt): module → package; OcdbtConfig dataclass; yaml-driven…
akhileshh May 20, 2026
54173ae
feat(meta): info-file > custom_data precedence; redis-cached helper s…
akhileshh May 20, 2026
cc20f10
refactor(ingest): consolidate OCDBT helpers into ingest/ocdbt.py; dro…
akhileshh May 20, 2026
dc75804
feat(ingest): coordinator-managed OCDBT populate; drop --reset-ocdbt
akhileshh May 20, 2026
d4ede71
refactor(ingest): consolidate duplicated chunk-requeue body into util…
akhileshh May 20, 2026
86ee9fe
fix(tests): unpack 4-tuple from bootstrap in test_from_config
akhileshh May 20, 2026
b4985c3
chore(ingest status): reorder + rename columns; split workers/busy
akhileshh May 21, 2026
12b68d1
fix(ingest): route OCDBT populate writes through the coordinator spec
akhileshh May 21, 2026
7802721
fix(ocdbt): drop atomic=True from copy_ws_bbox_multiscale
akhileshh May 21, 2026
9c11b7f
chore(ocdbt): log per-task populate work
akhileshh May 21, 2026
12cdebb
chore(ocdbt): per-task GCS dump on commit failures, gated on ERROR_DUMP
akhileshh May 21, 2026
c644efc
feat(ingest): purge_layer command for re-running from a previous backup
akhileshh May 21, 2026
f9be480
feat(ingest): retry runs full setup minus cg.create(); add --skip-queue
akhileshh May 21, 2026
5873aa8
docs(ocdbt): tensorstore OCDBT reference
akhileshh May 21, 2026
6f9920c
fix(ocdbt): lower max_inline_value_bytes so chunk values go out-of-line
akhileshh May 21, 2026
2617e01
fix(ocdbt): retry transient tensorstore errors in marker IO
akhileshh May 21, 2026
6b373a6
feat(ingest): consolidate OCDBT lifecycle under `ingest layer`; add t…
akhileshh May 22, 2026
8ca9743
tmp: disable ocdbt seg read for meshing
akhileshh May 23, 2026
0cb09c1
fix(ocdbt): self-heal stale fork manifests; re-enable OCDBT seg reads
akhileshh May 24, 2026
81752e4
feat(meshing): emit `dynamic_mesh_dir` in info JSON
akhileshh May 25, 2026
086771f
feat(graph): seamless setup for copied bigtables (mesh dir + ocdbt fork)
akhileshh May 25, 2026
fdff144
fix(deps): pin docker base image digest; regenerate requirements.txt
akhileshh May 25, 2026
0cdcb96
test(graph): align stale tests with current bootstrap/ws_cv/ocdbt source
akhileshh May 25, 2026
1fea204
fix(ocdbt): on-disk config wins on open; never assert in-code defaults
akhileshh May 25, 2026
949f8b8
fix(ocdbt): ensure_fork_synced acts only pre-edit; drop noisy steady-…
akhileshh May 25, 2026
b0d1a81
feat(meshing): one-shot mesh meta setup + `ingest mesh_meta` CLI
akhileshh May 25, 2026
01b1791
feat(meshing): 3-day TTL on manifest cache + admin endpoint to clear …
akhileshh May 25, 2026
6d8caa7
feat(debug): per-block memory + IO tracking on HierarchicalProfiler
akhileshh May 26, 2026
fcf37a7
feat(graph): PCG_DRY_RUN env var to suppress edit-flow writes
akhileshh May 26, 2026
a3dc41f
feat(debug): SV-split dry-run profile harness
akhileshh May 27, 2026
50d41cf
feat(debug): clean per-stage metrics + standalone replay helper
akhileshh May 27, 2026
a3b83e0
feat(debug): leaf-level profile() blocks across SV split path
akhileshh May 27, 2026
f01eacc
perf(sv-split): collapse build_coords + remap_to_root + binary_seg vi…
akhileshh May 28, 2026
b3887d4
perf(sv-split): confine enforce_cc label-3 dilation to its bounding box
akhileshh May 28, 2026
568a375
fix(sv-split): pin get_roots reads to parent_ts for deterministic replay
akhileshh May 28, 2026
6cd5e2e
refactor: consolidate sv-split into a package with co-located docs
akhileshh May 28, 2026
9430f46
perf(sv-split): window seed snapping to a bbox around the seeds
akhileshh May 28, 2026
a8a60fc
perf(sv-split): skip label-3 reassignment scan when no strays exist
akhileshh May 28, 2026
05b4373
perf(sv-split): write fresh IDs into the seg crop in place
akhileshh May 29, 2026
3309803
fix(sv-split): align chunk lattice to voxel offset, drop redundant ro…
akhileshh May 29, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ venv.bak/

# Visual Code
.vscode/
*.code-workspace

# terraform
.terraform/
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ venv.bak/


# local dev stuff
*.code-workspace
.claude/
.devcontainer/
*.ipynb
Expand Down
7 changes: 6 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
# syntax=docker/dockerfile:1
ARG PYTHON_VERSION=3.12
ARG BASE_IMAGE=tiangolo/uwsgi-nginx-flask:python${PYTHON_VERSION}
# Pin by digest. Without it, upstream rebuilds of the
# `python3.12` tag invalidate the Stage-1 cache and pull in newer
# transitive Python packages (e.g. importlib_metadata) that conflict
# with our requirements.txt pins. Bump the digest manually when you
# want to pull a fresher base.
ARG BASE_IMAGE=tiangolo/uwsgi-nginx-flask:python${PYTHON_VERSION}@sha256:329d84f4cc50ccd14d60eb02384713b4ae8723eddefda9fda342c7c3f17cdcb1


######################################################
Expand Down
23 changes: 15 additions & 8 deletions cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,26 @@ steps:
args: ["-c", "docker login --username=$$USERNAME --password=$$PASSWORD"]
secretEnv: ["USERNAME", "PASSWORD"]

# Build + push in one BuildKit invocation using a docker-container
# builder (required for registry-type cache export). The builder
# `--use` setting is client-side and doesn't persist across cloudbuild
# steps, so create + use + build must happen in a single step.
# Registry cache at the fixed :buildcache tag lets unchanged stages
# (conda env, bigtable emulator, pip install) reuse the previous
# build's exact layer artifacts, so already-warm nodes only download
# what actually changed on pull.
- name: "gcr.io/cloud-builders/docker"
entrypoint: "bash"
args:
- "-c"
- |
DOCKER_BUILDKIT=1 docker build -t $$USERNAME/pychunkedgraph:$TAG_NAME .
timeout: 600s
secretEnv: ["USERNAME"]

# Push the final image to Dockerhub
- name: "gcr.io/cloud-builders/docker"
entrypoint: "bash"
args: ["-c", "docker push $$USERNAME/pychunkedgraph:$TAG_NAME"]
docker buildx create --use --name pcg-builder --driver docker-container
docker buildx build \
--cache-from type=registry,ref=$$USERNAME/pychunkedgraph:buildcache \
--cache-to type=registry,ref=$$USERNAME/pychunkedgraph:buildcache,mode=max \
--push \
-t $$USERNAME/pychunkedgraph:$TAG_NAME .
timeout: 1800s
secretEnv: ["USERNAME"]

availableSecrets:
Expand Down
101 changes: 101 additions & 0 deletions docs/precomputed_ocdbt_hybrid.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# Hybrid base: precomputed + OCDBT fork (proposal)

Status: proposal, not implemented. The open question is whether the storage and ingest-compute savings justify the added read-path complexity.

## Problem

PCG ingest copies the entire watershed segmentation into `<ws>/ocdbt/base/` in OCDBT format before any CG edit can happen. Per-CG forks at `<ws>/ocdbt/<gid>/` store only the deltas from SV splits. Two costs follow:

- **Storage**: roughly 2× the segmentation footprint per dataset — original precomputed plus full OCDBT copy.
- **Ingest compute**: a per-chunk pass that reads the precomputed and writes it through the OCDBT driver. Hours of cluster time on TB-scale datasets.

Both costs are paid up-front, before any user has done a single edit. The proposal here: skip the base copy and serve unedited chunks directly from the raw precomputed directory. Per-CG OCDBT forks remain as the delta store.

## Why the current architecture has the base copy

Today's per-CG read spec is:

```
neuroglancer_precomputed
└─ kvstore: ocdbt
├─ base: kvstack [base_layer, fork_manifest, fork_data]
└─ config: { compression, max_inline_value_bytes, ... }
```

When a reader asks for chunk key `8_8_40/1024-..._0-128`:

1. The `neuroglancer_precomputed` driver passes the chunk key to its kvstore (the OCDBT driver).
2. OCDBT looks up the key **in its B+tree**. The B+tree's leaves map chunk keys to values.
3. If the key isn't in the B+tree, OCDBT returns not-found. It does not consult the kvstack any further.

The three kvstack layers serve OCDBT's *internal* storage (B+tree manifest + node blobs + leaf blobs) — they have no visibility into chunk-key lookups. So the OCDBT B+tree must contain every chunk key the reader will ever ask for, and that's why ingest copies the whole watershed: to populate the B+tree.

## What tensorstore primitives provide

Confirmed against tensorstore docs:

- **`kvstack` routes by exact / prefix match, with no fallthrough on miss.** A layer that claims a key range absorbs misses — they return `state='missing'` and do not cascade to the next layer. So we can't put raw precomputed below an OCDBT layer in a kvstack and expect kvstack to fall through when OCDBT doesn't have a key.
- **No native overlay/fallback kvstore driver.** `kvstack` is the only composition primitive at the kvstore level; it's precedence-based, not fallthrough.
- **OCDBT has no external-blob references.** B+tree leaves either inline the value or point to a data file under the OCDBT directory. There's no way to make a leaf reference a raw GCS precomputed file.
- **Array-level `stack` / `ts.overlay`** layers arrays by spatial domain. In overlapping regions, the later layer takes absolute precedence — missing-in-later does not fall back to earlier.

No single tensorstore primitive provides "try OCDBT delta first, fall through to raw precomputed on miss."

## Architectural options

### A — Two-stage read at the pcg layer

PCG reads open two handles: the OCDBT fork for the delta, and a raw `neuroglancer_precomputed` reader for the watershed base. For any voxel region, issue both reads and merge with "delta wins where present, base fills the rest."

- **Pros**: works inside pcg (`lookup_svs_from_seg`, sanity checks, debug tools) without any tensorstore changes.
- **Cons**: every pcg caller that uses `meta.ws_ocdbt` needs to route through a new merging reader. Neuroglancer doesn't benefit — it still gets a single kvstore spec from `dataset_info`. Either NG runs two layers itself (Option B) or we stand up a server-side proxy that does the merge before serving.

### B — NG-side layer stack

`dataset_info` publishes two precomputed layers: the raw watershed (read-only base) and the per-CG OCDBT fork (delta). NG composites them — visible segmentation is whichever has data at a given chunk.

- **Pros**: no change to pcg's read path. Pushes the architecture complexity into the viewer.
- **Cons**: requires NG to treat "missing chunk in delta" as "fall through to base," not "render as background." Default NG behavior is the latter, so a viewer-side or proxy-side shim is likely needed.

### C — Custom tensorstore kvstore driver

A new "fallthrough" kvstore driver: read tries layer N, falls through on miss to layer N−1. Implement upstream in tensorstore or fork-and-maintain.

- **Pros**: cleanest consumer-facing story — pcg and NG both keep using a single kvstore spec.
- **Cons**: tensorstore kvstore drivers are C++. Non-trivial maintenance surface; review/merge timeline if upstreaming.

### D — Lazy base population (not a win on its own)

Skip the ingest copy; copy a chunk from precomputed to OCDBT on first edit. Saves ingest compute. Does **not** remove the need for a stored base on the read path — unedited chunks still 404 in OCDBT for a reader that doesn't have a fallback. Only useful in combination with A/B/C.

## Recommendation

Measure first. Confirm the actual storage and ingest-compute savings on a real dataset and weigh against the engineering cost of A/B/C.

If the savings justify the work, **A + B together** is the most pragmatic path:
- A gives pcg a single merged-read API. Edits, sanity checks, debug tooling keep working.
- B avoids standing up a proxy service for the viewer by letting NG handle the overlay.

Both require upstream verification:
- **For A**: confirm that `(x0:x1, y0:y1, z0:z1)` reads on an OCDBT with sparse keys surface missing-ness *per chunk* at the `neuroglancer_precomputed` array layer (not per-region, not silently fill-valued).
- **For B**: confirm NG's segmentation loader can be configured to fall through gaps in one layer to another. If it can't, build a small server-side merging shim — at which point Option A's reader becomes that shim and B reduces to "publish two specs."

C is the cleanest design but carries the highest cost. Pursue only if A/B turn out to have unworkable semantics.

## Open questions before any implementation

1. Does OCDBT's `read_result.state == 'missing'` surface per-chunk at the `neuroglancer_precomputed` array layer, or does the array silently fill missing chunks with the fill value? Verifiable by opening an OCDBT with sparse keys and reading a region that spans present + missing chunks.
2. Does NG distinguish "chunk returned as missing" from "chunk is all fill-value"? If not, a viewer-side overlay needs a shim regardless.
3. What's the actual delta volume per CG over its lifetime? If SV splits eventually touch a significant fraction of chunks, the storage win shrinks toward zero — at which point the simpler architecture (today's full base copy) wins on engineering cost.

## Files to start from when implementing

- `pychunkedgraph/graph/ocdbt.py` — spec construction (`build_cg_ocdbt_spec`), base population (`create_base_ocdbt`), fork setup (`fork_base_manifest`).
- `pychunkedgraph/ingest/cli.py`, `pychunkedgraph/ingest/cluster.py` — current base-copy flow.
- `pychunkedgraph/graph/utils/generic.py::get_local_segmentation` — single pcg read entry point that would need the two-stage merge in Option A.

## Verification (per chosen option)

- **A**: unit test that simulates a partial-delta OCDBT + raw precomputed and confirms the pcg reader returns the correct labels for spans crossing both.
- **B**: configure an NG link with both layers against a test dataset; compare the rendered segmentation to a known-good reference at edited and unedited regions.
- **C**: a tensorstore build with the new driver passes a fallthrough test (missing key in upper layer resolves from lower layer).
20 changes: 14 additions & 6 deletions pychunkedgraph/app/app_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,20 +229,28 @@ def ccs(coordinates_nm_):
return ccs

coordinates = np.array(coordinates, dtype=int)
coordinates_nm = coordinates * cg.meta.resolution
max_dist_steps = np.array([4, 8, 14, 28], dtype=float) * np.mean(cg.meta.resolution)

node_ids = np.array(node_ids, dtype=np.uint64)
if len(coordinates.shape) != 2:
raise cg_exceptions.BadRequest(
f"Could not determine supervoxel ID for coordinates "
f"{coordinates} - Validation stage."
)

# Fast path: all node_ids are L1 and OCDBT — single seg read for all coords
if cg.meta.ocdbt_seg and np.all(cg.get_chunk_layers(np.unique(node_ids)) == 1):
# OCDBT: always read the current segmentation at the click coords,
# regardless of node_ids layer.
# - 2D slice click: NG sends `node_id` = L1 SV from the slice view.
# That slice can be stale after an SV split; the seg read returns
# the current SV at that voxel (which may have a different root).
# - 3D mesh click: NG sends `node_id` = root; no L1 SV is attached,
# so we have to look it up against current seg anyway.
# `node_ids` are not used as a constraint here. Stale UI surfaces
# downstream as "different roots" with the sv_id->root diagnostic
# mapping added in operation.py / cutting.py.
if cg.meta.ocdbt_seg:
return lookup_svs_from_seg(cg.meta, coordinates)

coordinates_nm = coordinates * cg.meta.resolution
max_dist_steps = np.array([4, 8, 14, 28], dtype=float) * np.mean(cg.meta.resolution)
node_ids = np.array(node_ids, dtype=np.uint64)
atomic_ids = np.zeros(len(coordinates), dtype=np.uint64)
for node_id in np.unique(node_ids):
node_id_m = node_ids == node_id
Expand Down
10 changes: 9 additions & 1 deletion pychunkedgraph/app/meshing/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from pychunkedgraph.meshing.manifest import get_children_before_start_layer
from pychunkedgraph.meshing.manifest import ManifestCache


__meshing_url_prefix__ = os.environ.get("MESHING_URL_PREFIX", "meshing")


Expand Down Expand Up @@ -180,3 +179,12 @@ def _remeshing(serialized_cg_info, lvl2_nodes):
def clear_manifest_cache(cg, node_id):
    """Clear cached manifest fragments associated with ``node_id``.

    The node is first expanded with ``get_children_before_start_layer``
    (``start_layer=2``); the fragments cached for the resulting node IDs
    are then cleared from this graph's ``ManifestCache``.
    """
    target_ids = get_children_before_start_layer(cg, node_id, start_layer=2)
    graph_cache = ManifestCache(cg.graph_id)
    graph_cache.clear_fragments(target_ids)


def clear_manifest_cache_all(cg) -> int:
    """Drop all manifest fragments cached for this graph.

    The initial and dynamic caches live under the same ``<graph_id>:``
    namespace, so the count covers keys from both.

    Returns:
        Number of redis keys deleted.
    """
    graph_cache = ManifestCache(cg.graph_id)
    deleted_keys = graph_cache.clear_namespace()
    return deleted_keys
10 changes: 9 additions & 1 deletion pychunkedgraph/app/meshing/v1/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from pychunkedgraph.app.app_utils import get_cg
from pychunkedgraph.app.app_utils import remap_public


bp = Blueprint(
"pcg_meshing_v1", __name__, url_prefix=f"/{common.__meshing_url_prefix__}/api/v1"
)
Expand Down Expand Up @@ -98,3 +97,12 @@ def handle_remesh(table_id):
def handle_clear_manifest_cache(table_id, node_id):
    """Clear the cached manifest fragments for ``node_id`` in ``table_id``."""
    # NOTE(review): this view returns nothing — confirm that whatever wraps
    # it (decorators are outside this view) supplies a valid response.
    common.clear_manifest_cache(get_cg(table_id), node_id)


@bp.route("/table/<table_id>/clear_manifest_cache", methods=["POST"])
@auth_requires_permission("admin")
def handle_clear_manifest_cache_all(table_id):
    """Clear every cached manifest fragment for the graph ``table_id``.

    Responds with a JSON object whose ``deleted`` field is the value
    returned by ``common.clear_manifest_cache_all``.
    """
    graph = get_cg(table_id)
    n_deleted = common.clear_manifest_cache_all(graph)
    return {"deleted": n_deleted}
Loading
Loading