Add document-format crate#4234
Conversation
There was a problem hiding this comment.
Code Review
This pull request introduces the document-format crate, which provides a typed handle (Gdd) for the .gdd document format, decoupling on-disk layout from the editor's in-memory runtime types. It implements codecs, export options, path layouts, and session state persistence. The review feedback highlights several critical issues: a randomized history writing order in append_history_deltas that can cause document load failures, a race condition on WASM/OPFS in open_from_archive due to non-blocking writes, a potential runtime panic on WASM from using block_on in garbage_collect, and redundant loads in stream_entries due to duplicate hashes in hashes_from_store.
Important
The consumer version of Gemini Code Assist on GitHub is being sunset. Starting June 18, 2026, new organization installations will be blocked, and all code review activity will officially cease on July 17, 2026.
For more details on the timeline and next steps, please review the Help Documentation.
| let wanted: std::collections::HashSet<Rev> = revs.iter().copied().collect(); | ||
| let mut buffer = Vec::new(); | ||
| for delta in self.session.history().filter(|delta| wanted.contains(&delta.id)) { | ||
| self.manifest.codecs.history.append(&mut buffer, delta)?; | ||
| } |
There was a problem hiding this comment.
In append_history_deltas, iterating over self.session.history() (which is an unordered HashMap::values() iterator) and filtering by wanted discards the causal/topological order of revs. This causes the history file to be written in a randomized order. When a document is later opened from history only (e.g., in replay_from_history), applying a child delta before its parent will fail, causing document load failures. We should preserve the topological order of revs by mapping the deltas and then iterating over revs in their original order.
let wanted: std::collections::HashSet<Rev> = revs.iter().copied().collect();
let mut deltas_map = std::collections::HashMap::with_capacity(revs.len());
for delta in self.session.history() {
if wanted.contains(&delta.id) {
deltas_map.insert(delta.id, delta);
}
}
let mut buffer = Vec::new();
for rev in revs {
if let Some(delta) = deltas_map.get(rev) {
self.manifest.codecs.history.append(&mut buffer, delta)?;
}
}| let mut directories = vec![String::new()]; | ||
| while let Some(dir) = directories.pop() { | ||
| for path in Container::list(&staging, &dir)? { | ||
| let holder = Container::read(&staging, &path)?; | ||
| working.write_non_blocking(&path, holder.as_slice())?; | ||
| } | ||
| directories.extend(Container::list_dirs(&staging, &dir)?); | ||
| } |
There was a problem hiding this comment.
In open_from_archive, working.write_non_blocking is used to write files to the container. On OPFS/WASM, write_non_blocking is asynchronous and enqueues the write eagerly, returning Ok immediately. Since open_from_archive is an async fn and immediately calls Self::open_in(working, layout).await, this creates a race condition where open_in tries to read the manifest or other files before they have been fully written to OPFS. We should use the asynchronous working.write(...).await instead to ensure all files are fully written before opening.
| let mut directories = vec![String::new()]; | |
| while let Some(dir) = directories.pop() { | |
| for path in Container::list(&staging, &dir)? { | |
| let holder = Container::read(&staging, &path)?; | |
| working.write_non_blocking(&path, holder.as_slice())?; | |
| } | |
| directories.extend(Container::list_dirs(&staging, &dir)?); | |
| } | |
| let mut directories = vec![String::new()]; | |
| while let Some(dir) = directories.pop() { | |
| for path in Container::list(&staging, &dir)? { | |
| let holder = Container::read(&staging, &path)?; | |
| working.write(&path, holder.as_slice()).await?; | |
| } | |
| directories.extend(Container::list_dirs(&staging, &dir)?); | |
| } |
| fn garbage_collect(&self, used: &[ResourceHash]) { | ||
| let kept: std::collections::HashSet<&ResourceHash> = used.iter().collect(); | ||
| let hashes = match futures::executor::block_on(self.resource_hashes()) { | ||
| Ok(hashes) => hashes, | ||
| Err(error) => { | ||
| log::error!("Failed to list resources during garbage_collect: {error}"); | ||
| return; | ||
| } | ||
| }; | ||
| for hash in hashes { | ||
| if kept.contains(&hash) { | ||
| continue; | ||
| } | ||
| if let Err(error) = self.working.remove_non_blocking(&self.layout.resource_path(&hash)) { | ||
| log::error!("ResourceStorage::garbage_collect failed to remove {hash}: {error}"); | ||
| } | ||
| } | ||
| } |
There was a problem hiding this comment.
Using futures::executor::block_on inside garbage_collect will panic at runtime on WASM because WASM is single-threaded and does not support blocking the main thread. Since garbage_collect is synchronous but we can run the GC asynchronously, we should spawn the GC task asynchronously using wasm_bindgen_futures::spawn_local on WASM, and only fallback to block_on on native platforms.
fn garbage_collect(&self, used: &[ResourceHash]) {
let kept: std::collections::HashSet<ResourceHash> = used.iter().copied().collect();
let working = self.working.clone();
let layout = self.layout.clone();
let gc_task = async move {
let dir = layout.resources_dir();
let Ok(dirs) = working.list_dirs("").await else { return; };
if !dirs.iter().any(|d| d == dir) {
return;
}
let Ok(entries) = working.list(dir).await else { return; };
let prefix = format!("{dir}/");
for entry in entries {
let Some(name) = entry.strip_prefix(&prefix) else { continue };
if let Ok(hash) = name.parse::<ResourceHash>() {
if kept.contains(&hash) {
continue;
}
if let Err(error) = working.remove(&layout.resource_path(&hash)).await {
log::error!("ResourceStorage::garbage_collect failed to remove {hash}: {error}");
}
}
}
};
#[cfg(target_family = "wasm")]
wasm_bindgen_futures::spawn_local(gc_task);
#[cfg(not(target_family = "wasm"))]
futures::executor::block_on(gc_task);
}| let mut hashes_from_store: Vec<ResourceHash> = Vec::new(); | ||
| let mut links_to_promote: Vec<graph_storage::ResourceId> = Vec::new(); | ||
| for (id, entry) in &export_session.registry().resources { | ||
| let Some(hash) = entry.hash else { continue }; | ||
| let embed = entry.has_embedded_source() || options.embed_all_resources; | ||
| if !embed { | ||
| continue; | ||
| } | ||
| if !entry.has_embedded_source() { | ||
| links_to_promote.push(*id); | ||
| } | ||
| if !working_copy_hashes.contains(&hash) { | ||
| hashes_from_store.push(hash); | ||
| } | ||
| } |
There was a problem hiding this comment.
If multiple resources reference the same content hash, hashes_from_store will contain duplicate hashes. This leads to redundant async loads from the byte store. We should deduplicate hashes_from_store before loading the resources.
| let mut hashes_from_store: Vec<ResourceHash> = Vec::new(); | |
| let mut links_to_promote: Vec<graph_storage::ResourceId> = Vec::new(); | |
| for (id, entry) in &export_session.registry().resources { | |
| let Some(hash) = entry.hash else { continue }; | |
| let embed = entry.has_embedded_source() || options.embed_all_resources; | |
| if !embed { | |
| continue; | |
| } | |
| if !entry.has_embedded_source() { | |
| links_to_promote.push(*id); | |
| } | |
| if !working_copy_hashes.contains(&hash) { | |
| hashes_from_store.push(hash); | |
| } | |
| } | |
| let mut hashes_from_store: Vec<ResourceHash> = Vec::new(); | |
| let mut links_to_promote: Vec<graph_storage::ResourceId> = Vec::new(); | |
| for (id, entry) in &export_session.registry().resources { | |
| let Some(hash) = entry.hash else { continue }; | |
| let embed = entry.has_embedded_source() || options.embed_all_resources; | |
| if !embed { | |
| continue; | |
| } | |
| if !entry.has_embedded_source() { | |
| links_to_promote.push(*id); | |
| } | |
| if !working_copy_hashes.contains(&hash) { | |
| hashes_from_store.push(hash); | |
| } | |
| } | |
| hashes_from_store.sort_unstable(); | |
| hashes_from_store.dedup(); |
No description provided.