*/
+@JsonInclude(JsonInclude.Include.NON_NULL)
public record StageToggles(
- boolean splitter,
- boolean classifier,
+ @JsonProperty("splitter") boolean splitter,
+ @JsonProperty("classifier") boolean classifier,
@JsonProperty("field_validation") boolean fieldValidation,
@JsonProperty("visual_authenticity") boolean visualAuthenticity,
@JsonProperty("content_authenticity") boolean contentAuthenticity,
- boolean judge,
+ @JsonProperty("judge") boolean judge,
@JsonProperty("rule_engine") boolean ruleEngine,
@JsonProperty("judge_escalation") boolean judgeEscalation,
@JsonProperty("bbox_refine") boolean bboxRefine,
- boolean transform) {
+ @JsonProperty("transform") boolean transform) {
/** Service-default toggles (classifier + field_validation on, everything else off). */
public static StageToggles defaults() {
diff --git a/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/StandardFormat.java b/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/StandardFormat.java
index 4d190d1..ac88431 100644
--- a/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/StandardFormat.java
+++ b/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/StandardFormat.java
@@ -16,15 +16,18 @@
package com.firefly.flydocs.sdk.model;
+import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonValue;
/** JSON-Schema-style format hints applied at field validation time. */
public enum StandardFormat {
DATE("date"),
DATE_TIME("date-time"),
+ TIME("time"),
EMAIL("email"),
URI("uri"),
- UUID("uuid");
+ UUID("uuid"),
+ CURRENCY("currency");
private final String wire;
@@ -36,4 +39,17 @@ public enum StandardFormat {
public String wire() {
return wire;
}
+
+ @JsonCreator // Jackson factory: maps an incoming wire string back onto a constant
+ public static StandardFormat fromWire(String value) {
+ if (value == null) { // reject null up front with its own message
+ throw new IllegalArgumentException("StandardFormat value is null");
+ }
+ for (StandardFormat f : values()) { // linear scan over the declared constants
+ if (f.wire.equals(value)) { // exact, case-sensitive match on the wire string
+ return f;
+ }
+ }
+ throw new IllegalArgumentException("unknown StandardFormat: " + value); // no constant carries this wire string
+ }
}
diff --git a/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/StandardValidatorSpec.java b/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/StandardValidatorSpec.java
deleted file mode 100644
index 7de9fea..0000000
--- a/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/StandardValidatorSpec.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright 2026 Firefly Software Solutions Inc
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.firefly.flydocs.sdk.model;
-
-import com.fasterxml.jackson.annotation.JsonInclude;
-import java.util.Map;
-import org.jspecify.annotations.Nullable;
-
-/**
- * One built-in validator declaration attached to a {@link FieldSpec}.
- *
- * <p>{@code type} is a free string so new server-side validators don't
- * require an SDK release; the canonical names live in
- * {@code flydocs.interfaces.enums.standard_validator}.
- */
-@JsonInclude(JsonInclude.Include.NON_DEFAULT)
-public record StandardValidatorSpec(
- String type,
- @Nullable Map params,
- Severity severity) {
-
- /** Default severity is {@link Severity#ERROR} -- mirrors the service's contract. */
- public StandardValidatorSpec {
- if (severity == null) {
- severity = Severity.ERROR;
- }
- }
-
- public StandardValidatorSpec(String type) {
- this(type, null, Severity.ERROR);
- }
-
- public StandardValidatorSpec(String type, Map params) {
- this(type, params, Severity.ERROR);
- }
-
- /** Whether a validation error is hard (``error``) or soft (``warning``). */
- public enum Severity {
- ERROR("error"),
- WARNING("warning");
-
- private final String wire;
-
- Severity(String wire) {
- this.wire = wire;
- }
-
- @com.fasterxml.jackson.annotation.JsonValue
- public String wire() {
- return wire;
- }
- }
-}
diff --git a/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/SubmitJobRequest.java b/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/SubmitExtractionRequest.java
similarity index 62%
rename from sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/SubmitJobRequest.java
rename to sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/SubmitExtractionRequest.java
index f768ff6..aba12c7 100644
--- a/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/SubmitJobRequest.java
+++ b/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/SubmitExtractionRequest.java
@@ -25,28 +25,32 @@
import org.jspecify.annotations.Nullable;
/**
- * Request body for {@code POST /api/v1/jobs}.
+ * Request body for {@code POST /api/v1/extractions} (async).
*
* <p>Superset of {@link ExtractionRequest} — adds the optional
- * {@code callback_url} (for webhook delivery on terminal status) and
- * a free-form {@code metadata} bag echoed back on the webhook payload.
+ * {@code callback_url} (webhook on terminal status) and a free-form
+ * {@code metadata} bag echoed back on the webhook envelope.
*/
@JsonInclude(JsonInclude.Include.NON_NULL)
-public record SubmitJobRequest(
- String intention,
- List documents,
- List docs,
- List rules,
- ExtractionOptions options,
+public record SubmitExtractionRequest(
+ @JsonProperty("intention") String intention,
+ @JsonProperty("files") List files,
+ @JsonProperty("document_types") List documentTypes,
+ @JsonProperty("rules") List rules,
+ @JsonProperty("options") ExtractionOptions options,
@JsonProperty("callback_url") @Nullable String callbackUrl,
- Map metadata) {
+ @JsonProperty("metadata") Map metadata) {
- public SubmitJobRequest {
- if (intention == null) intention = "Extract structured data from the document.";
- documents = List.copyOf(documents);
- docs = List.copyOf(docs);
+ public SubmitExtractionRequest {
+ if (intention == null) {
+ intention = "Extract structured data from the document.";
+ }
+ files = List.copyOf(files);
+ documentTypes = List.copyOf(documentTypes);
rules = rules == null ? List.of() : List.copyOf(rules);
- if (options == null) options = ExtractionOptions.defaults();
+ if (options == null) {
+ options = ExtractionOptions.defaults();
+ }
metadata = metadata == null ? Map.of() : Map.copyOf(metadata);
}
@@ -57,27 +61,27 @@ public static Builder builder() {
/** Fluent builder. */
public static final class Builder {
private @Nullable String intention;
- private final List documents = new ArrayList<>();
- private final List docs = new ArrayList<>();
+ private final List files = new ArrayList<>();
+ private final List documentTypes = new ArrayList<>();
private final List rules = new ArrayList<>();
private ExtractionOptions options = ExtractionOptions.defaults();
private @Nullable String callbackUrl;
private final Map metadata = new HashMap<>();
public Builder intention(String s) { this.intention = s; return this; }
- public Builder addDocument(DocumentInput d) { this.documents.add(d); return this; }
- public Builder addDocSpec(DocSpec d) { this.docs.add(d); return this; }
+ public Builder addFile(FileInput f) { this.files.add(f); return this; }
+ public Builder addDocumentType(DocumentTypeSpec d) { this.documentTypes.add(d); return this; }
public Builder addRule(RuleSpec r) { this.rules.add(r); return this; }
public Builder options(ExtractionOptions o) { this.options = o; return this; }
public Builder callbackUrl(String url) { this.callbackUrl = url; return this; }
public Builder metadata(String key, Object value) { this.metadata.put(key, value); return this; }
public Builder metadata(Map m) { this.metadata.putAll(m); return this; }
- public SubmitJobRequest build() {
- return new SubmitJobRequest(
+ public SubmitExtractionRequest build() {
+ return new SubmitExtractionRequest(
intention,
- List.copyOf(documents),
- List.copyOf(docs),
+ List.copyOf(files),
+ List.copyOf(documentTypes),
List.copyOf(rules),
options,
callbackUrl,
diff --git a/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/TraceEntry.java b/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/TraceEntry.java
new file mode 100644
index 0000000..badbfb8
--- /dev/null
+++ b/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/TraceEntry.java
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2026 Firefly Software Solutions Inc
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.firefly.flydocs.sdk.model;
+
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import java.time.OffsetDateTime;
+
+/**
+ * One node's execution in the pipeline DAG.
+ *
+ * <p>{@code status} is one of {@code success | failed | skipped}.
+ */
+@JsonInclude(JsonInclude.Include.NON_NULL)
+public record TraceEntry(
+ @JsonProperty("node") String node,
+ @JsonProperty("started_at") OffsetDateTime startedAt,
+ @JsonProperty("completed_at") OffsetDateTime completedAt,
+ @JsonProperty("latency_ms") double latencyMs,
+ @JsonProperty("status") String status) {
+}
diff --git a/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/Transformation.java b/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/Transformation.java
new file mode 100644
index 0000000..2e26bf3
--- /dev/null
+++ b/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/Transformation.java
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2026 Firefly Software Solutions Inc
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.firefly.flydocs.sdk.model;
+
+import com.fasterxml.jackson.annotation.JsonSubTypes;
+import com.fasterxml.jackson.annotation.JsonTypeInfo;
+
+/**
+ * Sealed union of post-extraction transformation directives.
+ *
+ * <p>Discriminator is {@code type}. New declarative kinds add a new
+ * permitted record; the engine dispatches on the wire value.
+ *
+ * <ul>
+ * <li>{@link EntityResolutionTransformation} — deterministic dedup of
+ * array-rows by token overlap.
+ * <li>{@link LlmTransformation} — free-form LLM call against a target
+ * group.
+ * </ul>
+ */
+@JsonTypeInfo(
+ use = JsonTypeInfo.Id.NAME,
+ include = JsonTypeInfo.As.PROPERTY,
+ property = "type")
+@JsonSubTypes({
+ @JsonSubTypes.Type(value = EntityResolutionTransformation.class, name = "entity_resolution"),
+ @JsonSubTypes.Type(value = LlmTransformation.class, name = "llm"),
+})
+public sealed interface Transformation
+ permits EntityResolutionTransformation, LlmTransformation {
+
+ /** Stable id (server-generated when unset). */
+ String id();
+
+ /** Schema-side group name this transformation reads from. */
+ String targetGroup();
+
+ /** Optional rename of the output group. {@code null} means replace in place. */
+ String outputGroup();
+
+ /** {@code task} (per-document) or {@code request} (across all documents). */
+ TransformationScope scope();
+}
diff --git a/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/TransformationScope.java b/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/TransformationScope.java
new file mode 100644
index 0000000..d88936b
--- /dev/null
+++ b/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/TransformationScope.java
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2026 Firefly Software Solutions Inc
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.firefly.flydocs.sdk.model;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonValue;
+
+/** Whether a {@link Transformation} applies per-document or across the whole request. */
+public enum TransformationScope {
+ TASK("task"), // wire value "task"
+ REQUEST("request"); // wire value "request"
+
+ private final String wire; // string form used in JSON (see wire() / fromWire)
+
+ TransformationScope(String wire) {
+ this.wire = wire;
+ }
+
+ @JsonValue // Jackson serialises the constant as this string
+ public String wire() {
+ return wire;
+ }
+
+ @JsonCreator // Jackson factory: maps an incoming wire string back onto a constant
+ public static TransformationScope fromWire(String value) {
+ if (value == null) { // reject null up front with its own message
+ throw new IllegalArgumentException("TransformationScope value is null");
+ }
+ for (TransformationScope s : values()) { // linear scan over the declared constants
+ if (s.wire.equals(value)) { // exact, case-sensitive match on the wire string
+ return s;
+ }
+ }
+ throw new IllegalArgumentException("unknown TransformationScope: " + value); // no constant carries this wire string
+ }
+}
diff --git a/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/UsageBreakdown.java b/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/UsageBreakdown.java
new file mode 100644
index 0000000..9d0307d
--- /dev/null
+++ b/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/UsageBreakdown.java
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2026 Firefly Software Solutions Inc
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.firefly.flydocs.sdk.model;
+
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import java.util.Map;
+
+/** Aggregated token usage and cost across every LLM call of one request. */
+@JsonInclude(JsonInclude.Include.NON_NULL)
+public record UsageBreakdown(
+ @JsonProperty("total_input_tokens") long totalInputTokens,
+ @JsonProperty("total_output_tokens") long totalOutputTokens,
+ @JsonProperty("total_tokens") long totalTokens,
+ @JsonProperty("total_cost_usd") double totalCostUsd,
+ @JsonProperty("total_requests") long totalRequests,
+ @JsonProperty("total_latency_ms") double totalLatencyMs,
+ @JsonProperty("record_count") long recordCount,
+ @JsonProperty("cache_creation_tokens") long cacheCreationTokens,
+ @JsonProperty("cache_read_tokens") long cacheReadTokens,
+ @JsonProperty("by_agent") Map> byAgent, // NOTE(review): generic type arguments look truncated ("Map>") — restore from the original file
+ @JsonProperty("by_model") Map> byModel) { // NOTE(review): same truncation as by_agent — confirm the intended type arguments
+
+ public UsageBreakdown { // compact constructor: normalises the two breakdown maps
+ byAgent = byAgent == null ? Map.of() : Map.copyOf(byAgent); // null becomes an empty map; otherwise an unmodifiable copy
+ byModel = byModel == null ? Map.of() : Map.copyOf(byModel); // same normalisation as byAgent
+ }
+}
diff --git a/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/ValidationRule.java b/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/ValidationRule.java
new file mode 100644
index 0000000..69c94a3
--- /dev/null
+++ b/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/ValidationRule.java
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2026 Firefly Software Solutions Inc
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.firefly.flydocs.sdk.model;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonValue;
+
+/** Which validation check produced a given error. Wire values lowercase. */
+public enum ValidationRule {
+ TYPE("type"),
+ PATTERN("pattern"),
+ FORMAT("format"),
+ ENUM("enum"),
+ MINIMUM("minimum"),
+ MAXIMUM("maximum"),
+ VALIDATOR("validator");
+
+ private final String wire; // string form used in JSON (see wire() / fromWire)
+
+ ValidationRule(String wire) {
+ this.wire = wire;
+ }
+
+ @JsonValue // Jackson serialises the constant as this string
+ public String wire() {
+ return wire;
+ }
+
+ @JsonCreator // Jackson factory: maps an incoming wire string back onto a constant
+ public static ValidationRule fromWire(String value) {
+ if (value == null) { // reject null up front with its own message
+ throw new IllegalArgumentException("ValidationRule value is null");
+ }
+ for (ValidationRule r : values()) { // linear scan over the declared constants
+ if (r.wire.equals(value)) { // exact, case-sensitive match on the wire string
+ return r;
+ }
+ }
+ throw new IllegalArgumentException("unknown ValidationRule: " + value); // no constant carries this wire string
+ }
+}
diff --git a/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/ValidatorSpec.java b/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/ValidatorSpec.java
new file mode 100644
index 0000000..8ec8574
--- /dev/null
+++ b/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/ValidatorSpec.java
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2026 Firefly Software Solutions Inc
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.firefly.flydocs.sdk.model;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonValue;
+import java.util.Map;
+import org.jspecify.annotations.Nullable;
+
+/**
+ * One named built-in validator attached to a {@link Field}.
+ *
+ * <p>The dispatch key is {@code name} (not {@code type} — renamed to avoid
+ * collision with {@link Field#type()} when both appear in the same parent
+ * envelope). Canonical catalogue names: {@code iban}, {@code bic},
+ * {@code phone_e164}, {@code vat_id}, {@code email}, {@code uri}, {@code uuid},
+ * {@code date}, {@code datetime}, {@code time}, {@code iso_8601}, …
+ *
+ * <p>{@link Severity#ERROR} (default) hard-fails the field. {@link Severity#WARNING}
+ * records the issue but keeps the field {@code valid=true}.
+ */
+@JsonInclude(JsonInclude.Include.NON_NULL) // null components (e.g. params) are omitted from the serialised JSON
+public record ValidatorSpec(
+ @JsonProperty("name") String name,
+ @JsonProperty("params") @Nullable Map params, // stored as passed: the compact constructor makes no defensive copy
+ @JsonProperty("severity") Severity severity) {
+
+ public ValidatorSpec { // compact constructor: only normalises severity
+ if (severity == null) { // a missing severity falls back to ERROR
+ severity = Severity.ERROR;
+ }
+ }
+
+ public ValidatorSpec(String name) { // convenience: no params, ERROR severity
+ this(name, null, Severity.ERROR);
+ }
+
+ public ValidatorSpec(String name, Map params) { // convenience: given params, ERROR severity
+ this(name, params, Severity.ERROR);
+ }
+
+ /** Hard error vs soft warning. */
+ public enum Severity {
+ ERROR("error"),
+ WARNING("warning");
+
+ private final String wire; // string form used in JSON (see wire() / fromWire)
+
+ Severity(String wire) {
+ this.wire = wire;
+ }
+
+ @JsonValue // Jackson serialises the constant as this string
+ public String wire() {
+ return wire;
+ }
+
+ @JsonCreator // Jackson factory: maps an incoming wire string back onto a constant
+ public static Severity fromWire(String value) {
+ if (value == null) { // reject null up front with its own message
+ throw new IllegalArgumentException("Severity value is null");
+ }
+ for (Severity s : values()) { // linear scan over the declared constants
+ if (s.wire.equals(value)) { // exact, case-sensitive match on the wire string
+ return s;
+ }
+ }
+ throw new IllegalArgumentException("unknown Severity: " + value); // no constant carries this wire string
+ }
+ }
+}
diff --git a/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/VersionInfo.java b/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/VersionInfo.java
index bfff73b..aa87f0e 100644
--- a/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/VersionInfo.java
+++ b/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/VersionInfo.java
@@ -17,14 +17,16 @@
package com.firefly.flydocs.sdk.model;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonProperty;
/** Response body for {@code GET /api/v1/version}. */
@JsonIgnoreProperties(ignoreUnknown = true)
+@JsonInclude(JsonInclude.Include.NON_NULL)
public record VersionInfo(
- String service,
- String version,
- String model,
+ @JsonProperty("service") String service,
+ @JsonProperty("version") String version,
+ @JsonProperty("model") String model,
@JsonProperty("fallback_model") String fallbackModel,
@JsonProperty("eda_adapter") String edaAdapter) {
}
diff --git a/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/VisualCheck.java b/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/VisualCheck.java
new file mode 100644
index 0000000..8833dc9
--- /dev/null
+++ b/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/VisualCheck.java
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2026 Firefly Software Solutions Inc
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.firefly.flydocs.sdk.model;
+
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+/**
+ * One visual check the service should run against the document
+ * (signature presence, watermark, seal, …).
+ *
+ * <p>In v1, visual checks live on {@link DocumentTypeSpec#visualChecks()}
+ * directly — the v0 {@code validators.visual[]} envelope is gone.
+ */
+@JsonInclude(JsonInclude.Include.NON_NULL)
+public record VisualCheck(
+ @JsonProperty("name") String name,
+ @JsonProperty("description") String description) {
+}
diff --git a/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/VisualCheckResult.java b/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/VisualCheckResult.java
new file mode 100644
index 0000000..b831389
--- /dev/null
+++ b/sdks/java/flydocs-sdk/src/main/java/com/firefly/flydocs/sdk/model/VisualCheckResult.java
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2026 Firefly Software Solutions Inc
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.firefly.flydocs.sdk.model;
+
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import org.jspecify.annotations.Nullable;
+
+/** Outcome of one {@link VisualCheck} on a document. */
+@JsonInclude(JsonInclude.Include.NON_NULL)
+public record VisualCheckResult(
+ @JsonProperty("name") String name,
+ @JsonProperty("passed") boolean passed,
+ @JsonProperty("confidence") double confidence,
+ @JsonProperty("notes") @Nullable String notes) {
+}
diff --git a/sdks/java/flydocs-sdk/src/test/java/com/firefly/flydocs/sdk/FlydocsClientAsyncTest.java b/sdks/java/flydocs-sdk/src/test/java/com/firefly/flydocs/sdk/FlydocsClientAsyncTest.java
index a896f8e..54d369f 100644
--- a/sdks/java/flydocs-sdk/src/test/java/com/firefly/flydocs/sdk/FlydocsClientAsyncTest.java
+++ b/sdks/java/flydocs-sdk/src/test/java/com/firefly/flydocs/sdk/FlydocsClientAsyncTest.java
@@ -31,19 +31,18 @@
import static org.assertj.core.api.Assertions.assertThatThrownBy;
import com.firefly.flydocs.sdk.error.FlydocsHttpException;
-import com.firefly.flydocs.sdk.model.DocSpec;
-import com.firefly.flydocs.sdk.model.DocumentInput;
+import com.firefly.flydocs.sdk.model.DocumentTypeSpec;
+import com.firefly.flydocs.sdk.model.Extraction;
+import com.firefly.flydocs.sdk.model.ExtractionListQuery;
+import com.firefly.flydocs.sdk.model.ExtractionListResponse;
import com.firefly.flydocs.sdk.model.ExtractionRequest;
import com.firefly.flydocs.sdk.model.ExtractionResult;
-import com.firefly.flydocs.sdk.model.FieldSpec;
+import com.firefly.flydocs.sdk.model.ExtractionResultEnvelope;
+import com.firefly.flydocs.sdk.model.ExtractionStatus;
+import com.firefly.flydocs.sdk.model.Field;
import com.firefly.flydocs.sdk.model.FieldType;
-import com.firefly.flydocs.sdk.model.JobListResponse;
-import com.firefly.flydocs.sdk.model.JobResult;
-import com.firefly.flydocs.sdk.model.JobStatus;
-import com.firefly.flydocs.sdk.model.JobStatusResponse;
-import com.firefly.flydocs.sdk.model.SubmitJobRequest;
-import com.firefly.flydocs.sdk.model.SubmitJobResponse;
-import com.firefly.flydocs.sdk.model.VersionInfo;
+import com.firefly.flydocs.sdk.model.FileInput;
+import com.firefly.flydocs.sdk.model.SubmitExtractionRequest;
import com.github.tomakehurst.wiremock.WireMockServer;
import java.util.List;
import java.util.Map;
@@ -53,11 +52,11 @@
import reactor.test.StepVerifier;
/**
- * End-to-end mock tests for the reactive client.
+ * End-to-end mock tests for the reactive v1 client.
*
* <p>Each test stands up a WireMock stub that mimics what the real
* service would return, calls the SDK, and asserts both halves:
- * the request the SDK put on the wire matches the controller's
+ * the request the SDK put on the wire matches the controller's v1
* contract (path, body, headers) and the response is decoded onto
* the typed records.
- * <p>WireMock's {@code .willReturn} can chain responses via scenarios;
- * we use {@code inScenario} so each successive poll returns a
- * different state and the helper sees the state machine progress.
+ * <p>WireMock scenarios drive the state-machine progression.
- * <p>What's covered (low-cost, no LLM calls beyond ``validate``):
- * <ul>
- * <li>{@code version()} returns a plausibly-shaped service identity.
- * <li>{@code health(readiness)} returns {@code UP} (or any status).
- * <li>{@code validate(req)} round-trips a non-trivial schema and rules.
- * <li>{@code listJobs()} returns a paginated response.
- * <li>{@code getJob("does-not-exist")} surfaces a typed 404.
- * </ul>
- *
- * <p>Heavyweight paths -- ``extract`` and ``submitJob`` -- are
- * deliberately not exercised here because they consume real LLM tokens
- * unless the service is wired to a mock provider. Pair this with the
- * docker-compose test stack (mock-llm overlay) for that.
*/
@Tag("integration")
@EnabledIfEnvironmentVariable(named = "FLYDOCS_BASE_URL", matches = ".+")
@@ -92,46 +78,38 @@ void readinessHealthEndpointResponds() {
@Test
void validateAcceptsAWellFormedRequest() {
- DocSpec invoice = DocSpec.builder("invoice")
+ DocumentTypeSpec invoice = DocumentTypeSpec.builder("invoice")
.description("simple invoice")
.addFieldGroup("totals",
- FieldSpec.required("total_amount", FieldType.NUMBER),
- FieldSpec.required("currency", FieldType.STRING))
+ Field.required("total_amount", FieldType.NUMBER),
+ Field.required("currency", FieldType.STRING))
.build();
- // 1-byte placeholder PDF body -- validate is a dry-run, it
- // doesn't actually parse the document, only the schema graph.
+ // Minimal PDF placeholder body; validate is a dry-run.
ExtractionRequest req = ExtractionRequest.builder()
- .addDocument(DocumentInput.ofBytes(new byte[]{0x25, 0x50, 0x44, 0x46, 0x2d, 0x31, 0x2e, 0x34, 0x0a, 0x25, 0x25, 0x45, 0x4f, 0x46, 0x0a},
+ .addFile(FileInput.ofBytes(new byte[]{0x25, 0x50, 0x44, 0x46, 0x2d, 0x31, 0x2e, 0x34, 0x0a, 0x25, 0x25, 0x45, 0x4f, 0x46, 0x0a},
"placeholder.pdf"))
- .addDocSpec(invoice)
+ .addDocumentType(invoice)
.build();
Map report = flydocs.validate(req).block(Duration.ofSeconds(10));
assertThat(report).isNotNull();
- // The report always contains "errors" + "warnings" arrays, even
- // when empty -- this is the contract the sync controller pins.
assertThat(report).containsKey("errors");
assertThat(report).containsKey("warnings");
}
@Test
- void listJobsReturnsPaginatedResponse() {
- var page = flydocs.listJobs().block(Duration.ofSeconds(10));
+ void listExtractionsReturnsPaginatedResponse() {
+ var page = flydocs.extractions().list().block(Duration.ofSeconds(10));
assertThat(page).isNotNull();
- // jobs may be empty but the wrapper fields are always present.
assertThat(page.total()).isGreaterThanOrEqualTo(0);
assertThat(page.limit()).isGreaterThan(0);
}
@Test
- void getJobOfNonExistentIdReturnsTyped404() {
+ void getExtractionOfNonExistentIdReturnsTyped404() {
try {
- flydocs.getJob("00000000-0000-0000-0000-000000000000").block(Duration.ofSeconds(10));
- // Some deployments may treat the unknown id as anything;
- // the typed-404 contract is what we want to verify when
- // the service implements it. If we get a body, that's
- // also a legal answer.
+ flydocs.extractions().get("ext_does_not_exist").block(Duration.ofSeconds(10));
} catch (RuntimeException e) {
Throwable root = e;
while (root.getCause() != null && !(root instanceof FlydocsHttpException)) {
diff --git a/sdks/java/flydocs-sdk/src/test/java/com/firefly/flydocs/sdk/model/ModelMappingTest.java b/sdks/java/flydocs-sdk/src/test/java/com/firefly/flydocs/sdk/model/ModelMappingTest.java
index d107ade..9317674 100644
--- a/sdks/java/flydocs-sdk/src/test/java/com/firefly/flydocs/sdk/model/ModelMappingTest.java
+++ b/sdks/java/flydocs-sdk/src/test/java/com/firefly/flydocs/sdk/model/ModelMappingTest.java
@@ -18,25 +18,23 @@
import static org.assertj.core.api.Assertions.assertThat;
+import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
-import java.util.UUID;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
/**
- * Verifies Jackson round-trips the SDK's record DTOs as we expect.
+ * Verifies Jackson round-trips the SDK's record DTOs as we expect for the v1 wire.
*
*
Each test pins one half of the wire contract: either the JSON the
* service emits decodes onto our records, or the records we hand to
- * Jackson serialise back into JSON the service will accept. Skipping
- * this would let drift between camelCase fields (jobId) and snake_case
- * wire keys (job_id) sneak through.
For each builder we (1) construct a non-trivial request, (2)
- * serialise to JSON, and (3) assert the on-wire keys match what the
- * service expects. This pins both halves of the contract — the public
- * Java API and the JSON the service sees.
+ *
Each builder is exercised through (1) construction, (2) JSON
+ * serialisation, (3) assertions on the on-wire keys.
{@link FlydocsClientAsync} — reactive client, conditional on
- * {@code flydocs.base-url} being set and on no other
- * {@code FlydocsClientAsync} bean already existing.
- *
{@link FlydocsClient} — blocking facade over the async client,
- * same conditions plus the async bean.
- *
{@link WebhookVerifier} — only when
- * {@code flydocs.webhook.hmac-secret} is set.
+ * {@code flydocs.base-url} being set.
+ *
{@link FlydocsClient} — blocking facade over the async client.
+ *
{@link WebhookVerifier} — only when {@code flydocs.webhook.secret} is set.
+ *
{@link FlydocsWebhookArgumentResolver} + {@link FlydocsWebhookWebMvcConfigurer}
+ * — only when the {@link WebhookVerifier} bean is present and Spring MVC is
+ * on the classpath.
*
*
*
Both client beans declare {@code destroyMethod="close"}, so the
@@ -61,6 +65,9 @@ public FlydocsClientAsync flydocsClientAsync(FlydocsProperties props) {
.maxConnections(props.getMaxConnections())
.pendingAcquireTimeout(props.getPendingAcquireTimeout())
.maxInMemorySize(props.getMaxInMemorySize());
+ if (props.getApiKey() != null && !props.getApiKey().isEmpty()) {
+ b.apiKey(props.getApiKey());
+ }
if (props.getTenantId() != null && !props.getTenantId().isEmpty()) {
b.defaultHeader("X-Tenant-Id", props.getTenantId());
}
@@ -75,15 +82,44 @@ public FlydocsClient flydocsClient(FlydocsClientAsync async) {
@Bean
@ConditionalOnMissingBean
- @ConditionalOnProperty(prefix = "flydocs.webhook", name = "hmac-secret")
+ @ConditionalOnProperty(prefix = "flydocs.webhook", name = "secret")
public WebhookVerifier flydocsWebhookVerifier(FlydocsProperties props) {
- String secret = props.getWebhook().getHmacSecret();
+ String secret = props.getWebhook().getSecret();
if (secret == null || secret.isEmpty()) {
- // ConditionalOnProperty already filtered this, but the
- // accessor is nullable -- be explicit so the constructor
- // contract is clear.
- throw new IllegalStateException("flydocs.webhook.hmac-secret must be set");
+ throw new IllegalStateException("flydocs.webhook.secret must be set");
}
return new WebhookVerifier(secret);
}
+
+ /**
+ * Default Jackson mapper for webhook deserialisation. Falls back to a
+ * private instance when the application context has no
+ * {@link ObjectMapper} primary bean.
+ */
+ @Bean(name = "flydocsWebhookObjectMapper")
+ @ConditionalOnMissingBean(name = "flydocsWebhookObjectMapper")
+ public ObjectMapper flydocsWebhookObjectMapper() {
+ return new ObjectMapper()
+ .registerModule(new JavaTimeModule())
+ .disable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS);
+ }
+
+ /** Resolver that powers {@code @FlydocsWebhook} controller parameters. */
+ @Bean
+ @ConditionalOnBean(WebhookVerifier.class)
+ @ConditionalOnClass(name = "jakarta.servlet.http.HttpServletRequest")
+ @ConditionalOnMissingBean
+ public FlydocsWebhookArgumentResolver flydocsWebhookArgumentResolver(
+ WebhookVerifier verifier, ObjectMapper flydocsWebhookObjectMapper) {
+ return new FlydocsWebhookArgumentResolver(verifier, flydocsWebhookObjectMapper);
+ }
+
+ /** Wires the argument resolver into Spring MVC's resolver chain. */
+ @Bean
+ @ConditionalOnBean(FlydocsWebhookArgumentResolver.class)
+ @ConditionalOnClass(WebMvcConfigurer.class)
+ public FlydocsWebhookWebMvcConfigurer flydocsWebhookWebMvcConfigurer(
+ FlydocsWebhookArgumentResolver resolver) {
+ return new FlydocsWebhookWebMvcConfigurer(resolver);
+ }
}
diff --git a/sdks/java/flydocs-spring-boot-starter/src/main/java/com/firefly/flydocs/sdk/spring/FlydocsProperties.java b/sdks/java/flydocs-spring-boot-starter/src/main/java/com/firefly/flydocs/sdk/spring/FlydocsProperties.java
index afd07ae..d0386f0 100644
--- a/sdks/java/flydocs-spring-boot-starter/src/main/java/com/firefly/flydocs/sdk/spring/FlydocsProperties.java
+++ b/sdks/java/flydocs-spring-boot-starter/src/main/java/com/firefly/flydocs/sdk/spring/FlydocsProperties.java
@@ -23,17 +23,15 @@
/**
* Configuration properties for the flydocs SDK starter.
*
- *
All knobs are optional except {@link #baseUrl}; the starter's
- * auto-configuration refuses to wire the client beans without it. Set
- * via {@code application.yaml}:
+ *
{@link #baseUrl} is required; everything else has a sane default.
*/
@ConfigurationProperties(prefix = "flydocs")
@@ -46,6 +44,13 @@ public class FlydocsProperties {
@Nullable
private String baseUrl;
+ /**
+ * API key sent as {@code Authorization: Bearer <api-key>} on every request.
+ * Leave unset for unauthenticated local development.
+ */
+ @Nullable
+ private String apiKey;
+
/**
* Per-call HTTP response timeout (start of request to last byte
* received). Default 60s.
@@ -72,16 +77,13 @@ public class FlydocsProperties {
private Duration pendingAcquireTimeout = Duration.ofSeconds(45);
/**
- * Maximum response body the client buffers in memory. Default 64
- * MiB; large enough for the biggest extraction-result payload the
- * service emits today.
+ * Maximum response body the client buffers in memory. Default 64 MiB.
*/
private int maxInMemorySize = 64 * 1024 * 1024;
/**
* Optional caller identifier added as the {@code X-Tenant-Id}
- * header on every request. Leave unset if your service propagates
- * tenant information through some other mechanism.
+ * header on every request.
*/
@Nullable
private String tenantId;
@@ -97,6 +99,15 @@ public void setBaseUrl(@Nullable String baseUrl) {
this.baseUrl = baseUrl;
}
+ @Nullable
+ public String getApiKey() {
+ return apiKey;
+ }
+
+ public void setApiKey(@Nullable String apiKey) {
+ this.apiKey = apiKey;
+ }
+
public Duration getTimeout() {
return timeout;
}
@@ -161,20 +172,21 @@ public Webhook getWebhook() {
/** Nested webhook settings. */
public static class Webhook {
/**
- * HMAC-SHA256 secret used to verify inbound webhook
- * signatures. When set, the starter publishes a
- * {@code WebhookVerifier} bean.
+ * HMAC-SHA256 secret used to verify inbound webhook signatures.
+ * When set, the starter publishes a {@link com.firefly.flydocs.sdk.webhook.WebhookVerifier}
+ * bean and registers a {@link FlydocsWebhookArgumentResolver} so
+ * {@code @FlydocsWebhook} parameters resolve correctly.
*/
@Nullable
- private String hmacSecret;
+ private String secret;
@Nullable
- public String getHmacSecret() {
- return hmacSecret;
+ public String getSecret() {
+ return secret;
}
- public void setHmacSecret(@Nullable String hmacSecret) {
- this.hmacSecret = hmacSecret;
+ public void setSecret(@Nullable String secret) {
+ this.secret = secret;
}
}
}
diff --git a/sdks/java/flydocs-spring-boot-starter/src/main/java/com/firefly/flydocs/sdk/spring/FlydocsWebhook.java b/sdks/java/flydocs-spring-boot-starter/src/main/java/com/firefly/flydocs/sdk/spring/FlydocsWebhook.java
new file mode 100644
index 0000000..6970403
--- /dev/null
+++ b/sdks/java/flydocs-spring-boot-starter/src/main/java/com/firefly/flydocs/sdk/spring/FlydocsWebhook.java
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2026 Firefly Software Solutions Inc
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.firefly.flydocs.sdk.spring;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+import java.lang.annotation.Target;
+
+/**
+ * Annotation that marks a controller method parameter as an inbound flydocs
+ * webhook envelope. The Spring Boot starter resolves it by:
+ *
+ *
+ *
Reading the raw request body bytes.
+ *
Verifying the {@code X-Flydocs-Signature} HMAC-SHA256 header against
+ * the configured {@code flydocs.webhook.secret}.
+ *
Deserialising the body onto an
+ * {@link com.firefly.flydocs.sdk.model.EventEnvelope} record.
+ *
+ *
+ *
Signature mismatches surface as a
+ * {@link com.firefly.flydocs.sdk.webhook.WebhookVerificationException}.
+ */
+@Target(ElementType.PARAMETER)
+@Retention(RetentionPolicy.RUNTIME)
+public @interface FlydocsWebhook {
+}
diff --git a/sdks/java/flydocs-spring-boot-starter/src/main/java/com/firefly/flydocs/sdk/spring/FlydocsWebhookArgumentResolver.java b/sdks/java/flydocs-spring-boot-starter/src/main/java/com/firefly/flydocs/sdk/spring/FlydocsWebhookArgumentResolver.java
new file mode 100644
index 0000000..c5276a7
--- /dev/null
+++ b/sdks/java/flydocs-spring-boot-starter/src/main/java/com/firefly/flydocs/sdk/spring/FlydocsWebhookArgumentResolver.java
@@ -0,0 +1,76 @@
+/*
+ * Copyright 2026 Firefly Software Solutions Inc
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.firefly.flydocs.sdk.spring;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.firefly.flydocs.sdk.model.EventEnvelope;
+import com.firefly.flydocs.sdk.webhook.WebhookVerificationException;
+import com.firefly.flydocs.sdk.webhook.WebhookVerifier;
+import jakarta.servlet.http.HttpServletRequest;
+import java.io.IOException;
+import org.springframework.core.MethodParameter;
+import org.springframework.web.bind.support.WebDataBinderFactory;
+import org.springframework.web.context.request.NativeWebRequest;
+import org.springframework.web.method.support.HandlerMethodArgumentResolver;
+import org.springframework.web.method.support.ModelAndViewContainer;
+
+/**
+ * Servlet-stack {@link HandlerMethodArgumentResolver} for the
+ * {@link FlydocsWebhook} annotation.
+ *
+ *
Resolves a controller method parameter typed
+ * {@link EventEnvelope} by verifying the HMAC signature header and
+ * deserialising the raw request bytes.
+ */
+public class FlydocsWebhookArgumentResolver implements HandlerMethodArgumentResolver {
+
+ /** Header that carries the service's HMAC signature of the request body. */
+ public static final String SIGNATURE_HEADER = "X-Flydocs-Signature";
+
+ private final WebhookVerifier verifier;
+ private final ObjectMapper mapper;
+
+ public FlydocsWebhookArgumentResolver(WebhookVerifier verifier, ObjectMapper mapper) {
+ this.verifier = verifier;
+ this.mapper = mapper;
+ }
+
+ @Override
+ public boolean supportsParameter(MethodParameter parameter) {
+ return parameter.hasParameterAnnotation(FlydocsWebhook.class)
+ && EventEnvelope.class.isAssignableFrom(parameter.getParameterType());
+ }
+
+ @Override
+ public Object resolveArgument(
+ MethodParameter parameter,
+ ModelAndViewContainer mavContainer,
+ NativeWebRequest webRequest,
+ WebDataBinderFactory binderFactory) throws IOException {
+ HttpServletRequest request = webRequest.getNativeRequest(HttpServletRequest.class);
+ if (request == null) {
+ throw new IllegalStateException("FlydocsWebhook resolver requires a servlet request");
+ }
+ byte[] body = request.getInputStream().readAllBytes();
+ String signature = request.getHeader(SIGNATURE_HEADER);
+ if (signature == null) {
+ throw new WebhookVerificationException("missing " + SIGNATURE_HEADER + " header");
+ }
+ verifier.verify(body, signature);
+ return mapper.readValue(body, EventEnvelope.class);
+ }
+}
diff --git a/sdks/java/flydocs-spring-boot-starter/src/main/java/com/firefly/flydocs/sdk/spring/FlydocsWebhookWebMvcConfigurer.java b/sdks/java/flydocs-spring-boot-starter/src/main/java/com/firefly/flydocs/sdk/spring/FlydocsWebhookWebMvcConfigurer.java
new file mode 100644
index 0000000..ecbd27a
--- /dev/null
+++ b/sdks/java/flydocs-spring-boot-starter/src/main/java/com/firefly/flydocs/sdk/spring/FlydocsWebhookWebMvcConfigurer.java
@@ -0,0 +1,43 @@
+/*
+ * Copyright 2026 Firefly Software Solutions Inc
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.firefly.flydocs.sdk.spring;
+
+import java.util.List;
+import org.springframework.web.method.support.HandlerMethodArgumentResolver;
+import org.springframework.web.servlet.config.annotation.WebMvcConfigurer;
+
+/**
+ * Registers {@link FlydocsWebhookArgumentResolver} so controller methods
+ * annotated with {@link FlydocsWebhook} receive a verified
+ * {@link com.firefly.flydocs.sdk.model.EventEnvelope} argument.
+ *
+ *
Loaded by the starter only when a {@link com.firefly.flydocs.sdk.webhook.WebhookVerifier}
+ * bean is present and Spring MVC is on the classpath.
+ */
+public class FlydocsWebhookWebMvcConfigurer implements WebMvcConfigurer {
+
+ private final FlydocsWebhookArgumentResolver resolver;
+
+ public FlydocsWebhookWebMvcConfigurer(FlydocsWebhookArgumentResolver resolver) {
+ this.resolver = resolver;
+ }
+
+ @Override
+ public void addArgumentResolvers(List<HandlerMethodArgumentResolver> resolvers) {
+ resolvers.add(resolver);
+ }
+}
diff --git a/sdks/java/flydocs-spring-boot-starter/src/test/java/com/firefly/flydocs/sdk/spring/FlydocsAutoConfigurationTest.java b/sdks/java/flydocs-spring-boot-starter/src/test/java/com/firefly/flydocs/sdk/spring/FlydocsAutoConfigurationTest.java
index 3c31cc6..a328193 100644
--- a/sdks/java/flydocs-spring-boot-starter/src/test/java/com/firefly/flydocs/sdk/spring/FlydocsAutoConfigurationTest.java
+++ b/sdks/java/flydocs-spring-boot-starter/src/test/java/com/firefly/flydocs/sdk/spring/FlydocsAutoConfigurationTest.java
@@ -48,19 +48,20 @@ void registersAsyncAndBlockingClientsWhenBaseUrlSet() {
.run(ctx -> {
assertThat(ctx).hasSingleBean(FlydocsClientAsync.class);
assertThat(ctx).hasSingleBean(FlydocsClient.class);
- // Webhook verifier stays absent without an HMAC secret.
+ // Webhook verifier stays absent without a secret.
assertThat(ctx).doesNotHaveBean(WebhookVerifier.class);
});
}
@Test
- void registersWebhookVerifierWhenHmacSecretSet() {
+ void registersWebhookVerifierWhenSecretSet() {
runner
.withPropertyValues(
"flydocs.base-url=http://localhost:8400",
- "flydocs.webhook.hmac-secret=super-secret")
+ "flydocs.webhook.secret=super-secret")
.run(ctx -> {
assertThat(ctx).hasSingleBean(WebhookVerifier.class);
+ assertThat(ctx).hasSingleBean(FlydocsWebhookArgumentResolver.class);
});
}
@@ -69,6 +70,7 @@ void honoursTimeoutAndRetryProperties() {
runner
.withPropertyValues(
"flydocs.base-url=http://localhost:8400",
+ "flydocs.api-key=my-key",
"flydocs.timeout=30s",
"flydocs.max-attempts=3",
"flydocs.retry-min-backoff=500ms",
@@ -77,6 +79,7 @@ void honoursTimeoutAndRetryProperties() {
.run(ctx -> {
assertThat(ctx).hasSingleBean(FlydocsClientAsync.class);
FlydocsProperties props = ctx.getBean(FlydocsProperties.class);
+ assertThat(props.getApiKey()).isEqualTo("my-key");
assertThat(props.getMaxAttempts()).isEqualTo(3);
assertThat(props.getRetryMinBackoff().toMillis()).isEqualTo(500);
assertThat(props.getMaxConnections()).isEqualTo(20);
diff --git a/sdks/java/pom.xml b/sdks/java/pom.xml
index 40b5306..adb689a 100644
--- a/sdks/java/pom.xml
+++ b/sdks/java/pom.xml
@@ -20,7 +20,7 @@
com.firefly.flydocsflydocs-sdk-parent
- 26.05.02
+ 26.6.0pomflydocs Java SDK (parent)
diff --git a/sdks/python/QUICKSTART.md b/sdks/python/QUICKSTART.md
index 627a4a3..a370eb0 100644
--- a/sdks/python/QUICKSTART.md
+++ b/sdks/python/QUICKSTART.md
@@ -1,13 +1,13 @@
-# flydocs Python SDK — Quickstart
+# flydocs Python SDK — Quickstart (v1)
-The fastest path from zero to your first extracted invoice. Five minutes, end to end.
+The fastest path from zero to your first extracted invoice. Five minutes, end to end. This document covers the **v1 contract** released in `26.6.0`.
---
## 1. Install (30 s)
```bash
-uv add https://github.com/firefly-operationOS/flydocs/releases/download/v26.05.01/flydocs_sdk-26.5.1-py3-none-any.whl
+uv add https://github.com/firefly-operationOS/flydocs/releases/download/v26.06.00/flydocs_sdk-26.6.0-py3-none-any.whl
```
The SDK depends only on `httpx` and `pydantic`.
@@ -28,50 +28,54 @@ If you already have a running flydocs deployment, point `base_url` at it and ski
# quickstart.py
import asyncio
from flydocs_sdk import (
- AsyncFlydocsClient,
- DocSpec,
- DocumentInput,
+ AsyncClient,
+ DocumentTypeSpec,
ExtractionRequest,
+ Field,
FieldGroup,
- FieldSpec,
FieldType,
+ FileInput,
)
async def main() -> None:
- # 1. Describe what you want extracted. The DocSpec carries the field
- # schema; the FieldGroup bundles related fields under one name.
- invoice = DocSpec(
- doc_type={"documentType": "invoice"},
+ # 1. Describe what you want extracted. The DocumentTypeSpec carries
+ # the schema; the FieldGroup bundles related fields under one name.
+ invoice = DocumentTypeSpec(
+ id="invoice",
field_groups=[
- FieldGroup.of(
- "totals",
- FieldSpec(field_name="total_amount", field_type=FieldType.NUMBER, required=True),
- FieldSpec(field_name="currency", field_type=FieldType.STRING, required=True),
- )
+ FieldGroup(
+ name="totals",
+ fields=[
+ Field(name="total_amount", type=FieldType.NUMBER, required=True),
+ Field(name="currency", type=FieldType.STRING, required=True),
+ ],
+ ),
],
)
- # 2. Build the request -- one or more files + one or more DocSpecs.
+ # 2. Build the request -- one or more files + one or more document types.
+ # v1 keys: ``files`` (was ``documents``) and ``document_types`` (was ``docs``).
request = ExtractionRequest(
- documents=[DocumentInput.from_path("invoice.pdf")],
- docs=[invoice],
+ files=[FileInput.from_path("invoice.pdf")],
+ document_types=[invoice],
)
- # 3. Call the service. AsyncFlydocsClient is the primary integration
- # surface; close it as a context manager.
- async with AsyncFlydocsClient("http://localhost:8400") as flydocs:
+ # 3. Call the service. AsyncClient is the primary integration surface;
+ # close it as a context manager.
+ async with AsyncClient("http://localhost:8400") as flydocs:
result = await flydocs.extract(request)
- # 4. Read the response. Each ExtractedDocument has fieldGroups, each
- # with extracted fields carrying value / confidence / bbox.
- print(f"model={result.model} latency={result.latency_ms} ms")
+ # 4. Read the response. v1 nests model + latency under ``result.pipeline``;
+ # each Document has ``field_groups``, each with ``fields``.
+ print(f"id={result.id} model={result.pipeline.model} "
+ f"latency={result.pipeline.latency_ms} ms")
for doc in result.documents:
- for group in doc["fields"]:
- for field in group["fieldGroupFields"]:
+ for group in doc.field_groups:
+ for field in group.fields:
print(
- f" {field['name']:>15} = {field.get('value')!r:>20} "
- f"conf={field.get('confidence', 0):.2f}"
+ f" {field.name:>15} = {field.value!r:>20} "
+ f"conf={field.confidence:.2f}"
)
@@ -80,7 +84,7 @@ asyncio.run(main())
```bash
uv run python quickstart.py
-# model=anthropic:claude-sonnet-4-6 latency=412 ms
+# id=ext_a1b2c3 model=anthropic:claude-sonnet-4-6 latency=412 ms
# total_amount = 1234.56 conf=0.97
# currency = 'EUR' conf=0.99
```
@@ -92,18 +96,19 @@ That's it — you've extracted structured data from a document.
## What next
- **[TUTORIAL.md](./TUTORIAL.md)** — the full payload composition reference: every field, every option, every variant, with constraints and worked examples.
-- **[examples/](./examples/)** — six runnable scripts: typed schema + rules, async jobs with `wait_for_completion`, webhook receiver, error handling, sync facade.
+- **[examples/](./examples/)** — six runnable scripts: typed schema + rules, async extractions with `wait_for_completion`, webhook receiver, error handling, sync facade.
- **[README.md](./README.md)** — feature matrix, API surface table, error model.
+- **[docs/migration-v0-to-v1.md](../../docs/migration-v0-to-v1.md)** — full table of v0 → v1 renames if you are upgrading.
## Need a synchronous API?
If you can't run an event loop:
```python
-from flydocs_sdk import FlydocsClient
+from flydocs_sdk import Client
-with FlydocsClient("http://localhost:8400") as flydocs:
+with Client("http://localhost:8400") as flydocs:
result = flydocs.extract(request)
```
-`FlydocsClient` mirrors `AsyncFlydocsClient` method-for-method, just without `await`. Prefer the async client whenever you can.
+`Client` mirrors `AsyncClient` method-for-method, just without `await`. Prefer the async client whenever you can.
diff --git a/sdks/python/README.md b/sdks/python/README.md
index 9d427d1..349a235 100644
--- a/sdks/python/README.md
+++ b/sdks/python/README.md
@@ -1,28 +1,18 @@
# flydocs Python SDK
-Official Python client for [flydocs](https://github.com/firefly-operationOS/flydocs) — the pure-multimodal Intelligent Document Processing service from Firefly OperationOS.
+Official Python client for [flydocs](https://github.com/firefly-operationOS/flydocs) — the pure-multimodal Intelligent Document Processing service from Firefly OperationOS. **v1 contract** (snake_case everywhere, `Extraction` lifecycle, single `EventEnvelope` for EDA + webhooks).
- **Async-first** over `httpx` with a synchronous wrapper.
-- **Typed** with Pydantic v2 — forward-compatible by design (unknown fields are preserved).
-- **Typed errors** mapping the service's RFC 7807 problem-details.
-- **Webhook verification** with constant-time HMAC.
+- **Typed** with Pydantic v2 — forward-compatible (`extra="allow"` everywhere).
+- **Typed errors** mapping the service's RFC 7807 problem-details (`code`, `title`, `detail`, `instance`, `extensions`).
+- **Webhook verification** with constant-time HMAC; returns a typed `EventEnvelope`.
## Install
The wheel is attached to every `vX.Y.Z` GitHub Release of [firefly-operationOS/flydocs](https://github.com/firefly-operationOS/flydocs). There is no PyPI publish; install the wheel directly from the release URL with [`uv`](https://docs.astral.sh/uv/):
```bash
-uv add https://github.com/firefly-operationOS/flydocs/releases/download/v26.05.01/flydocs_sdk-26.5.1-py3-none-any.whl
-```
-
-Or pin it in your `pyproject.toml`:
-
-```toml
-[project]
-dependencies = ["flydocs-sdk"]
-
-[tool.uv.sources]
-flydocs-sdk = { url = "https://github.com/firefly-operationOS/flydocs/releases/download/v26.05.01/flydocs_sdk-26.5.1-py3-none-any.whl" }
+uv add https://github.com/firefly-operationOS/flydocs/releases/download/v26.06.00/flydocs_sdk-26.6.0-py3-none-any.whl
```
The SDK depends only on `httpx` and `pydantic`.
@@ -31,78 +21,82 @@ The SDK depends only on `httpx` and `pydantic`.
```python
from flydocs_sdk import (
- DocSpec,
- DocumentInput,
+ Client,
+ DocumentTypeSpec,
ExtractionOptions,
ExtractionRequest,
+ Field,
FieldGroup,
- FieldSpec,
FieldType,
- FlydocsClient,
+ FileInput,
StageToggles,
)
-invoice = DocSpec(
- doc_type={"documentType": "invoice"},
+invoice = DocumentTypeSpec(
+ id="invoice",
field_groups=[
- FieldGroup.of(
- "totals",
- FieldSpec(field_name="total_amount", field_type=FieldType.NUMBER, required=True),
- FieldSpec(field_name="currency", field_type=FieldType.STRING, required=True),
- )
+ FieldGroup(
+ name="totals",
+ fields=[
+ Field(name="total_amount", type=FieldType.NUMBER, required=True),
+ Field(name="currency", type=FieldType.STRING, required=True),
+ ],
+ ),
],
)
-with FlydocsClient("http://localhost:8400") as flydocs:
+with Client("http://localhost:8400") as flydocs:
result = flydocs.extract(
ExtractionRequest(
- documents=[DocumentInput.from_path("invoice.pdf")],
- docs=[invoice],
- options=ExtractionOptions(stages=StageToggles(judge=True, bbox_refine=True)),
+ files=[FileInput.from_path("invoice.pdf")],
+ document_types=[invoice],
+ options=ExtractionOptions(
+ stages=StageToggles(judge=True, bbox_refine=True),
+ ),
)
)
-print(result.model, "latency:", result.latency_ms, "ms")
+print(result.pipeline.model, "latency:", result.pipeline.latency_ms, "ms")
for doc in result.documents:
- for group in doc["fields"]:
- for field in group["fieldGroupFields"]:
- print(field["name"], "=", field.get("value"))
+ for group in doc.field_groups:
+ for field in group.fields:
+ print(field.name, "=", field.value)
```
-> **See [TUTORIAL.md](./TUTORIAL.md) for the full walkthrough** — schemas, rules, async jobs, webhooks, errors.
+> **See [TUTORIAL.md](./TUTORIAL.md) for the full walkthrough** — schemas, rules, async extractions, webhooks, errors.
## Quickstart (async, with `wait_for_completion`)
```python
import asyncio
from flydocs_sdk import (
- AsyncFlydocsClient,
- DocumentInput,
- JobStatus,
- SubmitJobRequest,
+ AsyncClient,
+ ExtractionStatus,
+ FileInput,
+ SubmitExtractionRequest,
)
async def main() -> None:
- async with AsyncFlydocsClient("http://localhost:8400") as flydocs:
- submit = await flydocs.submit_job(
- SubmitJobRequest(
- documents=[DocumentInput.from_path("invoice.pdf")],
- docs=[invoice], # typed DocSpec from the sync example above
+ async with AsyncClient("http://localhost:8400") as flydocs:
+ ext = await flydocs.extractions.create(
+ SubmitExtractionRequest(
+ files=[FileInput.from_path("invoice.pdf")],
+ document_types=[invoice], # typed DocumentTypeSpec from above
callback_url="https://example.com/webhook",
metadata={"caller": "my-app"},
),
idempotency_key="my-app:invoice:42",
)
- print("queued", submit.job_id)
+ print("queued", ext.id)
final = await flydocs.wait_for_completion(
- submit.job_id, poll_interval=2.0, timeout=600.0
+ ext.id, poll_interval=2.0, timeout=600.0
)
- if final.status == JobStatus.SUCCEEDED:
- result = await flydocs.get_job_result(submit.job_id)
- print("got", len(result.result.documents), "documents")
- else:
- print("job did not succeed:", final.status, final.error_message)
+ if final.status == ExtractionStatus.SUCCEEDED:
+ envelope = await flydocs.extractions.get_result(ext.id)
+ print("got", len(envelope.result.documents), "documents")
+ elif final.error is not None:
+ print("did not succeed:", final.status.value, final.error.code, final.error.message)
asyncio.run(main())
```
@@ -110,7 +104,8 @@ asyncio.run(main())
## Webhook verification
```python
-from flydocs_sdk import WebhookVerifier, WebhookVerificationError, JobWebhookPayload
+import os
+from flydocs_sdk import EVENT_TYPE_EXTRACTION_COMPLETED, WebhookVerificationError, WebhookVerifier
verifier = WebhookVerifier(secret=os.environ["FLYDOCS_WEBHOOK_HMAC_SECRET"])
@@ -118,39 +113,42 @@ verifier = WebhookVerifier(secret=os.environ["FLYDOCS_WEBHOOK_HMAC_SECRET"])
raw_body: bytes = await request.body()
signature_header: str = request.headers.get("X-Flydocs-Signature", "")
try:
- verifier.verify(raw_body, signature_header)
+ envelope = verifier.verify(raw_body, signature_header) # typed EventEnvelope
except WebhookVerificationError:
return 403, "invalid signature"
-payload = JobWebhookPayload.model_validate_json(raw_body)
-# ... handle payload.status, payload.result, etc.
+if envelope.event_type == EVENT_TYPE_EXTRACTION_COMPLETED and envelope.result is not None:
+ for doc in envelope.result.documents:
+ ... # persist, fan out downstream work
```
## API surface
-| SDK method | HTTP | Returns |
-|-----------------------|---------------------------------------|---------------------------------|
-| `extract` | `POST /api/v1/extract` | `ExtractionResult` |
-| `validate` | `POST /api/v1/extract:validate` | `dict` (validation report) |
-| `submit_job` | `POST /api/v1/jobs` | `SubmitJobResponse` |
-| `get_job` | `GET /api/v1/jobs/{id}` | `JobStatusResponse` |
-| `get_job_result` | `GET /api/v1/jobs/{id}/result` | `JobResult` |
-| `list_jobs` | `GET /api/v1/jobs` | `JobListResponse` |
-| `cancel_job` | `DEL /api/v1/jobs/{id}` | `JobStatusResponse` |
-| `wait_for_completion` | polls `GET /api/v1/jobs/{id}` | `JobStatusResponse` (terminal) |
-| `version` | `GET /api/v1/version` | `VersionInfo` |
-| `health` | `GET /actuator/health/{probe}` | `dict` |
+| SDK method | HTTP | Returns |
+|-------------------------------------------------|--------------------------------------------------------|-------------------------------|
+| `client.extract(req)` | `POST /api/v1/extract` | `ExtractionResult` |
+| `client.validate(req)` | `POST /api/v1/extract:validate` | `ValidationResponse` |
+| `client.extractions.create(req, idempotency_key=...)` | `POST /api/v1/extractions` | `Extraction` (202) |
+| `client.extractions.list(...)` | `GET /api/v1/extractions` | `ExtractionListResponse` |
+| `client.extractions.get(id)` | `GET /api/v1/extractions/{id}` | `Extraction` |
+| `client.extractions.get_result(id, wait_for_bboxes=, timeout=)` | `GET /api/v1/extractions/{id}/result` | `ExtractionResultEnvelope` |
+| `client.extractions.cancel(id)` | `DELETE /api/v1/extractions/{id}` | `Extraction` |
+| `client.wait_for_completion(id, ...)` | polls `GET /api/v1/extractions/{id}` | `Extraction` (terminal) |
+| `client.version()` | `GET /api/v1/version` | `VersionInfo` |
+| `client.health()` | `GET /actuator/health/{probe}` | `dict` |
## Typed request models
-| Type | Purpose |
-|----------------------------|-------------------------------------------------------------------------------|
-| `StageToggles` | Opt-in switches for every optional pipeline stage. |
-| `ExtractionOptions` | Per-request knobs (model, language hint, stages, escalation, transformations).|
-| `DocSpec` + `DocType` | One expected document type plus its field schema and validators. |
-| `FieldGroup`, `FieldSpec`, `FieldItem` | Field schema (recursive: array fields nest items). |
-| `StandardValidatorSpec` | Built-in field validator (IBAN, BIC, VAT_ID, …) attached to a `FieldSpec`. |
-| `RuleSpec` + `RuleFieldParent` / `RuleValidatorParent` / `RuleRuleParent` | Business-rule DAG. |
+| Type | Purpose |
+|--------------------------------|------------------------------------------------------------------------------------------------------|
+| `StageToggles` | Opt-in switches for every optional pipeline stage. |
+| `ExtractionOptions` | Per-request knobs (`model`, `language_hint`, `stages`, `escalation`, `transformations`). |
+| `EscalationConfig` | Replaces v0 `escalation_threshold` + `escalation_model` (nested under `ExtractionOptions.escalation`). |
+| `DocumentTypeSpec` | Flattened v0 `DocSpec` + `DocType` (`id` / `description` / `country` are top-level fields). |
+| `FieldGroup`, `Field` | Single recursive `Field` (arrays via `items`, objects via `fields`) replaces v0 `FieldSpec` + `FieldItem`. |
+| `ValidatorSpec` | Built-in field validator (`iban`, `vat_id`, ...); dispatch key is `name` (was `type` in v0). |
+| `VisualCheck` | One visual check; lives on `DocumentTypeSpec.visual_checks` (was nested under `ValidatorsSpec.visual`). |
+| `RuleSpec` + `Rule{Field,Validator,Rule}Parent` | Business-rule DAG; parent discriminator is `kind` (was `parentType`). |
## Errors
@@ -158,26 +156,43 @@ Every error subclasses `FlydocsError`:
- `FlydocsTimeoutError` — the HTTP request itself timed out on the wire.
- `FlydocsClientError` — other transport problems (DNS, connect, TLS).
-- `FlydocsHTTPError` — the service answered with a 4xx/5xx. Carries `status_code`, `code`, `title`, `detail`, and the raw `payload` dict.
+- `FlydocsHttpError` — the service answered with a 4xx/5xx. Carries `status_code`, `code`, `title`, `detail`, `type`, `instance`, `extensions`, and the raw `payload` dict.
-The service emits RFC 7807-ish bodies with `code` / `title` / `detail`; the SDK decodes those onto the typed exception so you can branch:
+The service emits RFC 7807 bodies with `code` / `title` / `detail`. The v1 codes are: `not_found`, `not_ready`, `not_cancellable`, `timeout`, `file_too_large`, `unsupported_file`, `validation_failed`, `invalid_base64`, `invalid_request`, `encrypted_pdf`, `office_conversion_failed`, `archive_extraction_failed`, `image_conversion_failed`, `unauthorized`. The SDK doesn't pin to that set; it just exposes whatever the server sends.
```python
try:
flydocs.extract(req)
-except FlydocsHTTPError as e:
- if e.code == "extraction_timeout":
+except FlydocsHttpError as e:
+ if e.code == "timeout":
# fall back to async
- flydocs.submit_job(req)
+ flydocs.extractions.create(req)
```
+## Migrating from v0
+
+Read [`docs/migration-v0-to-v1.md`](../../docs/migration-v0-to-v1.md) for the complete rename / reshape table. The most common changes:
+
+- `DocumentInput` → `FileInput`; `documents` → `files`; `document_type` → `expected_type`.
+- `DocSpec` + `DocType` → `DocumentTypeSpec` (flat); `docs` → `document_types`.
+- `FieldSpec` + `FieldItem` → single recursive `Field`.
+- `StandardValidatorSpec` → `ValidatorSpec`; dispatch key `type` → `name`.
+- `VisualValidatorSpec` + `ValidatorsSpec.visual` → `VisualCheck` + `DocumentTypeSpec.visual_checks`.
+- `JobStatus` → `ExtractionStatus`; values are lowercase; `PARTIAL_SUCCEEDED` / `REFINING_BBOXES` are gone.
+- `SubmitJobRequest`/`JobStatusResponse`/`SubmitJobResponse`/`JobResult`/`JobListResponse` → `SubmitExtractionRequest`/`Extraction`/`ExtractionResultEnvelope`/`ExtractionListResponse`.
+- `JobWebhookPayload` → `EventEnvelope` (carries the event-type discriminator and the typed `Extraction` snapshot).
+- Endpoints: `/api/v1/jobs/*` → `/api/v1/extractions/*`.
+- Response: top-level `model`/`latency_ms`/`trace`/`pipeline_errors`/`usage` collapse into `pipeline: PipelineMeta`.
+
## Development
```bash
cd sdks/python
-pip install -e ".[dev]"
-pytest
-ruff check src tests
+uv sync --extra dev
+uv run pytest
+uv run ruff check .
+uv run ruff format --check .
+uv build
```
## License
diff --git a/sdks/python/TUTORIAL.md b/sdks/python/TUTORIAL.md
index 3f1d577..90b9e11 100644
--- a/sdks/python/TUTORIAL.md
+++ b/sdks/python/TUTORIAL.md
@@ -1,6 +1,6 @@
-# flydocs Python SDK — Tutorial
+# flydocs Python SDK — Tutorial (v1 contract)
-A complete, payload-composition-focused reference for the flydocs Python SDK. Every typed model is documented: **what it carries**, **what variants exist**, **what values are accepted**, **what the defaults are**, and **what the wire shape looks like**.
+A complete, payload-composition-focused reference for the flydocs Python SDK. Every typed model is documented: **what it carries**, **what variants exist**, **what values are accepted**, **what the defaults are**, and **what the wire shape looks like**. Targets SDK `26.6.0` and the v1 server contract.
> **Audience.** Engineers integrating flydocs into a Python codebase who want to know exactly which knobs exist and how to compose them. For a 5-minute zero-to-first-extraction, see [QUICKSTART.md](./QUICKSTART.md).
>
@@ -12,14 +12,14 @@ A complete, payload-composition-focused reference for the flydocs Python SDK. Ev
1. [The mental model](#1-the-mental-model)
2. [`ExtractionRequest` — the top-level envelope](#2-extractionrequest--the-top-level-envelope)
-3. [`DocumentInput` — input files](#3-documentinput--input-files)
-4. [`DocSpec` — what to extract](#4-docspec--what-to-extract)
-5. [`FieldSpec` & `FieldItem` — field-level shape and constraints](#5-fieldspec--fielditem--field-level-shape-and-constraints)
-6. [`StandardValidatorSpec` — built-in validators (full catalogue)](#6-standardvalidatorspec--built-in-validators-full-catalogue)
+3. [`FileInput` — input files](#3-fileinput--input-files)
+4. [`DocumentTypeSpec` — what to extract](#4-documenttypespec--what-to-extract)
+5. [`Field` & `FieldGroup` — recursive field schema](#5-field--fieldgroup--recursive-field-schema)
+6. [`ValidatorSpec` — built-in validators (full catalogue)](#6-validatorspec--built-in-validators-full-catalogue)
7. [`ExtractionOptions` & `StageToggles` — pipeline configuration](#7-extractionoptions--stagetoggles--pipeline-configuration)
8. [`RuleSpec` — business rules over extracted fields](#8-rulespec--business-rules-over-extracted-fields)
9. [Transformations — post-extraction reshaping](#9-transformations--post-extraction-reshaping)
-10. [Async jobs — `SubmitJobRequest`, callbacks, idempotency](#10-async-jobs--submitjobrequest-callbacks-idempotency)
+10. [Async extractions — `SubmitExtractionRequest`, callbacks, idempotency](#10-async-extractions--submitextractionrequest-callbacks-idempotency)
11. [Webhooks — receiving and verifying delivery](#11-webhooks--receiving-and-verifying-delivery)
12. [Errors — RFC 7807 problem-details](#12-errors--rfc-7807-problem-details)
13. [Production patterns](#13-production-patterns)
@@ -33,12 +33,12 @@ A complete, payload-composition-focused reference for the flydocs Python SDK. Ev
A flydocs request carries three things:
```
- ┌─────────────────── ExtractionRequest ─────────────────┐
+ ┌────────────────── ExtractionRequest ──────────────────┐
│ │
- │ documents: [DocumentInput, ...] ← the bytes │
- │ docs: [DocSpec, ...] ← the schema │
- │ rules: [RuleSpec, ...] ← the predicates │
- │ options: ExtractionOptions ← the knobs │
+ │ files: [FileInput, ...] ← the bytes │
+ │ document_types: [DocumentTypeSpec] ← the schema │
+ │ rules: [RuleSpec, ...] ← the predicates│
+ │ options: ExtractionOptions ← the knobs │
│ │
└───────────────────────────────────────────────────────┘
```
@@ -46,282 +46,272 @@ A flydocs request carries three things:
The service runs a configurable pipeline:
```
- documents → splitter? → classifier? → extract (always) → field_validation? →
- → visual_authenticity? → content_authenticity? → judge? →
- → judge_escalation? → bbox_refine? → transform? → rule_engine? → assemble
+ files → splitter? → classifier? → extract (always) → field_validation? →
+ → visual_authenticity? → content_authenticity? → judge? →
+ → judge_escalation? → bbox_refine? → transform? → rule_engine? → assemble
```
-`extract` is mandatory; every other stage is opt-in via `StageToggles`. The response (`ExtractionResult`) carries one entry per resolved `DocSpec` under `documents`, plus per-stage trace and (when enabled) rule results, transformation outputs, judge verdicts, etc.
+`extract` is mandatory; every other stage is opt-in via `StageToggles`. The response (`ExtractionResult`) carries one entry per resolved `DocumentTypeSpec` under `documents`, plus per-stage trace and (when enabled) rule results, transformation outputs, judge verdicts, etc.
Two integration modes share the same request shape:
-| Mode | Method | When to use |
-|-----------------|-----------------------------------|-------------------------------------------------------------|
-| **Sync extract** | `await flydocs.extract(req)` | Single document, sub-minute. Caller waits on the HTTP call. |
-| **Async jobs** | `await flydocs.submit_job(req)` + `await flydocs.wait_for_completion(job_id)` | Long-running, batches, webhook-delivered results. |
+| Mode | Method | When to use |
+|--------------------|----------------------------------------------------------|-------------------------------------------------------------|
+| **Sync extract** | `await flydocs.extract(req)` | Single document, sub-minute. Caller waits on the HTTP call. |
+| **Async** | `await flydocs.extractions.create(req)` + `await flydocs.wait_for_completion(id)` | Long-running, batches, webhook-delivered results. |
---
## 2. `ExtractionRequest` — the top-level envelope
-| Field | Type | Default | Required | Notes |
-|-----------------|-----------------------------------------|--------------------------|----------|--------------------------------------------------------------------|
-| `request_id` | `UUID` | random UUIDv4 | no | Use it to correlate logs / re-fetch by id later. |
-| `intention` | `str` | `"Extract structured data from the document."` | no | Free-form guidance for every LLM node (extract, judge, rules, …). |
-| `documents` | `list[DocumentInput]` | — | **yes**, min length 1 | Input files. A single file is a one-element list. |
-| `docs` | `list[DocSpec \| dict]` | — | **yes**, min length 1 | One entry per **expected document type**. |
-| `rules` | `list[RuleSpec \| dict]` | `[]` | no | Business-rule DAG. See §8. |
-| `options` | `ExtractionOptions \| dict` | `ExtractionOptions()` | no | Per-request knobs. See §7. |
+| Field | Type | Default | Required | Notes |
+|-------------------|-------------------------------------------------|--------------------------|----------|--------------------------------------------------------------------|
+| `intention` | `str` | `"Extract structured data from the document."` | no | Free-form guidance for every LLM node (extract, judge, rules, …). |
+| `files` | `list[FileInput]` | — | **yes**, min length 1 | Input files. A single file is a one-element list. |
+| `document_types` | `list[DocumentTypeSpec \| dict]` | — | **yes**, min length 1 | One entry per **expected document type**. |
+| `rules` | `list[RuleSpec \| dict]` | `[]` | no | Business-rule DAG. See §8. |
+| `options` | `ExtractionOptions \| dict` | `ExtractionOptions()` | no | Per-request knobs. See §7. |
Every field that takes a typed model also accepts a plain `dict` — useful for forward-compatibility with server-side fields the SDK hasn't surfaced yet.
```python
from flydocs_sdk import (
- DocSpec, DocumentInput, ExtractionRequest,
- FieldGroup, FieldSpec, FieldType,
+ DocumentTypeSpec, ExtractionRequest, Field, FieldGroup, FieldType, FileInput,
)
-invoice = DocSpec(
- doc_type={"documentType": "invoice"},
+invoice = DocumentTypeSpec(
+ id="invoice",
field_groups=[
- FieldGroup.of("totals",
- FieldSpec(field_name="total", field_type=FieldType.NUMBER, required=True)),
+ FieldGroup(name="totals", fields=[
+ Field(name="total", type=FieldType.NUMBER, required=True),
+ ]),
],
)
req = ExtractionRequest(
- documents=[DocumentInput.from_path("invoice.pdf")],
- docs=[invoice],
+ files=[FileInput.from_path("invoice.pdf")],
+ document_types=[invoice],
)
```
---
-## 3. `DocumentInput` — input files
+## 3. `FileInput` — input files
-One entry per input file. Each file is processed independently by the pipeline.
+One entry per input file. Each file is processed independently by the pipeline. Replaces v0 `DocumentInput`.
| Field | Type | Default | Required | Notes |
|-------------------|-----------------|---------|----------|-----------------------------------------------------------------|
| `filename` | `str` | — | **yes**, non-empty | Surfaced on the response so you know which file produced what. |
-| `content_base64` | `str` | — | **yes** | Base64 of the raw bytes. `data:;base64,...` URLs are accepted; the SDK strips the prefix client-side. |
+| `content_base64` | `str \| None` | `None` | yes for JSON | Base64 of the raw bytes. `data:;base64,...` URLs are accepted; the SDK strips the prefix client-side. Omit when posting multipart. |
| `content_type` | `str \| None` | `None` | no | MIME hint. Omit to let the service sniff magic bytes. |
-| `document_type` | `str \| None` | `None` | no | When set, pins this file to one of the declared `DocSpec.doc_type.documentType` values; the classifier is skipped for this file. |
+| `expected_type` | `str \| None` | `None` | no | When set, pins this file to one of the declared `DocumentTypeSpec.id` values; the classifier is skipped for this file. (Replaces v0 `document_type`.) |
### Three ways to build one
```python
# 1. From bytes you already have in memory
-doc = DocumentInput.from_bytes(b"%PDF-1.4...", filename="invoice.pdf",
- content_type="application/pdf")
+f = FileInput.from_bytes(b"%PDF-1.4...", filename="invoice.pdf",
+ content_type="application/pdf")
# 2. From a path on disk
-doc = DocumentInput.from_path("invoice.pdf")
-doc = DocumentInput.from_path("invoice.pdf", document_type="invoice") # caller-pinned
+f = FileInput.from_path("invoice.pdf")
+f = FileInput.from_path("invoice.pdf", expected_type="invoice") # caller-pinned
# 3. Hand-build (e.g. when you already have the base64)
-doc = DocumentInput(
+f = FileInput(
filename="invoice.pdf",
content_base64="JVBERi0xLjQK...",
content_type="application/pdf",
)
```
-### Accepted formats
+### Multipart upload
-flydocs runs binary normalisation upstream of the extractor, so any of these reach the LLM cleanly:
+For very large files (or when you'd rather not encode to base64 first), pass `files=[...]` to the client method:
-| Family | Examples | Native to provider? |
-|------------------|-------------------------------------------------------|---------------------|
-| PDF | `application/pdf` | yes (pass-through) |
-| Raster image | PNG, JPEG, WebP, GIF | yes (pass-through) |
-| Other image | HEIC/HEIF, AVIF, multi-frame TIFF, SVG, BMP | no — converted via Pillow / pillow-heif / cairosvg |
-| Office docs | DOCX, XLSX, PPTX, RTF, ODT, HTML | no — converted via the configured `OfficeConverter` (default Gotenberg HTTP sidecar) |
-| Archive / email | ZIP, 7z, TAR, GZIP, EML, MSG | no — fanned out into multiple internal rows by the normalizer |
+```python
+with open("big.pdf", "rb") as buf:
+ result = client.extract(req, files=[buf])
+```
-Encrypted or corrupt PDFs raise a typed `FlydocsHTTPError(422, code="invalid_request")` with the underlying reason in `detail`.
+The SDK posts a multipart body with the binaries riding as `files` parts and the JSON envelope (minus `files`) under a `request` part. The `FileInput` entries you put in the JSON still carry `filename` / `content_type` / `expected_type` for the matching parts.
### Sizing
-The service enforces `FLYDOCS_MAX_BYTES` per file. Going over yields `FlydocsHTTPError(413, code="document_too_large")`. Defaults vary by deployment — call `flydocs.version()` for instance identity, or split the file before submitting.
+The service enforces `FLYDOCS_MAX_BYTES` per file. Going over yields `FlydocsHttpError(413, code="file_too_large")`.
---
-## 4. `DocSpec` — what to extract
+## 4. `DocumentTypeSpec` — what to extract
+
+One `DocumentTypeSpec` per **expected document type**. When you submit multiple files, the classifier matches each file to one of the declared types unless the caller pins `FileInput.expected_type`.
-One `DocSpec` per **expected document type**. When you submit multiple files, the classifier matches each file to one of the declared specs unless the caller pins `DocumentInput.document_type`.
+Replaces v0 `DocSpec` and the nested `DocType` envelope — the v0 `docs[i].docType.documentType` is now `document_types[i].id`.
```python
from flydocs_sdk import (
- DocSpec, DocType, FieldGroup, FieldSpec, FieldType,
- ValidatorsSpec, VisualValidatorSpec,
+ DocumentTypeSpec, Field, FieldGroup, FieldType, VisualCheck,
)
-invoice = DocSpec(
- doc_type=DocType(
- document_type="invoice",
- description="Vendor invoice (paper or PDF)",
- country="ES",
- ),
+invoice = DocumentTypeSpec(
+ id="invoice",
+ description="Vendor invoice (paper or PDF)",
+ country="ES",
field_groups=[ ... ],
- validators=ValidatorsSpec(visual=[
- VisualValidatorSpec(name="signature_present",
- description="A handwritten or e-signature is visible"),
- ]),
+ visual_checks=[
+ VisualCheck(name="signature_present",
+ description="A handwritten or e-signature is visible"),
+ ],
)
```
-### `DocType`
+### Fields
-| Field | Type | Default | Required | Notes |
-|-------------------|---------|---------|----------|--------------------------------------------------------------------|
-| `document_type` | `str` | — | **yes**, non-empty | Stable id. Used by `RuleParent.document_type`, by `DocumentInput.document_type`, and surfaced verbatim on the response under `ExtractedDocument.document_type`. Snake_case lower-kebab works well: `invoice`, `purchase_order`, `id_card_es`, `passport_int`. |
-| `description` | `str` | `""` | no | Hints the classifier when multi-doc requests need disambiguation. |
-| `country` | `str` | `""` | no | ISO 3166-1 alpha-2. Hint for region-aware validators / formats. |
+| Field | Type | Default | Required | Notes |
+|------------------|----------------------------|---------------|----------|--------------------------------------------------------------------|
+| `id` | `str` | — | **yes**, non-empty | Stable identifier. Referenced by `RuleFieldParent.document_type`, `FileInput.expected_type`, and surfaced on the response as `Document.type`. |
+| `description` | `str \| None` | `None` | no | Hints the classifier when multi-doc requests need disambiguation. |
+| `country` | `str \| None` | `None` | no | ISO 3166-1 alpha-2. Hint for region-aware validators / formats. |
+| `field_groups` | `list[FieldGroup]` | — | **yes**, min 1 | One or more named groups of fields the extractor should produce. |
+| `visual_checks` | `list[VisualCheck]` | `[]` | no | Visual checks the service should run when `visual_authenticity` is on. (Replaces v0 `ValidatorsSpec.visual`.) |
### `FieldGroup`
-A named bundle of `FieldSpec`s that the service should extract together. Groups are how you partition the schema visually and logically — `header`, `totals`, `line_items_block`, …
+A named bundle of `Field`s that the service should extract together.
-| Field | Type | Default | Required | Notes |
-|----------------------|----------------------|---------|----------|---------------------------------------------------------|
-| `field_group_name` | `str` | — | **yes**, non-empty | JSON alias `fieldGroupName`. Use snake_case. |
-| `field_group_desc` | `str` | `""` | no | Free-form description shown to the LLM. |
-| `field_group_fields` | `list[FieldSpec]` | — | **yes**, min length 1 | JSON alias `fieldGroupFields`. |
+| Field | Type | Default | Required | Notes |
+|--------------|------------------|---------|----------|---------------------------------------------------------|
+| `name` | `str` | — | **yes**, non-empty | Group identifier (snake_case). Surfaced as `ExtractedFieldGroup.name`. |
+| `description`| `str \| None` | `None` | no | Free-form description shown to the LLM. |
+| `fields` | `list[Field]` | — | **yes**, min 1 | The fields the group carries. |
```python
-totals = FieldGroup.of(
- "totals", # name
- FieldSpec(field_name="subtotal", field_type=FieldType.NUMBER, required=True),
- FieldSpec(field_name="tax_amount", field_type=FieldType.NUMBER, required=True),
- FieldSpec(field_name="total_amount", field_type=FieldType.NUMBER, required=True),
- FieldSpec(field_name="currency", field_type=FieldType.STRING, required=True),
+totals = FieldGroup(
+ name="totals",
+ fields=[
+ Field(name="subtotal", type=FieldType.NUMBER, required=True),
+ Field(name="tax_amount", type=FieldType.NUMBER, required=True),
+ Field(name="total_amount", type=FieldType.NUMBER, required=True),
+ Field(name="currency", type=FieldType.STRING, required=True),
+ ],
description="Top-of-invoice money block",
)
```
-`FieldGroup.of(name, *fields, description="")` is the recommended factory — it folds the variadic fields into the list. Use the explicit constructor when you need to programmatically build the list.
-
-### `ValidatorsSpec` + `VisualValidatorSpec`
+### `VisualCheck`
-Per-`DocSpec` validator definitions. Currently only `visual` is exposed publicly; future additions (`audio`, `structural`) plug in here.
-
-| Field | Type | Notes |
-|--------------------------|-------------------------------|------------------------------------------------------|
-| `ValidatorsSpec.visual` | `list[VisualValidatorSpec]` | One entry per visual check the LLM should run. |
-| `VisualValidatorSpec.name` | `str` | Short identifier the response carries back. |
-| `VisualValidatorSpec.description` | `str` | What the LLM should look for (`"a handwritten or e-signature is visible"`). |
+| Field | Type | Notes |
+|---------------|---------|-------------------------------------------------------------|
+| `name` | `str` | Short identifier the response carries back. |
+| `description` | `str` | What the LLM should look for. |
`visual_authenticity` must be enabled on `StageToggles` for these to fire.
---
-## 5. `FieldSpec` & `FieldItem` — field-level shape and constraints
-
-### `FieldSpec`
-
-The unit of "one thing the caller wants extracted".
+## 5. `Field` & `FieldGroup` — recursive field schema
-| Field | Type | Default | Notes |
-|------------------------|-------------------------------|-------------|--------------------------------------------------------------------|
-| `field_name` | `str` (alias `name`) | —, **required** | The key under which the extracted value appears in the response. Snake_case lower-case is conventional. |
-| `field_description` | `str` (alias `description`) | `""` | Free-form hint for the LLM. The more specific, the better the recall on lookalikes. |
-| `field_type` | `FieldType` (alias `type`) | `STRING` | See enum below. |
-| `required` | `bool` | `False` | When `True`, a missing field surfaces as a `field_validation` error. |
-| `pattern` | `str \| None` | `None` | RFC-flavour regex applied by the field validator. |
-| `format` | `StandardFormat \| None` | `None` | One of `DATE` / `DATE_TIME` / `EMAIL` / `URI` / `UUID`. |
-| `enum` | `list \| None` | `None` | Closed set of acceptable values. |
-| `minimum` | `float \| None` | `None` | Numeric lower bound (inclusive). |
-| `maximum` | `float \| None` | `None` | Numeric upper bound (inclusive). |
-| `items` | `list[FieldItem] \| None` | `None` | **Only valid when `field_type == FieldType.ARRAY`**; describes the columns of each repeating row. |
-| `standard_validators` | `list[StandardValidatorSpec]` | `[]` | See §6. |
+v1 collapses v0's `FieldSpec` + `FieldItem` into a **single recursive `Field`** type. Primitives, arrays, and objects all use the same model.
-### `FieldType` — the five primitives
+### `Field`
-| Member | Wire form | Use for |
-|--------------------------|------------|------------------------------------------------------------------|
-| `FieldType.STRING` | `"string"` | Any free-form text, identifier, format-validated string. |
-| `FieldType.NUMBER` | `"number"` | Floats / decimals. Pair with `minimum` / `maximum` / `format=AMOUNT`. |
-| `FieldType.INTEGER` | `"integer"`| Integral quantities (counts, page numbers, quantities). |
-| `FieldType.BOOLEAN` | `"boolean"`| Yes/no / present/absent / signed/unsigned. |
-| `FieldType.ARRAY` | `"array"` | Repeating rows. **Requires** a non-empty `items` list. |
+| Field | Type | Default | Notes |
+|---------------|-------------------------------|-------------|--------------------------------------------------------------------|
+| `name` | `str` | —, required | The key under which the extracted value appears in the response. |
+| `description` | `str \| None` | `None` | Free-form hint for the LLM. |
+| `type` | `FieldType` | `STRING` | One of `STRING` / `NUMBER` / `INTEGER` / `BOOLEAN` / `ARRAY` / `OBJECT`. |
+| `required` | `bool` | `False` | When `True`, a missing field surfaces as a `field_validation` error. |
+| `pattern` | `str \| None` | `None` | RFC-flavour regex applied by the field validator. |
+| `format` | `StandardFormat \| None` | `None` | One of `DATE`, `DATE_TIME`, `TIME`, `EMAIL`, `URI`, `UUID`, `CURRENCY`. |
+| `enum` | `list \| None` | `None` | Closed set of acceptable values. |
+| `minimum` | `float \| None` | `None` | Numeric lower bound (inclusive). |
+| `maximum` | `float \| None` | `None` | Numeric upper bound (inclusive). |
+| `items` | `Field \| None` | `None` | **Only valid when `type == ARRAY`**; describes a single row shape. |
+| `fields` | `list[Field] \| None` | `None` | **Only valid when `type == OBJECT`**; describes the object's members. |
+| `validators` | `list[ValidatorSpec]` | `[]` | See §6. |
-### `StandardFormat` — JSON-Schema-style format hints
+### `FieldType`
-| Member | Wire form | Validation |
-|--------------------------|--------------|-----------------------------|
-| `StandardFormat.DATE` | `"date"` | `YYYY-MM-DD` |
-| `StandardFormat.DATE_TIME` | `"date-time"` | RFC 3339 / ISO 8601 with time |
-| `StandardFormat.EMAIL` | `"email"` | RFC 5322 |
-| `StandardFormat.URI` | `"uri"` | Generic URI |
-| `StandardFormat.UUID` | `"uuid"` | RFC 4122 |
+| Member | Wire form | Use for |
+|----------------------|------------|------------------------------------------------------------------|
+| `FieldType.STRING` | `"string"` | Any free-form text, identifier, format-validated string. |
+| `FieldType.NUMBER` | `"number"` | Floats / decimals. |
+| `FieldType.INTEGER` | `"integer"`| Integral quantities. |
+| `FieldType.BOOLEAN` | `"boolean"`| Yes/no / present/absent. |
+| `FieldType.ARRAY` | `"array"` | Repeating rows. **Requires** an `items` Field describing the row. |
+| `FieldType.OBJECT` | `"object"` | Nested object. **Requires** a non-empty `fields` list. |
-> **`format` vs `standard_validators`.** `format` is a single-shot JSON-Schema-style check baked into `FieldSpec`; `standard_validators` is the extensible catalogue (IBAN, NIE, VAT_ID, …). For format checks that have an equivalent validator, prefer `format` (cheaper, doesn't show up as a validator hit). For domain checks, use validators.
-
-### `FieldItem` — sub-fields inside an array
-
-`field_type == FieldType.ARRAY` makes a field a repeating row; `items` declares the columns:
-
-| Field | Type | Notes |
-|----------------------|-------------------------------|--------------------------------------------------------------------|
-| `field_name` | `str` (alias `fieldName`) | Column name (camelCase on the wire). |
-| `field_description` | `str` (alias `fieldDescription`) | Free-form hint per column. |
-| `field_type` | `FieldType` (alias `fieldType`) | One of the primitives. `FieldItem` does NOT support nested arrays — flatten or split into two field groups. |
-| `pattern`, `format`, `enum`, `minimum`, `maximum`, `standard_validators` | (same as `FieldSpec`) | All the field-level constraints apply per row. |
+### Arrays + objects
```python
-line_items = FieldSpec(
- field_name="line_items",
- field_type=FieldType.ARRAY,
- field_description="One row per line item on the invoice",
- items=[
- FieldItem(field_name="description", field_type=FieldType.STRING),
- FieldItem(field_name="quantity", field_type=FieldType.NUMBER, minimum=0),
- FieldItem(field_name="unit_price", field_type=FieldType.NUMBER, minimum=0),
- FieldItem(field_name="line_total", field_type=FieldType.NUMBER, minimum=0),
- ],
+line_items = Field(
+ name="line_items",
+ type=FieldType.ARRAY,
+ items=Field(
+ name="row",
+ type=FieldType.OBJECT,
+ fields=[
+ Field(name="description", type=FieldType.STRING),
+ Field(name="quantity", type=FieldType.NUMBER, minimum=0),
+ Field(name="unit_price", type=FieldType.NUMBER, minimum=0),
+ Field(name="line_total", type=FieldType.NUMBER, minimum=0),
+ ],
+ ),
)
```
+### `StandardFormat`
+
+| Member | Wire form | Validation |
+|---------------------------------|--------------|-----------------------------|
+| `StandardFormat.DATE` | `"date"` | `YYYY-MM-DD` |
+| `StandardFormat.DATE_TIME` | `"date-time"`| RFC 3339 / ISO 8601 with time |
+| `StandardFormat.TIME` | `"time"` | `HH:MM:SS` |
+| `StandardFormat.EMAIL` | `"email"` | RFC 5322 |
+| `StandardFormat.URI` | `"uri"` | Generic URI |
+| `StandardFormat.UUID` | `"uuid"` | RFC 4122 |
+| `StandardFormat.CURRENCY` | `"currency"` | ISO 4217 currency code |
+
### Variant cheat sheet
| Goal | Recipe |
|-------------------------------------|---------------------------------------------------------------------------------|
-| Required scalar | `FieldSpec(field_name="x", field_type=FieldType.STRING, required=True)` |
-| Optional with range | `FieldSpec(field_name="age", field_type=FieldType.INTEGER, minimum=0, maximum=120)` |
-| Closed enum | `FieldSpec(field_name="status", field_type=FieldType.STRING, enum=["active", "inactive"])` |
-| Date | `FieldSpec(field_name="dob", field_type=FieldType.STRING, format=StandardFormat.DATE)` |
-| Regex | `FieldSpec(field_name="ref", pattern=r"^[A-Z]{2}-\d{6}$")` |
-| IBAN | `FieldSpec(field_name="iban", standard_validators=[StandardValidatorSpec(type=StandardValidatorType.IBAN)])` |
-| Repeating rows | `FieldSpec(field_name="rows", field_type=FieldType.ARRAY, items=[FieldItem(...)])` |
-| Soft-warning validator | `StandardValidatorSpec(type=..., severity="warning")` — recorded, but `valid` stays `True`. |
+| Required scalar | `Field(name="x", type=FieldType.STRING, required=True)` |
+| Optional with range | `Field(name="age", type=FieldType.INTEGER, minimum=0, maximum=120)` |
+| Closed enum | `Field(name="status", type=FieldType.STRING, enum=["active", "inactive"])` |
+| Date | `Field(name="dob", type=FieldType.STRING, format=StandardFormat.DATE)` |
+| Regex | `Field(name="ref", pattern=r"^[A-Z]{2}-\d{6}$")` |
+| IBAN | `Field(name="iban", validators=[ValidatorSpec(name=ValidatorType.IBAN)])` |
+| Repeating rows | `Field(name="rows", type=FieldType.ARRAY, items=Field(...))` |
+| Object value | `Field(name="address", type=FieldType.OBJECT, fields=[Field(...)])` |
+| Soft-warning validator | `ValidatorSpec(name=..., severity="warning")` |
---
-## 6. `StandardValidatorSpec` — built-in validators (full catalogue)
+## 6. `ValidatorSpec` — built-in validators (full catalogue)
-Attach validators to a `FieldSpec` (or a `FieldItem` for array columns). The field validator stage runs them after extraction and folds the result into `ExtractedField.field_validation`.
+Attach validators to a `Field`. The field validator stage runs them after extraction and folds the result into `ExtractedField.validation`. Replaces v0 `StandardValidatorSpec`; dispatch key is `name` (not `type`).
```python
-from flydocs_sdk import StandardValidatorSpec, StandardValidatorType
+from flydocs_sdk import ValidatorSpec, ValidatorType
-iban_field = FieldSpec(
- field_name="iban",
- field_type=FieldType.STRING,
+iban_field = Field(
+ name="iban",
+ type=FieldType.STRING,
required=True,
- standard_validators=[StandardValidatorSpec(type=StandardValidatorType.IBAN)],
+ validators=[ValidatorSpec(name=ValidatorType.IBAN)],
)
-vat_es = FieldSpec(
- field_name="vat_id",
- standard_validators=[
- StandardValidatorSpec(type=StandardValidatorType.VAT_ID,
- params={"country": "ES"}),
+vat_es = Field(
+ name="vat_id",
+ validators=[
+ ValidatorSpec(name=ValidatorType.VAT_ID, params={"country": "ES"}),
],
)
-soft_warning = StandardValidatorSpec(
- type=StandardValidatorType.PHONE_E164,
+soft_warning = ValidatorSpec(
+ name=ValidatorType.PHONE_E164,
params={"country": "ES"},
severity="warning", # records the error but doesn't set valid=False
)
@@ -331,7 +321,7 @@ soft_warning = StandardValidatorSpec(
| Field | Type | Default | Notes |
|------------|-----------------------------------------------|-------------|----------------------------------------------------|
-| `type` | `StandardValidatorType` | —, required | Use the enum; raw strings work too for forward compat. |
+| `name` | `ValidatorType` | —, required | Use the enum; raw strings work too for forward compat. |
| `params` | `dict[str, Any]` | `{}` | Per-validator parameters (e.g. `{"country": "ES"}`). |
| `severity` | `Literal["error", "warning"]` | `"error"` | `"warning"` records the issue but keeps `valid=True`. |
@@ -353,7 +343,7 @@ soft_warning = StandardValidatorSpec(
| **Identifiers** | `UUID` | `uuid` | none |
| | `JSON` | `json` | none |
| | `HEX_COLOR` | `hex_color` | none |
-| **Finance** | `IBAN` | `iban` | none (country derived from prefix) |
+| **Finance** | `IBAN` | `iban` | none |
| | `BIC` | `bic` | none |
| | `CREDIT_CARD` | `credit_card` | none (Luhn-checked) |
| | `CURRENCY_CODE` | `currency_code` | none (ISO 4217) |
@@ -364,21 +354,21 @@ soft_warning = StandardValidatorSpec(
| | `POSTAL_CODE` | `postal_code` | `{"country": "ES"}` (optional) |
| | `LATITUDE` | `latitude` | none |
| | `LONGITUDE` | `longitude` | none |
-| **National IDs** | `NIF` | `nif` | `{"country": "ES"}` implied |
+| **National IDs** | `NIF` | `nif` | none |
| | `NIE` | `nie` | ES — foreign person tax id |
| | `CIF` | `cif` | ES — legacy company tax id |
| | `VAT_ID` | `vat_id` | `{"country": "ES"}` (EU VAT) |
| | `SSN` | `ssn` | US |
| | `PASSPORT_NUMBER` | `passport_number`| ICAO 9303 (length / charset only) |
-> **Soft vs hard.** Use `severity="warning"` for "extra signal" checks where you want the issue logged but still want the row to be `valid=True` (e.g. a non-canonical date format). Use the default `"error"` for "this is a contract violation" checks (e.g. malformed IBAN).
-
---
## 7. `ExtractionOptions` & `StageToggles` — pipeline configuration
```python
-from flydocs_sdk import ExtractionOptions, StageToggles
+from flydocs_sdk import (
+ EscalationConfig, ExtractionOptions, StageToggles,
+)
options = ExtractionOptions(
return_bboxes=True,
@@ -392,47 +382,50 @@ options = ExtractionOptions(
bbox_refine=True,
rule_engine=True,
),
- escalation_threshold=0.25,
- escalation_model="anthropic:claude-opus-4-7",
+ escalation=EscalationConfig(threshold=0.25, model="anthropic:claude-opus-4-7"),
transformations=[],
)
```
### `ExtractionOptions`
-| Field | Type | Default | Notes |
-|--------------------------|----------------------------|------------------------|--------------------------------------------------------------------|
-| `return_bboxes` | `bool` | `True` | When `False`, the response strips bounding boxes (cheaper to ship). |
-| `language_hint` | `str \| None` | `None` | ISO 639-1 (`"en"`, `"es"`, `"zh"`, …) — guides multilingual OCR / extraction. ≤ 16 chars. |
-| `model` | `str \| None` | `None` (uses env default) | Per-request primary model id (`"anthropic:claude-sonnet-4-6"`, `"openai:gpt-4o"`, …). |
-| `declared_media_type` | `str \| None` | `None` | Override sniffing; rare. Useful when callers know better than `magic`. |
-| `stages` | `StageToggles` | `StageToggles()` | See below. |
-| `escalation_threshold` | `float \| None` | `None` (env default) | `0.0–1.0`. When `stages.judge_escalation=True`, re-runs the request with `escalation_model` once the judge's fail-rate crosses this. |
-| `escalation_model` | `str \| None` | `None` (env default) | Model id used by the escalation re-run. |
-| `transformations` | `list[dict]` | `[]` | Post-extraction transformations. See §9. |
+| Field | Type | Default | Notes |
+|------------------------|----------------------------|------------------------|--------------------------------------------------------------------|
+| `model` | `str \| None` | `None` (env default) | Per-request primary model id (`"anthropic:claude-sonnet-4-6"`, `"openai:gpt-4o"`, …). |
+| `language_hint` | `str \| None` | `None` | ISO 639-1; guides multilingual OCR / extraction. ≤ 16 chars. |
+| `return_bboxes` | `bool` | `True` | When `False`, the response strips bounding boxes (cheaper to ship). |
+| `declared_media_type` | `str \| None` | `None` | Override sniffing. Rare; useful when the caller knows better than `magic`. |
+| `stages` | `StageToggles` | `StageToggles()` | See below. |
+| `escalation` | `EscalationConfig \| None` | `None` | Replaces v0 `escalation_threshold` + `escalation_model` (now nested). |
+| `transformations` | `list[Transformation \| dict]` | `[]` | Post-extraction transformations. See §9. |
+
+### `EscalationConfig`
+
+| Field | Type | Default | Notes |
+|--------------|-----------|---------|----------------------------------------------------------------------|
+| `threshold` | `float` | — | `0.0–1.0`. The judge fail-rate trigger for the escalation re-run. |
+| `model` | `str` | — | Model id used by the escalation re-run. |
### `StageToggles` — all ten stages
| Stage | Default | What it does |
|------------------------|---------|---------------------------------------------------------------------------------------------------------------|
-| `splitter` | `False` | LLM document splitter. Required when one upload mixes several document types and you need page ranges per type. |
-| `classifier` | **`True`** | LLM classifier that maps each input file to one of the declared `DocSpec.doc_type.documentType` values. No-op when every file already carries `document_type`. |
-| `field_validation` | **`True`** | Pure-Python validation pass — runs `pattern`, `format`, `enum`, `min`/`max`, every `StandardValidatorSpec`. |
-| `visual_authenticity` | `False` | LLM visual check using the `ValidatorsSpec.visual` declarations (signature, watermark, …). |
-| `content_authenticity` | `False` | LLM cross-document content checks (consistency across pages / files). |
+| `splitter` | `False` | LLM document splitter. Required when one upload mixes several document types. |
+| `classifier` | **`True`** | LLM classifier mapping each input file to one of the declared document types. No-op when every file carries `expected_type`. |
+| `field_validation` | **`True`** | Pure-Python validation pass. |
+| `visual_authenticity` | `False` | LLM visual check using the `DocumentTypeSpec.visual_checks` declarations. |
+| `content_authenticity` | `False` | LLM cross-document content checks. |
| `judge` | `False` | Per-field LLM re-evaluation. Annotates every extracted field with `confidence`, `evidence`, `flag_for_review`. |
-| `judge_escalation` | `False` | When the judge's fail-rate exceeds `escalation_threshold`, re-runs extract + judge with `escalation_model`; the lower-fail-rate run wins. Requires `judge`. |
-| `bbox_refine` | `False` | Replaces LLM-estimated bboxes with grounded coordinates from the document's real text layer (PyMuPDF for born-digital PDFs, OCR for rasters). Multilingual-aware. |
-| `rule_engine` | `False` | Evaluates the business-rule DAG against extracted fields + validator outcomes. See §8. |
+| `judge_escalation` | `False` | When the judge's fail-rate exceeds `escalation.threshold`, re-runs extract + judge with `escalation.model`. |
+| `bbox_refine` | `False` | Replaces LLM-estimated bboxes with grounded coordinates from the document's text layer (PyMuPDF) or OCR. |
+| `rule_engine` | `False` | Evaluates the business-rule DAG. See §8. |
| `transform` | `False` | Runs the `transformations` list. See §9. |
-> **Cost & latency.** `extract` is mandatory. `classifier` and `field_validation` are cheap (cheap LLM call + pure Python). `judge` doubles your LLM spend per field. `judge_escalation` adds a third pass when triggered. `bbox_refine` adds ~50–200 ms per 30-page PDF (text-layer) or seconds-per-page for image-only PDFs (OCR). `visual_authenticity`, `content_authenticity` each add one LLM call.
-
---
## 8. `RuleSpec` — business rules over extracted fields
-Rules are **natural-language predicates** the LLM evaluates against extracted fields, validator outcomes, or other rules' outputs. They form a DAG; the engine sorts topologically and runs in dependency order. Cycles are rejected at request-validation time.
+Rules are **natural-language predicates** the LLM evaluates against extracted fields, validator outcomes, or other rules' outputs. They form a DAG; the engine sorts topologically. Cycles are rejected at request-validation time.
```python
from flydocs_sdk import (
@@ -444,22 +437,22 @@ totals_consistent = RuleSpec(
predicate="subtotal + tax_amount equals total_amount within 0.01",
parents=[RuleFieldParent(
document_type="invoice",
- field_names=["subtotal", "tax_amount", "total_amount"],
+ fields=["subtotal", "tax_amount", "total_amount"],
)],
)
vat_id_valid = RuleSpec(
id="vat_id_valid",
- predicate="The supplier_vat field passes the VAT_ID validator",
- parents=[RuleValidatorParent(document_type="invoice", validator_name="vat_id")],
+ predicate="The supplier_vat field passes the vat_id validator",
+ parents=[RuleValidatorParent(document_type="invoice", validator="vat_id")],
)
acceptable = RuleSpec(
id="invoice_acceptable",
predicate="totals_consistent AND vat_id_valid",
parents=[
- RuleRuleParent(rule_id="totals_consistent"),
- RuleRuleParent(rule_id="vat_id_valid"),
+ RuleRuleParent(rule="totals_consistent"),
+ RuleRuleParent(rule="vat_id_valid"),
],
output=RuleOutputSpec(type="boolean"),
)
@@ -469,25 +462,20 @@ acceptable = RuleSpec(
| Field | Type | Default | Notes |
|--------------|--------------------------------------------|-----------------------|--------------------------------------------------------------------|
-| `id` | `str` | —, required | Unique within the request. Referenced by `RuleRuleParent.rule_id`. |
+| `id` | `str` | —, required | Unique within the request. Referenced by `RuleRuleParent.rule`. |
| `predicate` | `str` | —, required | Natural-language statement evaluated by the LLM. |
| `parents` | `list[RuleParent]` | `[]` | Discriminated union — see below. |
-| `output` | `RuleOutputSpec` | `RuleOutputSpec()` (`type="boolean"`) | Shape the response should carry. |
-
-### `RuleParent` — three variants
+| `output` | `RuleOutputSpec` | `RuleOutputSpec()` | Shape the response should carry. |
-| Variant | Discriminator | Fields | Use for |
-|--------------------------|-----------------|-----------------------------------------------------|--------------------------------------------------|
-| `RuleFieldParent` | `"field"` | `document_type` (str), `field_names` (list[str], min 1) | "This rule operates on these fields of this document type." |
-| `RuleValidatorParent` | `"validator"` | `document_type` (str), `validator_name` (str) | "This rule operates on the outcome of this validator." |
-| `RuleRuleParent` | `"rule"` | `rule_id` (str) | "This rule depends on another rule's output." |
+### `RuleParent` — three variants (discriminator `kind`)
-### `RuleOutputSpec`
+| Variant | `kind` | Fields | Use for |
+|--------------------------|------------------|-----------------------------------------------------|--------------------------------------------------|
+| `RuleFieldParent` | `"field"` | `document_type` (str), `fields` (list[str], min 1) | "This rule operates on these fields of this document type." |
+| `RuleValidatorParent` | `"validator"` | `document_type` (str), `validator` (str) | "This rule operates on the outcome of this validator." |
+| `RuleRuleParent` | `"rule"` | `rule` (str) | "This rule depends on another rule's output." |
-| Field | Type | Default | Notes |
-|-------------------|----------------------------|---------------|--------------------------------------------------------------------------------------|
-| `type` | `str` | `"boolean"` | Other supported types: `"string"`, `"number"`. The rule engine coerces accordingly. |
-| `valid_outputs` | `list[str] \| None` | `None` | Closed set of acceptable string outputs. Anything else is treated as `flag_for_review`. |
+The v0 keys (`parentType`, `fieldNames`, `validatorName`, `ruleId`) are gone in v1.
### Response shape
@@ -495,10 +483,10 @@ The response carries `result.rule_results: list[RuleResult]` with one entry per
```python
for rr in result.rule_results:
- print(rr["rule_id"], rr["output"], rr.get("summary"), rr.get("human_revision"))
+ print(rr.rule_id, rr.output, rr.summary, rr.human_revision)
```
-`output` is the resolved value (string form: `"true"` / `"false"` / your custom strings). `human_revision` carries instructions for a human reviewer when the rule's output didn't fit `valid_outputs`.
+`summary` and `human_revision` are both `str | None` in v1.
---
@@ -506,38 +494,34 @@ for rr in result.rule_results:
Two transformation types ship in-tree. Both are passed through `ExtractionOptions.transformations`; the `transform` stage must be enabled in `StageToggles`.
-### `entity_resolution` — declarative, fast, free
+### `EntityResolutionTransformation` — declarative, fast, free
-Deduplicates rows of an array field group using accent-fold + token-subset matching. Typical use: collapse `"Andrés Contreras"` and `"Andres Contreras Guillen"` into a single row across N documents.
+Deduplicates rows of an array field group using accent-fold + token-subset matching.
```python
from flydocs_sdk import (
- ExtractionOptions, StageToggles, TransformationScope,
- entity_resolution,
+ EntityResolutionTransformation, ExtractionOptions, StageToggles, TransformationScope,
)
opts = ExtractionOptions(
stages=StageToggles(transform=True),
transformations=[
- entity_resolution(
- target_group="personas", # array field group to dedupe
- match_by=["dni", "nombre"], # priority: DNI first, then name
- min_shared_tokens=2, # default; lower = more aggressive merging
- scope=TransformationScope.REQUEST, # dedupe ACROSS documents
- # output_group="personas_canonical", # keep both views (omit to replace)
+ EntityResolutionTransformation(
+ target_group="personas",
+ match_by=["dni", "nombre"],
+ min_shared_tokens=2,
+ scope=TransformationScope.REQUEST,
),
],
)
```
-### `llm_transformation` — free-form
-
-A focused LLM call against a target group, driven by a one-sentence `intention`:
+### `LlmTransformation` — free-form
```python
-from flydocs_sdk import llm_transformation
+from flydocs_sdk import LlmTransformation, TransformationScope
-llm_transformation(
+LlmTransformation(
target_group="cargos",
intention=(
"Normaliza cada cargo a una taxonomía cerrada: "
@@ -551,155 +535,173 @@ llm_transformation(
| Field | Type | Default | Notes |
|------------------|----------------------------|--------------------------|--------------------------------------------------------------------|
-| `target_group` | `str` | —, required | Must match a `FieldGroup.field_group_name` the extractor produces. |
-| `output_group` | `str \| None` | `None` | When set, the transformation output is appended as a NEW group; the original stays. When `None`, replaces in place. |
-| `scope` | `TransformationScope` | `TASK` | `TASK`: one pass per document. `REQUEST`: concatenates across documents, runs once, emits under `result.request_transformations`. |
+| `target_group` | `str` | —, required | Must match a `FieldGroup.name` the extractor produces. |
+| `output_group` | `str \| None` | `None` | When set, the transformation output is appended as a NEW group; the original stays. When `None`, the output replaces the target group in place. |
+| `scope` | `TransformationScope` | `TASK` | `TASK`: one pass per document. `REQUEST`: across documents. |
| `id` | `str` | random UUIDv4 | Used in logs and the trace. |
-### `entity_resolution`-only
+### `EntityResolutionTransformation`-only
| Field | Type | Default | Notes |
|---------------------|-------------|---------|-------------------------------------------------------------|
-| `match_by` | `list[str]` | required, min length 1 | Priority-ordered field names. First non-empty wins as the matching key. |
+| `match_by` | `list[str]` | required, min length 1 | Priority-ordered field names. |
| `min_shared_tokens` | `int` | `2` | Minimum shared name tokens for a name-variant match. |
-### `llm_transformation`-only
+### `LlmTransformation`-only
| Field | Type | Default | Notes |
|--------------|----------------|---------|------------------------------------------------------|
| `intention` | `str` | required, min length 10 | One-sentence goal in any language. |
-| `prompt_id` | `str \| None` | `None` | Named template id from the server-side catalog. Omit to use the default transform prompt with `intention` interpolated. |
+| `prompt_id` | `str \| None` | `None` | Named template id from the server-side catalog. |
---
-## 10. Async jobs — `SubmitJobRequest`, callbacks, idempotency
+## 10. Async extractions — `SubmitExtractionRequest`, callbacks, idempotency
-For long documents, batches, or fire-and-forget workloads, use `submit_job` and either poll with `wait_for_completion` or receive a webhook.
+For long documents, batches, or fire-and-forget workloads, use `extractions.create` and either poll with `wait_for_completion` or receive a webhook. This API replaces v0 `submit_job` / `/api/v1/jobs`.
```python
from flydocs_sdk import (
- AsyncFlydocsClient, DocumentInput, JobStatus, SubmitJobRequest,
+ AsyncClient, ExtractionStatus, FileInput, SubmitExtractionRequest,
)
-async with AsyncFlydocsClient("http://localhost:8400") as flydocs:
- submit = await flydocs.submit_job(
- SubmitJobRequest(
- documents=[DocumentInput.from_path("big-batch.pdf")],
- docs=[invoice],
+async with AsyncClient("http://localhost:8400") as flydocs:
+ ext = await flydocs.extractions.create(
+ SubmitExtractionRequest(
+ files=[FileInput.from_path("big-batch.pdf")],
+ document_types=[invoice],
callback_url="https://your-app.example.com/flydocs/webhook",
metadata={"caller": "ingest-pipeline", "batch_id": "b-42"},
),
- idempotency_key="ingest-pipeline:b-42", # safe to retry
+ idempotency_key="ingest-pipeline:b-42",
correlation_id="req-12345",
)
- print(f"queued {submit.job_id} ({submit.status})")
+ print(f"queued {ext.id} ({ext.status.value})")
final = await flydocs.wait_for_completion(
- submit.job_id,
- poll_interval=2.0,
- timeout=900.0,
+ ext.id, poll_interval=2.0, timeout=900.0,
)
- if final.status == JobStatus.SUCCEEDED:
- result = (await flydocs.get_job_result(submit.job_id)).result
+ if final.status == ExtractionStatus.SUCCEEDED:
+ envelope = await flydocs.extractions.get_result(ext.id)
+ result = envelope.result
...
```
-### `SubmitJobRequest`
+### `SubmitExtractionRequest`
A superset of `ExtractionRequest`:
| Field | Type | Default | Notes |
|------------------|---------------------------------|--------------------------|--------------------------------------------------------------------|
-| (all fields from `ExtractionRequest` minus `request_id`) | — | — | The job's `job_id` plays the role of `request_id`. |
-| `callback_url` | `str \| None` | `None` | When set, the service POSTs a `JobWebhookPayload` here on terminal status (see §11). |
-| `metadata` | `dict[str, Any]` | `{}` | Echoed back on the webhook payload — use for caller-side correlation. |
+| (all fields from `ExtractionRequest`) | — | — | The extraction's `id` plays the role of v0's `request_id`. |
+| `callback_url` | `str \| None` | `None` | When set, the service POSTs an `EventEnvelope` here on every lifecycle event (see §11). |
+| `metadata` | `dict[str, Any]` | `{}` | Echoed back on the envelope — use for caller-side correlation. |
+
+### Lifecycle states
+
+v1 simplifies the state machine to a linear `queued → running → succeeded | failed`; `cancelled` is reachable only from `queued`, because a running extraction cannot be cancelled. The intermediate `PARTIAL_SUCCEEDED` / `REFINING_BBOXES` states from v0 are gone — bbox refinement runs as additive post-processing under `Extraction.post_processing.bbox_refinement` without gating the main lifecycle.
+
+| Status | Wire form | When |
+|---------------------|------------------|------------------------------------------------------------------|
+| `ExtractionStatus.QUEUED` | `"queued"` | Persisted, waiting for the worker. |
+| `ExtractionStatus.RUNNING` | `"running"` | Worker claimed it. |
+| `ExtractionStatus.SUCCEEDED` | `"succeeded"` | Terminal: the main pipeline finished cleanly. |
+| `ExtractionStatus.FAILED` | `"failed"` | Terminal: the worker hit an unrecoverable error. |
+| `ExtractionStatus.CANCELLED` | `"cancelled"` | Terminal: caller cancelled while queued. |
+
+Post-processing has its own lifecycle (`PostProcessingStatus.PENDING/RUNNING/SUCCEEDED/FAILED`).
### Headers per call
| Header | SDK kwarg | Notes |
|-------------------|--------------------|--------------------------------------------------------------------|
-| `Idempotency-Key` | `idempotency_key=` | Send the same key to replay an existing submission instead of creating a duplicate. The service indexes by key. |
-| `X-Correlation-Id`| `correlation_id=` | Stamped on every internal log line and on the webhook payload (`correlation_id` field). |
+| `Idempotency-Key` | `idempotency_key=` | Send the same key to replay an existing submission. |
+| `X-Correlation-Id`| `correlation_id=` | Stamped on every internal log line and on the webhook envelope. |
### Polling helper
```python
final = await flydocs.wait_for_completion(
- submit.job_id,
- poll_interval=2.0, # seconds between GET /api/v1/jobs/{id}
+ ext.id,
+ poll_interval=2.0, # seconds between GET /api/v1/extractions/{id}
timeout=900.0, # raises TimeoutError after this many seconds
)
```
-Terminal statuses are `SUCCEEDED`, `PARTIAL_SUCCEEDED`, `FAILED`, `CANCELLED`. `wait_for_completion` returns the final `JobStatusResponse` in all four cases — it only raises `TimeoutError` when the deadline elapses while the worker is still in flight.
+Terminal statuses are `SUCCEEDED`, `FAILED`, `CANCELLED`. `wait_for_completion` returns the final `Extraction` in all three cases; it raises `TimeoutError` only when `timeout` elapses before a terminal status is reached.
### Listing / cancelling
```python
-listing = await flydocs.list_jobs(
- status=["SUCCEEDED", "PARTIAL_SUCCEEDED"], # CSV filter
- bbox_refine_status=["pending", "running"], # CSV filter
- idempotency_key="ingest-pipeline:b-42", # exact match
+listing = await flydocs.extractions.list(
+ status=[ExtractionStatus.SUCCEEDED, ExtractionStatus.FAILED],
+ post_processing_status=[PostProcessingStatus.PENDING, PostProcessingStatus.RUNNING],
+ idempotency_key="ingest-pipeline:b-42",
created_after=datetime(2026, 5, 1),
created_before=datetime(2026, 5, 31, 23, 59),
limit=25,
offset=0,
)
-for job in listing.items:
- print(job.job_id, job.status, job.submitted_at)
+for ext in listing.items:
+ print(ext.id, ext.status, ext.submitted_at)
-await flydocs.cancel_job("job-abc") # only valid while QUEUED
+await flydocs.extractions.cancel("ext_abc") # only valid while QUEUED
```
---
## 11. Webhooks — receiving and verifying delivery
-When `callback_url` is set, the service POSTs a `JobWebhookPayload` on terminal status. It signs the body with HMAC-SHA256 in `X-Flydocs-Signature` when `FLYDOCS_WEBHOOK_HMAC_SECRET` is configured on the service.
+When `callback_url` is set, the service POSTs an `EventEnvelope` on every lifecycle event. It signs the body with HMAC-SHA256 in `X-Flydocs-Signature` when `FLYDOCS_WEBHOOK_HMAC_SECRET` is configured on the service.
+
+### Event types (string literals)
-### Payload shape — `JobWebhookPayload`
+| Constant | String value |
+|---------------------------------------------------------|-----------------------------------------------|
+| `EVENT_TYPE_EXTRACTION_SUBMITTED` | `"extraction.submitted"` |
+| `EVENT_TYPE_EXTRACTION_COMPLETED` | `"extraction.completed"` |
+| `EVENT_TYPE_EXTRACTION_POST_PROCESSING_REQUESTED` | `"extraction.post_processing.requested"` |
+| `EVENT_TYPE_EXTRACTION_POST_PROCESSING_COMPLETED` | `"extraction.post_processing.completed"` |
+
+### `EventEnvelope` shape
| Field | Type | Notes |
|-------------------|----------------------------|--------------------------------------------------------------------------------|
| `event_id` | `str` | Unique per delivery. Dedupe on this — the publisher retries on transient errors. |
-| `event_type` | `str` | `"IDPJobCompleted"`. |
-| `version` | `str` | Semver of the payload schema (`"1.0.0"`). |
-| `job_id` | `str` | The submitted job. |
-| `status` | `JobStatus` | Terminal: `SUCCEEDED` / `PARTIAL_SUCCEEDED` / `FAILED` / `CANCELLED`. |
-| `occurred_at` / `started_at` / `finished_at` | `datetime` | Lifecycle timestamps. |
-| `attempts` | `int` | Worker attempts consumed. |
+| `event_type` | `str` | One of the four constants above. |
+| `version` | `str` | Semver of the envelope schema (`"1.0.0"`). |
+| `occurred_at` | `datetime` | When the event happened. |
| `correlation_id` | `str \| None` | The `X-Correlation-Id` you passed at submit time, if any. |
| `tenant_id` | `str \| None` | When the service runs multi-tenant. |
-| `metadata` | `dict[str, Any]` | The dict you passed in `SubmitJobRequest.metadata`. |
-| `result` | `ExtractionResult \| None` | Present on `SUCCEEDED` / `PARTIAL_SUCCEEDED`; `None` on `FAILED` / `CANCELLED`. |
-| `error_code` / `error_message` | `str \| None` | Populated when the job failed. |
+| `extraction` | `Extraction` | Current-state snapshot of the resource. |
+| `result` | `ExtractionResult \| None` | Present on `extraction.completed` when terminal status is `succeeded`; `None` otherwise. |
+| `metadata` | `dict[str, Any]` | The dict you passed in `SubmitExtractionRequest.metadata`. |
### Verifying — FastAPI example
```python
import os
from flydocs_sdk import (
- JobStatus, JobWebhookPayload,
+ EVENT_TYPE_EXTRACTION_COMPLETED, ExtractionStatus,
WebhookVerificationError, WebhookVerifier,
)
-from fastapi import FastAPI, Header, HTTPException, Request
+from fastapi import FastAPI, HTTPException, Request
verifier = WebhookVerifier(secret=os.environ["FLYDOCS_WEBHOOK_HMAC_SECRET"])
app = FastAPI()
@app.post("/flydocs/webhook")
-async def on_webhook(
- request: Request,
- x_flydocs_signature: str = Header(...),
-) -> dict:
+async def on_webhook(request: Request) -> dict:
body = await request.body() # raw bytes
+ signature = request.headers.get("X-Flydocs-Signature", "")
try:
- verifier.verify(body, x_flydocs_signature)
+ envelope = verifier.verify(body, signature) # typed EventEnvelope
except WebhookVerificationError:
raise HTTPException(status_code=403, detail="bad signature")
- payload = JobWebhookPayload.model_validate_json(body)
- if payload.status == JobStatus.SUCCEEDED and payload.result is not None:
- ... # persist, fan out downstream work
+ if envelope.event_type == EVENT_TYPE_EXTRACTION_COMPLETED:
+ ext = envelope.extraction
+ if ext.status == ExtractionStatus.SUCCEEDED and envelope.result is not None:
+ ... # persist, fan out downstream work
return {"ok": True}
```
@@ -709,29 +711,37 @@ async def on_webhook(
## 12. Errors — RFC 7807 problem-details
-Every non-2xx response decodes into a typed `FlydocsHTTPError` with `status_code`, `code`, `title`, `detail`, and the raw `payload` dict:
-
-| `code` | Status | Meaning |
-|-------------------------|--------|-----------------------------------------------------------------------------------------------------------|
-| `extraction_timeout` | 408 | Pipeline exceeded the sync ceiling (`FLYDOCS_SYNC_TIMEOUT_S`). Retry via `submit_job`. |
-| `document_too_large` | 413 | Document over `FLYDOCS_MAX_BYTES`. |
-| `invalid_base64` | 422 | `content_base64` failed strict parsing. |
-| `invalid_request` | 422 | Semantic validation found issues (rule references unknown field, duplicate ids, cycles, …). `payload` carries every issue. |
-| `job_not_ready` | 409 | `GET /jobs/{id}/result` called before the worker finished. |
-| `job_not_cancellable` | 409 | Worker already started; mid-flight cancellation isn't supported. |
-| `JOB_NOT_FOUND` | 404 | Unknown `job_id`. |
+Every non-2xx response decodes into a typed `FlydocsHttpError` with `status_code`, `code`, `title`, `detail`, `type`, `instance`, `extensions`, and the raw `payload` dict:
+
+| `code` | Status | Meaning |
+|------------------------------|--------|-----------------------------------------------------------------------------------------------------------|
+| `timeout` | 408 | Sync pipeline exceeded `FLYDOCS_SYNC_TIMEOUT_S`. Retry via `extractions.create`. |
+| `file_too_large` | 413 | File over `FLYDOCS_MAX_BYTES`. |
+| `unsupported_file` | 415 | The file's media type is unsupported. |
+| `invalid_base64` | 422 | `content_base64` failed strict parsing. |
+| `validation_failed` | 422 | Semantic validation found issues. `payload` carries every issue. |
+| `invalid_request` | 422 | Generic request-shape problem. |
+| `encrypted_pdf` | 422 | PDF carries an encryption header the service can't open. |
+| `office_conversion_failed` | 422 | Gotenberg/LibreOffice could not convert the Office document. |
+| `archive_extraction_failed` | 422 | ZIP / 7z / TAR could not be unpacked. |
+| `image_conversion_failed` | 422 | Pillow / cairosvg could not convert the image. |
+| `not_ready` | 409 | `GET /extractions/{id}/result` called before the extraction succeeded. |
+| `not_cancellable` | 409 | Worker already started; mid-flight cancellation isn't supported. |
+| `not_found` | 404 | Unknown extraction id. |
+| `unauthorized` | 401 | API key missing or invalid. |
```python
from flydocs_sdk import (
- FlydocsClientError, FlydocsHTTPError, FlydocsTimeoutError,
+ FlydocsClientError, FlydocsHttpError, FlydocsTimeoutError,
+ SubmitExtractionRequest,
)
try:
result = await flydocs.extract(req)
-except FlydocsHTTPError as exc:
- if exc.code == "extraction_timeout":
- submit = await flydocs.submit_job(SubmitJobRequest(**req.model_dump()))
- elif exc.code == "invalid_request":
+except FlydocsHttpError as exc:
+ if exc.code == "timeout":
+ await flydocs.extractions.create(SubmitExtractionRequest(**req.model_dump()))
+ elif exc.code in ("validation_failed", "invalid_request"):
for issue in exc.payload.get("errors", []):
print(issue)
raise
@@ -743,23 +753,27 @@ except FlydocsClientError:
raise # transport failure (DNS, connect, TLS, …)
```
+The error also carries the full RFC 7807 view via `exc.as_problem_details()` returning a typed `ProblemDetails`.
+
---
## 13. Production patterns
-**Reuse a client.** Construct `AsyncFlydocsClient` once per application and share it. The underlying httpx connection pool is the most expensive part to set up.
+**Reuse a client.** Construct `AsyncClient` once per application and share it. The underlying httpx connection pool is the most expensive part to set up.
-**Correlation ids.** Pass `correlation_id="..."` on `extract` / `submit_job`. The service stamps it on every internal log line and on the webhook payload.
+**API keys.** Pass `api_key="..."` to the constructor and the SDK adds the `Authorization: Bearer ...` header on every call.
-**Custom timeouts.** Default is 60 s. `AsyncFlydocsClient("http://...", timeout=120.0)`.
+**Correlation ids.** Pass `correlation_id="..."` on `extract` / `extractions.create`. The service stamps it on every internal log line and on the webhook envelope.
-**Default headers.** `AsyncFlydocsClient(..., default_headers={"X-Tenant-Id": "tenant-42"})` adds the header to every outbound request.
+**Custom timeouts.** Default is 60 s. `AsyncClient("http://...", timeout=120.0)`.
-**Bring your own httpx client.** `AsyncFlydocsClient(..., http_client=existing)` shares your app's connection pool. The SDK never closes transports it didn't create.
+**Default headers.** `AsyncClient(..., default_headers={"X-Tenant-Id": "tenant-42"})` adds the header to every outbound request.
-**Health checks.** `await flydocs.health("readiness")` returns the actuator JSON — wire it into your deploy verification.
+**Bring your own httpx client.** `AsyncClient(..., http_client=existing)` shares your app's connection pool. The SDK never closes transports it didn't create.
-**Cost tracking.** When the service has cost tracking enabled, `result.usage` carries per-agent and per-model token + USD breakdowns; webhook payloads carry the same.
+**Health checks.** `await flydocs.health("readiness")` returns the actuator JSON.
+
+**Cost tracking.** When the service has cost tracking enabled, `result.pipeline.usage` carries per-agent and per-model token + USD breakdowns; webhook envelopes carry the same.
---
@@ -770,63 +784,80 @@ A realistic invoice extraction touching every feature: typed schema with array r
```python
import asyncio
from flydocs_sdk import (
- AsyncFlydocsClient,
- DocSpec, DocType, DocumentInput,
- ExtractionOptions, ExtractionRequest,
- FieldGroup, FieldItem, FieldSpec, FieldType,
- JobStatus,
- RuleFieldParent, RuleRuleParent, RuleSpec, RuleValidatorParent,
- StageToggles, StandardFormat,
- StandardValidatorSpec, StandardValidatorType,
- SubmitJobRequest,
- TransformationScope, entity_resolution,
+ AsyncClient,
+ DocumentTypeSpec,
+ EntityResolutionTransformation,
+ EscalationConfig,
+ ExtractionOptions,
+ ExtractionStatus,
+ Field,
+ FieldGroup,
+ FieldType,
+ FileInput,
+ RuleFieldParent,
+ RuleRuleParent,
+ RuleSpec,
+ RuleValidatorParent,
+ StageToggles,
+ StandardFormat,
+ SubmitExtractionRequest,
+ TransformationScope,
+ ValidatorSpec,
+ ValidatorType,
)
-invoice = DocSpec(
- doc_type=DocType(document_type="invoice", description="Vendor invoice", country="ES"),
+invoice = DocumentTypeSpec(
+ id="invoice",
+ description="Vendor invoice",
+ country="ES",
field_groups=[
- FieldGroup.of("header",
- FieldSpec(field_name="invoice_number", field_type=FieldType.STRING, required=True),
- FieldSpec(field_name="invoice_date", field_type=FieldType.STRING,
- format=StandardFormat.DATE, required=True),
- FieldSpec(field_name="supplier_name", field_type=FieldType.STRING, required=True),
- FieldSpec(
- field_name="supplier_vat",
- field_type=FieldType.STRING,
+ FieldGroup(name="header", fields=[
+ Field(name="invoice_number", type=FieldType.STRING, required=True),
+ Field(name="invoice_date", type=FieldType.STRING,
+ format=StandardFormat.DATE, required=True),
+ Field(name="supplier_name", type=FieldType.STRING, required=True),
+ Field(
+ name="supplier_vat",
+ type=FieldType.STRING,
required=True,
- standard_validators=[
- StandardValidatorSpec(
- type=StandardValidatorType.VAT_ID, params={"country": "ES"},
- ),
- ],
+ validators=[ValidatorSpec(
+ name=ValidatorType.VAT_ID, params={"country": "ES"}
+ )],
),
- FieldSpec(field_name="supplier_iban", field_type=FieldType.STRING,
- standard_validators=[
- StandardValidatorSpec(type=StandardValidatorType.IBAN),
- ]),
- ),
- FieldGroup.of("totals",
- FieldSpec(field_name="subtotal", field_type=FieldType.NUMBER, required=True, minimum=0.0),
- FieldSpec(field_name="tax_amount", field_type=FieldType.NUMBER, required=True, minimum=0.0),
- FieldSpec(field_name="total_amount", field_type=FieldType.NUMBER, required=True, minimum=0.0),
- FieldSpec(field_name="currency", field_type=FieldType.STRING, required=True,
- standard_validators=[
- StandardValidatorSpec(type=StandardValidatorType.CURRENCY_CODE),
- ]),
- ),
- FieldGroup.of("line_items_block",
- FieldSpec(
- field_name="line_items",
- field_type=FieldType.ARRAY,
- items=[
- FieldItem(field_name="description", field_type=FieldType.STRING),
- FieldItem(field_name="quantity", field_type=FieldType.NUMBER, minimum=0),
- FieldItem(field_name="unit_price", field_type=FieldType.NUMBER, minimum=0),
- FieldItem(field_name="line_total", field_type=FieldType.NUMBER, minimum=0),
- ],
+ Field(
+ name="supplier_iban",
+ type=FieldType.STRING,
+ validators=[ValidatorSpec(name=ValidatorType.IBAN)],
),
- ),
+ ]),
+ FieldGroup(name="totals", fields=[
+ Field(name="subtotal", type=FieldType.NUMBER, required=True, minimum=0.0),
+ Field(name="tax_amount", type=FieldType.NUMBER, required=True, minimum=0.0),
+ Field(name="total_amount", type=FieldType.NUMBER, required=True, minimum=0.0),
+ Field(
+ name="currency",
+ type=FieldType.STRING,
+ required=True,
+ validators=[ValidatorSpec(name=ValidatorType.CURRENCY_CODE)],
+ ),
+ ]),
+ FieldGroup(name="line_items_block", fields=[
+ Field(
+ name="line_items",
+ type=FieldType.ARRAY,
+ items=Field(
+ name="row",
+ type=FieldType.OBJECT,
+ fields=[
+ Field(name="description", type=FieldType.STRING),
+ Field(name="quantity", type=FieldType.NUMBER, minimum=0),
+ Field(name="unit_price", type=FieldType.NUMBER, minimum=0),
+ Field(name="line_total", type=FieldType.NUMBER, minimum=0),
+ ],
+ ),
+ ),
+ ]),
],
)
@@ -834,31 +865,33 @@ rules = [
RuleSpec(
id="totals_consistent",
predicate="subtotal + tax_amount equals total_amount within 0.01",
- parents=[RuleFieldParent(document_type="invoice",
- field_names=["subtotal", "tax_amount", "total_amount"])],
+ parents=[RuleFieldParent(
+ document_type="invoice",
+ fields=["subtotal", "tax_amount", "total_amount"],
+ )],
),
RuleSpec(
id="vat_id_valid",
- predicate="The supplier_vat field passes the VAT_ID validator",
- parents=[RuleValidatorParent(document_type="invoice", validator_name="vat_id")],
+ predicate="The supplier_vat field passes the vat_id validator",
+ parents=[RuleValidatorParent(document_type="invoice", validator="vat_id")],
),
RuleSpec(
id="invoice_acceptable",
predicate="totals_consistent AND vat_id_valid",
parents=[
- RuleRuleParent(rule_id="totals_consistent"),
- RuleRuleParent(rule_id="vat_id_valid"),
+ RuleRuleParent(rule="totals_consistent"),
+ RuleRuleParent(rule="vat_id_valid"),
],
),
]
async def main(invoice_path: str) -> None:
- async with AsyncFlydocsClient("http://localhost:8400") as flydocs:
- submit = await flydocs.submit_job(
- SubmitJobRequest(
- documents=[DocumentInput.from_path(invoice_path)],
- docs=[invoice],
+ async with AsyncClient("http://localhost:8400") as flydocs:
+ ext = await flydocs.extractions.create(
+ SubmitExtractionRequest(
+ files=[FileInput.from_path(invoice_path)],
+ document_types=[invoice],
rules=rules,
options=ExtractionOptions(
language_hint="es",
@@ -872,10 +905,12 @@ async def main(invoice_path: str) -> None:
rule_engine=True,
transform=True,
),
- escalation_threshold=0.25,
- escalation_model="anthropic:claude-opus-4-7",
+ escalation=EscalationConfig(
+ threshold=0.25,
+ model="anthropic:claude-opus-4-7",
+ ),
transformations=[
- entity_resolution(
+ EntityResolutionTransformation(
target_group="line_items",
match_by=["description"],
scope=TransformationScope.TASK,
@@ -890,15 +925,17 @@ async def main(invoice_path: str) -> None:
correlation_id="req-12345",
)
- final = await flydocs.wait_for_completion(submit.job_id, poll_interval=2.0, timeout=900.0)
- if final.status != JobStatus.SUCCEEDED:
- raise SystemExit(f"job did not succeed: {final.status} {final.error_message}")
+ final = await flydocs.wait_for_completion(ext.id, poll_interval=2.0, timeout=900.0)
+ if final.status != ExtractionStatus.SUCCEEDED:
+ err_msg = final.error.message if final.error else ""
+ raise SystemExit(f"extraction did not succeed: {final.status.value} {err_msg}")
- result = (await flydocs.get_job_result(submit.job_id)).result
+ envelope = await flydocs.extractions.get_result(ext.id)
+ result = envelope.result
for rr in result.rule_results:
- print(f" rule {rr['rule_id']}: {rr['output']}")
- for line in result.documents[0]["fields"]:
- print(line["fieldGroupName"], "→", len(line["fieldGroupFields"]), "fields")
+ print(f" rule {rr.rule_id}: {rr.output}")
+ for group in result.documents[0].field_groups:
+ print(group.name, "→", len(group.fields), "fields")
asyncio.run(main("invoice.pdf"))
@@ -908,25 +945,25 @@ asyncio.run(main("invoice.pdf"))
## 15. Synchronous facade (when async isn't an option)
-For scripts, batch tools, and callers that can't run an event loop, `FlydocsClient` wraps `AsyncFlydocsClient` on a dedicated background loop:
+For scripts, batch tools, and callers that can't run an event loop, `Client` wraps `AsyncClient` on a dedicated background loop:
```python
-from flydocs_sdk import FlydocsClient
+from flydocs_sdk import Client
-with FlydocsClient("http://localhost:8400") as flydocs:
+with Client("http://localhost:8400") as flydocs:
result = flydocs.extract(req)
```
-Method-for-method identical to `AsyncFlydocsClient`, just without `await`. Prefer the async client whenever you can — the sync wrapper costs you one extra event loop per instance.
+Method-for-method identical to `AsyncClient`, just without `await`. Prefer the async client whenever you can — the sync wrapper costs you one extra event loop per instance.
---
## Further reading
- [`QUICKSTART.md`](./QUICKSTART.md) — 5-minute zero-to-first-extraction.
-- [`examples/`](./examples/) — six runnable scripts mirroring each section above.
+- [`examples/`](./examples/) — six runnable scripts.
+- [`docs/migration-v0-to-v1.md`](../../docs/migration-v0-to-v1.md) — complete rename / reshape table for v0 callers.
- [`docs/api-reference.md`](../../docs/api-reference.md) — full HTTP wire contract.
- [`docs/pipeline.md`](../../docs/pipeline.md) — stage DAG internals.
- [`docs/rule-engine.md`](../../docs/rule-engine.md) — rule engine semantics + DAG resolution.
-- [`docs/standard-validators.md`](../../docs/standard-validators.md) — per-validator algorithm references.
- [`docs/transformations.md`](../../docs/transformations.md) — the `transform` stage internals.
diff --git a/sdks/python/examples/01_first_extraction.py b/sdks/python/examples/01_first_extraction.py
index 2ecf65a..a7d22a2 100644
--- a/sdks/python/examples/01_first_extraction.py
+++ b/sdks/python/examples/01_first_extraction.py
@@ -1,4 +1,9 @@
-"""Hello, flydocs — the smallest async-first runnable example.
+"""Hello, flydocs -- the smallest async-first runnable example (v1 contract).
+
+What it shows:
+ * Build a typed :class:`DocumentTypeSpec` for a single document type.
+ * Submit one file via the v1 ``files`` / ``document_types`` keys.
+ * Walk the new response shape (``documents[*].field_groups[*].fields``).
Run from the repo root, with a flydocs service reachable at
``http://localhost:8400`` (e.g. via ``task docker:up:test``)::
@@ -13,44 +18,45 @@
from pathlib import Path
from flydocs_sdk import (
- AsyncFlydocsClient,
- DocSpec,
- DocumentInput,
+ AsyncClient,
+ DocumentTypeSpec,
ExtractionRequest,
+ Field,
FieldGroup,
- FieldSpec,
FieldType,
+ FileInput,
)
async def main(path: Path) -> int:
- invoice = DocSpec(
- doc_type={"documentType": "invoice"},
+ invoice = DocumentTypeSpec(
+ id="invoice",
field_groups=[
- FieldGroup.of(
- "totals",
- FieldSpec(field_name="total_amount", field_type=FieldType.NUMBER, required=True),
- FieldSpec(field_name="currency", field_type=FieldType.STRING, required=True),
- )
+ FieldGroup(
+ name="totals",
+ fields=[
+ Field(name="total_amount", type=FieldType.NUMBER, required=True),
+ Field(name="currency", type=FieldType.STRING, required=True),
+ ],
+ ),
],
)
- async with AsyncFlydocsClient("http://localhost:8400") as flydocs:
+ async with AsyncClient("http://localhost:8400") as flydocs:
result = await flydocs.extract(
ExtractionRequest(
- documents=[DocumentInput.from_path(path)],
- docs=[invoice],
+ files=[FileInput.from_path(path)],
+ document_types=[invoice],
)
)
- print(f"model={result.model} latency={result.latency_ms}ms")
+ # In v1, model + latency live under ``pipeline``.
+ print(f"id={result.id} model={result.pipeline.model} latency={result.pipeline.latency_ms}ms")
for doc in result.documents:
- for group in doc["fields"]:
- for field in group["fieldGroupFields"]:
- print(
- f" {field['name']:>15} = {field.get('value')!r:>20} "
- f"conf={field.get('confidence', 0):.2f}"
- )
+ for group in doc.field_groups:
+ for field in group.fields:
+ value = field.value if field.value is not None else ""
+ print(f" {field.name:>15} = {value!r:>20} conf={field.confidence:.2f}")
return 0
diff --git a/sdks/python/examples/02_typed_schema_and_rules.py b/sdks/python/examples/02_typed_schema_and_rules.py
index e8df5c7..d9ff6d6 100644
--- a/sdks/python/examples/02_typed_schema_and_rules.py
+++ b/sdks/python/examples/02_typed_schema_and_rules.py
@@ -1,8 +1,16 @@
-"""A realistic invoice extraction: typed DocSpec, validators, rules, dry-run.
+"""A realistic invoice extraction: typed DocumentTypeSpec + validators + rules + dry-run.
+
+What it shows:
+ * The full v1 ``DocumentTypeSpec`` shape with validators on fields.
+ * The new rule discriminator (``kind`` instead of ``parentType``).
+ * The dry-run :meth:`AsyncClient.validate` returning a typed
+ :class:`ValidationResponse` instead of a raw dict.
+ * Reading per-rule results through the typed :class:`RuleResult`.
Run from the repo root::
- uv run python sdks/python/examples/02_typed_schema_and_rules.py path/to/invoice.pdf
+ PYTHONPATH=sdks/python/examples \
+ uv run python sdks/python/examples/02_typed_schema_and_rules.py path/to/invoice.pdf
"""
from __future__ import annotations
@@ -11,21 +19,21 @@
import sys
from pathlib import Path
-from examples_helpers import INVOICE_DOC_SPEC, INVOICE_RULES # type: ignore[import-not-found]
+from examples_helpers import INVOICE_DOCUMENT_TYPE, INVOICE_RULES # type: ignore[import-not-found]
from flydocs_sdk import (
- AsyncFlydocsClient,
- DocumentInput,
+ AsyncClient,
ExtractionOptions,
ExtractionRequest,
+ FileInput,
StageToggles,
)
async def main(path: Path) -> int:
req = ExtractionRequest(
- documents=[DocumentInput.from_path(path)],
- docs=[INVOICE_DOC_SPEC],
+ files=[FileInput.from_path(path)],
+ document_types=[INVOICE_DOCUMENT_TYPE],
rules=INVOICE_RULES,
options=ExtractionOptions(
language_hint="es",
@@ -38,20 +46,20 @@ async def main(path: Path) -> int:
),
),
)
- async with AsyncFlydocsClient("http://localhost:8400") as flydocs:
- # Dry-run the semantic validator first.
+ async with AsyncClient("http://localhost:8400") as flydocs:
report = await flydocs.validate(req)
- if not report["ok"]:
+ if not report.ok:
print("semantic validation failed:")
- for err in report["errors"]:
- print(f" {err['path']}: {err['message']}")
+ for err in report.errors:
+ print(f" {err.get('path', '?')}: {err.get('message', err)}")
return 1
result = await flydocs.extract(req, correlation_id="examples:02")
- print(f"model={result.model} latency={result.latency_ms}ms")
+ print(f"id={result.id} model={result.pipeline.model} latency={result.pipeline.latency_ms}ms")
for rr in result.rule_results:
- print(f" rule {rr['rule_id']:>20} = {rr['output']} {rr.get('summary', '')}")
+ suffix = f" {rr.summary}" if rr.summary else ""
+ print(f" rule {rr.rule_id:>22} = {rr.output}{suffix}")
return 0
diff --git a/sdks/python/examples/03_async_extraction_with_wait.py b/sdks/python/examples/03_async_extraction_with_wait.py
new file mode 100644
index 0000000..90cc189
--- /dev/null
+++ b/sdks/python/examples/03_async_extraction_with_wait.py
@@ -0,0 +1,79 @@
+"""Submit an async extraction, poll until terminal, fetch the result envelope.
+
+What it shows:
+ * The new ``POST /api/v1/extractions`` endpoint (was ``POST /api/v1/jobs``).
+ * The :class:`SubmitExtractionRequest` shape (files + document types + callback URL).
+ * The :attr:`AsyncClient.extractions` sub-resource (``create`` /
+ ``get`` / ``get_result``).
+ * :meth:`AsyncClient.wait_for_completion` polling :class:`ExtractionStatus`
+ until a terminal state is reached.
+ * Reading the result envelope: ``envelope.result.pipeline.latency_ms``,
+ ``envelope.result.documents[*].field_groups[*].fields``.
+
+The legacy "PARTIAL_SUCCEEDED" / "REFINING_BBOXES" intermediate states are
+gone in v1: an extraction reaches ``succeeded`` the moment the main pipeline
+finishes, and bbox refinement runs as additive post-processing.
+
+Run from the repo root::
+
+ PYTHONPATH=sdks/python/examples \
+ uv run python sdks/python/examples/03_async_extraction_with_wait.py path/to/document.pdf
+"""
+
+from __future__ import annotations
+
+import asyncio
+import sys
+from pathlib import Path
+
+from examples_helpers import INVOICE_DOCUMENT_TYPE, INVOICE_RULES # type: ignore[import-not-found]
+
+from flydocs_sdk import (
+ AsyncClient,
+ ExtractionStatus,
+ FileInput,
+ SubmitExtractionRequest,
+)
+
+
+async def main(path: Path) -> int:
+ async with AsyncClient("http://localhost:8400", timeout=30.0) as flydocs:
+ ext = await flydocs.extractions.create(
+ SubmitExtractionRequest(
+ files=[FileInput.from_path(path)],
+ document_types=[INVOICE_DOCUMENT_TYPE],
+ rules=INVOICE_RULES,
+ callback_url="https://your-app.example.com/flydocs/webhook",
+ metadata={"caller": "examples:03"},
+ ),
+ idempotency_key=f"examples:03:{path.name}",
+ )
+ print(f"queued {ext.id} ({ext.status.value})")
+
+ final = await flydocs.wait_for_completion(
+ ext.id,
+ poll_interval=2.0,
+ timeout=600.0,
+ )
+
+ if final.status == ExtractionStatus.SUCCEEDED:
+ envelope = await flydocs.extractions.get_result(ext.id)
+ result = envelope.result
+ print(f"done: {len(result.documents)} document(s), {result.pipeline.latency_ms}ms")
+ return 0
+ err = final.error
+ if err is not None:
+ print(f"extraction did not succeed: {final.status.value} {err.code}: {err.message}")
+ else:
+ print(f"extraction did not succeed: {final.status.value}")
+ return 1
+
+
+if __name__ == "__main__":
+ if len(sys.argv) != 2:
+ print(
+ "usage: python 03_async_extraction_with_wait.py path/to/document.pdf",
+ file=sys.stderr,
+ )
+ sys.exit(2)
+ sys.exit(asyncio.run(main(Path(sys.argv[1]))))
diff --git a/sdks/python/examples/03_async_job_with_wait.py b/sdks/python/examples/03_async_job_with_wait.py
deleted file mode 100644
index 912728f..0000000
--- a/sdks/python/examples/03_async_job_with_wait.py
+++ /dev/null
@@ -1,67 +0,0 @@
-"""Submit an async job, poll until terminal, fetch the result.
-
-The same request shape as ``extract``, just driven through the queue.
-Use this for long-running workloads, batches, or anywhere you'd like
-the worker to deliver via webhook.
-
- uv run python sdks/python/examples/03_async_job_with_wait.py path/to/document.pdf
-"""
-
-from __future__ import annotations
-
-import asyncio
-import sys
-from pathlib import Path
-
-from examples_helpers import INVOICE_DOC_SPEC, INVOICE_RULES # type: ignore[import-not-found]
-
-from flydocs_sdk import (
- AsyncFlydocsClient,
- DocumentInput,
- JobStatus,
- SubmitJobRequest,
-)
-
-
-async def main(path: Path) -> int:
- async with AsyncFlydocsClient("http://localhost:8400", timeout=30.0) as flydocs:
- submit = await flydocs.submit_job(
- SubmitJobRequest(
- documents=[DocumentInput.from_path(path)],
- docs=[INVOICE_DOC_SPEC],
- rules=INVOICE_RULES,
- callback_url="https://your-app.example.com/flydocs/webhook",
- metadata={"caller": "examples:03"},
- ),
- idempotency_key=f"examples:03:{path.name}",
- )
- print(f"queued {submit.job_id} ({submit.status})")
-
- final = await flydocs.wait_for_completion(
- submit.job_id,
- poll_interval=2.0,
- timeout=600.0,
- )
-
- if final.status == JobStatus.SUCCEEDED:
- result = (await flydocs.get_job_result(submit.job_id)).result
- print(f"done: {len(result.documents)} document(s), {result.latency_ms}ms")
- return 0
- if final.status == JobStatus.PARTIAL_SUCCEEDED:
- result = (await flydocs.get_job_result(submit.job_id)).result
- print(
- f"partial: {len(result.documents)} document(s), "
- f"{len(result.pipeline_errors)} non-fatal errors"
- )
- for err in result.pipeline_errors:
- print(f" - {err}")
- return 0
- print(f"job did not succeed: {final.status} {final.error_code} {final.error_message}")
- return 1
-
-
-if __name__ == "__main__":
- if len(sys.argv) != 2:
- print("usage: python 03_async_job_with_wait.py path/to/document.pdf", file=sys.stderr)
- sys.exit(2)
- sys.exit(asyncio.run(main(Path(sys.argv[1]))))
diff --git a/sdks/python/examples/04_webhook_receiver_fastapi.py b/sdks/python/examples/04_webhook_receiver_fastapi.py
index 0c01a1f..8ebc3a3 100644
--- a/sdks/python/examples/04_webhook_receiver_fastapi.py
+++ b/sdks/python/examples/04_webhook_receiver_fastapi.py
@@ -1,5 +1,14 @@
"""A FastAPI app that receives flydocs webhooks and verifies them.
+What it shows:
+ * Verifying an incoming HMAC-signed body with :class:`WebhookVerifier`.
+ * Parsing the typed :class:`EventEnvelope` returned by ``verifier.verify``.
+ * Switching on the four v1 event types
+ (``extraction.submitted`` / ``extraction.completed`` /
+ ``extraction.post_processing.requested`` /
+ ``extraction.post_processing.completed``).
+ * Reading the v1 ``Extraction`` + nested ``ExtractionResult`` shape.
+
Run it::
FLYDOCS_WEBHOOK_HMAC_SECRET=topsecret \
@@ -16,8 +25,11 @@
from fastapi.responses import JSONResponse
from flydocs_sdk import (
- JobStatus,
- JobWebhookPayload,
+ EVENT_TYPE_EXTRACTION_COMPLETED,
+ EVENT_TYPE_EXTRACTION_POST_PROCESSING_COMPLETED,
+ EVENT_TYPE_EXTRACTION_POST_PROCESSING_REQUESTED,
+ EVENT_TYPE_EXTRACTION_SUBMITTED,
+ ExtractionStatus,
WebhookVerificationError,
WebhookVerifier,
)
@@ -33,16 +45,25 @@ async def on_webhook(request: Request) -> JSONResponse:
body = await request.body()
signature = request.headers.get("X-Flydocs-Signature", "")
try:
- verifier.verify(body, signature)
+ envelope = verifier.verify(body, signature)
except WebhookVerificationError as exc:
raise HTTPException(status_code=403, detail=str(exc)) from exc
- payload = JobWebhookPayload.model_validate_json(body)
- if payload.status == JobStatus.SUCCEEDED and payload.result is not None:
- for doc in payload.result.documents:
- # persist extracted fields, kick off downstream work, ...
- print(f" {doc.get('document_type')}: {len(doc.get('fields', []))} field groups")
- elif payload.status == JobStatus.FAILED:
- print(f"job {payload.job_id} failed: {payload.error_code} {payload.error_message}")
+ ext = envelope.extraction
+ if envelope.event_type == EVENT_TYPE_EXTRACTION_SUBMITTED:
+ print(f"submitted: {ext.id}")
+ elif envelope.event_type == EVENT_TYPE_EXTRACTION_COMPLETED:
+ if ext.status == ExtractionStatus.SUCCEEDED and envelope.result is not None:
+ for doc in envelope.result.documents:
+ groups = doc.field_groups
+ print(f" succeeded {ext.id}: {doc.type} -> {len(groups)} field groups")
+ elif ext.status == ExtractionStatus.FAILED and ext.error is not None:
+ print(f" failed {ext.id}: {ext.error.code} {ext.error.message}")
+ elif ext.status == ExtractionStatus.CANCELLED:
+ print(f" cancelled {ext.id}")
+ elif envelope.event_type == EVENT_TYPE_EXTRACTION_POST_PROCESSING_REQUESTED:
+ print(f"post-processing requested for {ext.id}")
+ elif envelope.event_type == EVENT_TYPE_EXTRACTION_POST_PROCESSING_COMPLETED:
+ print(f"post-processing completed for {ext.id}")
return JSONResponse({"ok": True})
diff --git a/sdks/python/examples/05_error_handling.py b/sdks/python/examples/05_error_handling.py
index 1392864..2f3042b 100644
--- a/sdks/python/examples/05_error_handling.py
+++ b/sdks/python/examples/05_error_handling.py
@@ -1,10 +1,20 @@
-"""Branching on RFC 7807 ``code`` for graceful fallback.
+"""Branching on RFC 7807 ``code`` for graceful fallback (v1 codes).
-Tries the sync extraction path first, falls back to the async job
-queue when the service signals an extraction timeout, and surfaces
-semantic validation errors in a way the caller can act on.
+What it shows:
+ * Catching :class:`FlydocsHttpError` and branching on the v1 ``code``
+ field (``timeout`` instead of v0 ``extraction_timeout``,
+ ``file_too_large`` instead of v0 ``document_too_large``, ...).
+ * Falling back from the sync ``extract`` endpoint to the async
+ ``extractions.create`` queue when the pipeline hits the sync timeout.
+ * Surfacing the ``validation_failed`` 422 body so callers can show
+ the validator's findings in their UI.
+ * Distinguishing transport timeouts (:class:`FlydocsTimeoutError`)
+ from server-side ``timeout`` problem-details.
- uv run python sdks/python/examples/05_error_handling.py path/to/document.pdf
+Run from the repo root::
+
+ PYTHONPATH=sdks/python/examples \
+ uv run python sdks/python/examples/05_error_handling.py path/to/document.pdf
"""
from __future__ import annotations
@@ -13,44 +23,50 @@
import sys
from pathlib import Path
-from examples_helpers import INVOICE_DOC_SPEC # type: ignore[import-not-found]
+from examples_helpers import INVOICE_DOCUMENT_TYPE # type: ignore[import-not-found]
from flydocs_sdk import (
- AsyncFlydocsClient,
- DocumentInput,
+ AsyncClient,
ExtractionRequest,
+ ExtractionStatus,
+ FileInput,
FlydocsClientError,
- FlydocsHTTPError,
+ FlydocsHttpError,
FlydocsTimeoutError,
- SubmitJobRequest,
+ SubmitExtractionRequest,
)
async def main(path: Path) -> int:
req = ExtractionRequest(
- documents=[DocumentInput.from_path(path)],
- docs=[INVOICE_DOC_SPEC],
+ files=[FileInput.from_path(path)],
+ document_types=[INVOICE_DOCUMENT_TYPE],
)
- async with AsyncFlydocsClient("http://localhost:8400") as flydocs:
+ async with AsyncClient("http://localhost:8400") as flydocs:
try:
result = await flydocs.extract(req)
- print(f"extracted in sync: latency={result.latency_ms}ms")
+ print(f"extracted in sync: latency={result.pipeline.latency_ms}ms")
return 0
- except FlydocsHTTPError as exc:
- if exc.code == "extraction_timeout":
+ except FlydocsHttpError as exc:
+ if exc.code == "timeout":
print("sync ceiling exceeded; falling back to async")
- submit = await flydocs.submit_job(SubmitJobRequest(**req.model_dump()))
- final = await flydocs.wait_for_completion(submit.job_id, timeout=600.0)
- print(f"async result: {final.status} {final.error_message or ''}")
- return 0 if str(final.status).startswith("SUCCE") else 1
- if exc.code == "document_too_large":
+ submit_payload = SubmitExtractionRequest(**req.model_dump())
+ ext = await flydocs.extractions.create(submit_payload)
+ final = await flydocs.wait_for_completion(ext.id, timeout=600.0)
+ err = final.error
+ print(f"async result: {final.status.value} {(err.message if err else '')}".rstrip())
+ return 0 if final.status == ExtractionStatus.SUCCEEDED else 1
+ if exc.code == "file_too_large":
print(f"413: {exc.detail}")
return 2
- if exc.code == "invalid_request":
- print("422 invalid_request:")
+ if exc.code in ("validation_failed", "invalid_request"):
+ print(f"422 {exc.code}:")
for issue in exc.payload.get("errors", []):
print(f" - {issue}")
return 2
+ if exc.code == "invalid_base64":
+ print(f"422 invalid_base64: {exc.detail}")
+ return 2
print(f"HTTP {exc.status_code} {exc.code}: {exc.detail}")
return 1
except FlydocsTimeoutError:
diff --git a/sdks/python/examples/06_sync_facade.py b/sdks/python/examples/06_sync_facade.py
index c08754d..dcc6a80 100644
--- a/sdks/python/examples/06_sync_facade.py
+++ b/sdks/python/examples/06_sync_facade.py
@@ -1,11 +1,20 @@
"""Synchronous facade -- for callers that can't run an event loop.
-``FlydocsClient`` wraps ``AsyncFlydocsClient`` on a dedicated background
-event loop. The API surface is identical, just without ``await``. Use
-the async client whenever you can; this is the script / cron-job /
+What it shows:
+ * :class:`flydocs_sdk.Client` mirrors :class:`flydocs_sdk.AsyncClient`
+ method-for-method without ``await``.
+ * The new v1 request shape (``files`` + ``document_types``) and
+ response shape (``result.pipeline.model`` / ``result.pipeline.latency_ms``).
+
+``Client`` wraps ``AsyncClient`` on a dedicated background event loop.
+The API surface is identical, just without ``await``. Use the async
+client whenever you can; this is the script / cron-job /
synchronous-codebase escape hatch.
- uv run python sdks/python/examples/06_sync_facade.py path/to/invoice.pdf
+Run from the repo root::
+
+ PYTHONPATH=sdks/python/examples \
+ uv run python sdks/python/examples/06_sync_facade.py path/to/invoice.pdf
"""
from __future__ import annotations
@@ -13,24 +22,24 @@
import sys
from pathlib import Path
-from examples_helpers import INVOICE_DOC_SPEC # type: ignore[import-not-found]
+from examples_helpers import INVOICE_DOCUMENT_TYPE # type: ignore[import-not-found]
from flydocs_sdk import (
- DocumentInput,
+ Client,
ExtractionRequest,
- FlydocsClient,
+ FileInput,
)
def main(path: Path) -> int:
- with FlydocsClient("http://localhost:8400") as flydocs:
+ with Client("http://localhost:8400") as flydocs:
result = flydocs.extract(
ExtractionRequest(
- documents=[DocumentInput.from_path(path)],
- docs=[INVOICE_DOC_SPEC],
+ files=[FileInput.from_path(path)],
+ document_types=[INVOICE_DOCUMENT_TYPE],
)
)
- print(f"sync: model={result.model} latency={result.latency_ms}ms")
+ print(f"sync: id={result.id} model={result.pipeline.model} latency={result.pipeline.latency_ms}ms")
return 0
diff --git a/sdks/python/examples/README.md b/sdks/python/examples/README.md
index 9004218..6c61c4d 100644
--- a/sdks/python/examples/README.md
+++ b/sdks/python/examples/README.md
@@ -1,15 +1,15 @@
-# flydocs Python SDK — Examples
+# flydocs Python SDK — Examples (v1 contract)
Runnable async-first scripts exercising every capability from the [TUTORIAL](../TUTORIAL.md). Each example is self-contained except for shared fixtures in `examples_helpers.py`.
-| # | Script | What it shows |
-|---|-------------------------------------------------------------------|------------------------------------------------------------------|
-| 1 | [`01_first_extraction.py`](./01_first_extraction.py) | Smallest async extraction, hand-written `DocSpec`. |
-| 2 | [`02_typed_schema_and_rules.py`](./02_typed_schema_and_rules.py) | Realistic invoice schema with validators + business rules + dry-run validate. |
-| 3 | [`03_async_job_with_wait.py`](./03_async_job_with_wait.py) | Async job submission + `wait_for_completion` + `get_job_result`. |
-| 4 | [`04_webhook_receiver_fastapi.py`](./04_webhook_receiver_fastapi.py) | FastAPI app that verifies `X-Flydocs-Signature` and unpacks the payload. |
-| 5 | [`05_error_handling.py`](./05_error_handling.py) | RFC 7807 typed errors and sync→async fallback on `extraction_timeout`. |
-| 6 | [`06_sync_facade.py`](./06_sync_facade.py) | The synchronous facade (`FlydocsClient`) for non-async callers. |
+| # | Script | What it shows |
+|---|------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------|
+| 1 | [`01_first_extraction.py`](./01_first_extraction.py) | Smallest async extraction with a hand-written `DocumentTypeSpec`. |
+| 2 | [`02_typed_schema_and_rules.py`](./02_typed_schema_and_rules.py) | Realistic invoice schema with validators + business rules + dry-run `validate`. |
+| 3 | [`03_async_extraction_with_wait.py`](./03_async_extraction_with_wait.py) | Submit via `extractions.create`, poll `wait_for_completion`, fetch with `extractions.get_result`. |
+| 4 | [`04_webhook_receiver_fastapi.py`](./04_webhook_receiver_fastapi.py) | FastAPI app that verifies `X-Flydocs-Signature` and dispatches on the four v1 event types. |
+| 5 | [`05_error_handling.py`](./05_error_handling.py) | Branching on RFC 7807 v1 codes (`timeout`, `file_too_large`, `validation_failed`) + sync→async fallback. |
+| 6 | [`06_sync_facade.py`](./06_sync_facade.py) | The synchronous facade (`Client`) for non-async callers. |
## Running
@@ -21,7 +21,18 @@ uv run python sdks/python/examples/01_first_extraction.py path/to/invoice.pdf
PYTHONPATH=sdks/python/examples \
uv run python sdks/python/examples/02_typed_schema_and_rules.py path/to/invoice.pdf
PYTHONPATH=sdks/python/examples \
- uv run python sdks/python/examples/03_async_job_with_wait.py path/to/invoice.pdf
+ uv run python sdks/python/examples/03_async_extraction_with_wait.py path/to/invoice.pdf
```
The mock LLM accepts any document and returns a fixed schema-compatible response, so the examples work end-to-end without an Anthropic / OpenAI key.
+
+## v0 → v1 notes
+
+If you are migrating from the v0 SDK, these are the breaking changes the examples exercise:
+
+* Request bodies use `files` (was `documents`) and `document_types` (was `docs`).
+* Responses nest `model` / `latency_ms` / `trace` under `result.pipeline`.
+* `documents[*].field_groups[*].fields` replaces the v0 `documents[*]["fields"][*]["fieldGroupFields"]` dict walk.
+* Async endpoints live under `/api/v1/extractions` (was `/api/v1/jobs`).
+* Statuses are lowercase (`queued`, `succeeded`, ...).
+* Webhook payloads use a single `EventEnvelope` with a dotted `event_type` (`extraction.completed`, ...).
diff --git a/sdks/python/examples/examples_helpers.py b/sdks/python/examples/examples_helpers.py
index 63bfe1e..a3780d4 100644
--- a/sdks/python/examples/examples_helpers.py
+++ b/sdks/python/examples/examples_helpers.py
@@ -1,68 +1,84 @@
"""Shared test data for the example scripts.
-Defines an invoice ``DocSpec`` and rule set that the other examples
-reuse, so each script stays focused on one capability instead of
-re-defining the schema.
+Defines a v1 invoice :class:`DocumentTypeSpec` and rule set that the
+other examples reuse, so each script stays focused on one capability
+instead of re-defining the schema. Add this directory to ``PYTHONPATH``
+when running examples that import from it.
"""
from __future__ import annotations
from flydocs_sdk import (
- DocSpec,
- DocType,
+ DocumentTypeSpec,
+ Field,
FieldGroup,
- FieldItem,
- FieldSpec,
FieldType,
RuleFieldParent,
RuleRuleParent,
RuleSpec,
RuleValidatorParent,
StandardFormat,
- StandardValidatorSpec,
- StandardValidatorType,
+ ValidatorSpec,
+ ValidatorType,
)
-INVOICE_DOC_SPEC = DocSpec(
- doc_type=DocType(document_type="invoice", description="Vendor invoice", country="ES"),
+INVOICE_DOCUMENT_TYPE = DocumentTypeSpec(
+ id="invoice",
+ description="Vendor invoice",
+ country="ES",
field_groups=[
- FieldGroup.of(
- "header",
- FieldSpec(field_name="invoice_number", field_type=FieldType.STRING, required=True),
- FieldSpec(
- field_name="invoice_date",
- field_type=FieldType.STRING,
- format=StandardFormat.DATE,
- required=True,
- ),
- FieldSpec(
- field_name="supplier_vat",
- field_type=FieldType.STRING,
- required=True,
- standard_validators=[
- StandardValidatorSpec(type=StandardValidatorType.VAT_ID, params={"country": "ES"}),
- ],
- ),
+ FieldGroup(
+ name="header",
+ fields=[
+ Field(name="invoice_number", type=FieldType.STRING, required=True),
+ Field(
+ name="invoice_date",
+ type=FieldType.STRING,
+ format=StandardFormat.DATE,
+ required=True,
+ ),
+ Field(
+ name="supplier_vat",
+ type=FieldType.STRING,
+ required=True,
+ validators=[
+ ValidatorSpec(name=ValidatorType.VAT_ID, params={"country": "ES"}),
+ ],
+ ),
+ ],
),
- FieldGroup.of(
- "totals",
- FieldSpec(field_name="subtotal", field_type=FieldType.NUMBER, required=True, minimum=0.0),
- FieldSpec(field_name="tax_amount", field_type=FieldType.NUMBER, required=True, minimum=0.0),
- FieldSpec(field_name="total_amount", field_type=FieldType.NUMBER, required=True, minimum=0.0),
- FieldSpec(field_name="currency", field_type=FieldType.STRING, required=True),
+ FieldGroup(
+ name="totals",
+ fields=[
+ Field(name="subtotal", type=FieldType.NUMBER, required=True, minimum=0.0),
+ Field(name="tax_amount", type=FieldType.NUMBER, required=True, minimum=0.0),
+ Field(name="total_amount", type=FieldType.NUMBER, required=True, minimum=0.0),
+ Field(
+ name="currency",
+ type=FieldType.STRING,
+ required=True,
+ validators=[ValidatorSpec(name=ValidatorType.CURRENCY_CODE)],
+ ),
+ ],
),
- FieldGroup.of(
- "line_items_block",
- FieldSpec(
- field_name="line_items",
- field_type=FieldType.ARRAY,
- items=[
- FieldItem(field_name="description", field_type=FieldType.STRING),
- FieldItem(field_name="quantity", field_type=FieldType.NUMBER),
- FieldItem(field_name="unit_price", field_type=FieldType.NUMBER),
- FieldItem(field_name="line_total", field_type=FieldType.NUMBER),
- ],
- ),
+ FieldGroup(
+ name="line_items_block",
+ fields=[
+ Field(
+ name="line_items",
+ type=FieldType.ARRAY,
+ items=Field(
+ name="row",
+ type=FieldType.OBJECT,
+ fields=[
+ Field(name="description", type=FieldType.STRING),
+ Field(name="quantity", type=FieldType.NUMBER, minimum=0),
+ Field(name="unit_price", type=FieldType.NUMBER, minimum=0),
+ Field(name="line_total", type=FieldType.NUMBER, minimum=0),
+ ],
+ ),
+ ),
+ ],
),
],
)
@@ -74,21 +90,21 @@
parents=[
RuleFieldParent(
document_type="invoice",
- field_names=["subtotal", "tax_amount", "total_amount"],
+ fields=["subtotal", "tax_amount", "total_amount"],
)
],
),
RuleSpec(
id="vat_id_valid",
- predicate="The supplier_vat field passes the VAT_ID validator",
- parents=[RuleValidatorParent(document_type="invoice", validator_name="vat_id")],
+ predicate="The supplier_vat field passes the vat_id validator",
+ parents=[RuleValidatorParent(document_type="invoice", validator="vat_id")],
),
RuleSpec(
id="invoice_acceptable",
predicate="totals_consistent AND vat_id_valid",
parents=[
- RuleRuleParent(rule_id="totals_consistent"),
- RuleRuleParent(rule_id="vat_id_valid"),
+ RuleRuleParent(rule="totals_consistent"),
+ RuleRuleParent(rule="vat_id_valid"),
],
),
]
diff --git a/sdks/python/pyproject.toml b/sdks/python/pyproject.toml
index 14e1462..5ef27fe 100644
--- a/sdks/python/pyproject.toml
+++ b/sdks/python/pyproject.toml
@@ -3,7 +3,7 @@ name = "flydocs-sdk"
# CalVer YY.MM.PP — bumped to match the service's release tag. Note that
# PEP 440 normalises ``26.05.01`` -> ``26.5.1`` in the built wheel
# filename; the git tag and Java artifact stay in the full zero-padded form.
-version = "26.5.2"
+version = "26.6.0"
description = "Official Python SDK for flydocs — pure-multimodal Intelligent Document Processing. Sync + async clients, typed RFC 7807 errors, and an HMAC webhook verifier."
readme = "README.md"
requires-python = ">=3.11"
diff --git a/sdks/python/src/flydocs_sdk/__init__.py b/sdks/python/src/flydocs_sdk/__init__.py
index ffc949b..9de63e4 100644
--- a/sdks/python/src/flydocs_sdk/__init__.py
+++ b/sdks/python/src/flydocs_sdk/__init__.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-"""Official Python SDK for flydocs.
+"""Official Python SDK for flydocs (v1 contract).
flydocs is a pure-multimodal Intelligent Document Processing service:
structured field extraction with bounding boxes, validation,
@@ -22,115 +22,217 @@
service's REST API, plus a synchronous wrapper for non-async code and a
helper for verifying outbound webhook signatures.
- from flydocs_sdk import FlydocsClient, DocumentInput, ExtractionRequest
+ from flydocs_sdk import (
+ Client, DocumentTypeSpec, ExtractionRequest, Field, FieldGroup,
+ FieldType, FileInput,
+ )
- client = FlydocsClient("http://localhost:8400")
- result = client.extract(
- ExtractionRequest(
- documents=[DocumentInput.from_path("invoice.pdf")],
- docs=[{"docType": {"documentType": "invoice"}, "groups": [...]}],
- )
+ invoice = DocumentTypeSpec(
+ id="invoice",
+ field_groups=[
+ FieldGroup(name="totals", fields=[
+ Field(name="total_amount", type=FieldType.NUMBER, required=True),
+ Field(name="currency", type=FieldType.STRING, required=True),
+ ]),
+ ],
)
- for doc in result.documents:
- for group in doc.fields:
- for field in group.field_group_fields:
- print(field.name, "=", field.value)
+
+ with Client("http://localhost:8400") as flydocs:
+ result = flydocs.extract(
+ ExtractionRequest(
+ files=[FileInput.from_path("invoice.pdf")],
+ document_types=[invoice],
+ )
+ )
"""
from flydocs_sdk._version import __version__
-from flydocs_sdk.async_client import AsyncFlydocsClient
-from flydocs_sdk.client import FlydocsClient
+from flydocs_sdk.async_client import AsyncClient, AsyncExtractionsResource
+from flydocs_sdk.client import Client, ExtractionsResource
from flydocs_sdk.errors import (
FlydocsAPIError,
FlydocsClientError,
FlydocsError,
FlydocsHTTPError,
+ FlydocsHttpError,
FlydocsTimeoutError,
+ ProblemDetails,
)
from flydocs_sdk.models import (
- DocumentInput,
+ ALL_EVENT_TYPES,
+ EVENT_TYPE_EXTRACTION_COMPLETED,
+ EVENT_TYPE_EXTRACTION_POST_PROCESSING_COMPLETED,
+ EVENT_TYPE_EXTRACTION_POST_PROCESSING_REQUESTED,
+ EVENT_TYPE_EXTRACTION_SUBMITTED,
+ BboxQuality,
+ BboxRefinementInfo,
+ BboxSource,
+ BoundingBox,
+ CheckStatus,
+ ClassificationInfo,
+ ContentAuthenticity,
+ ContentCoherenceCheck,
+ ContentIntegrityStatus,
+ Document,
+ DocumentAuthenticity,
+ DocumentTypeSpec,
+ EntityResolutionTransformation,
+ EscalationConfig,
+ EscalationInfo,
+ EventEnvelope,
+ ExtractedField,
+ ExtractedFieldGroup,
+ Extraction,
+ ExtractionError,
+ ExtractionListQuery,
+ ExtractionListResponse,
+ ExtractionOptions,
ExtractionRequest,
ExtractionResult,
- JobListResponse,
- JobResult,
- JobStatus,
- JobStatusResponse,
- JobWebhookPayload,
- SubmitJobRequest,
- SubmitJobResponse,
- VersionInfo,
-)
-from flydocs_sdk.request import (
- DocSpec,
- DocType,
- ExtractionOptions,
+ ExtractionResultEnvelope,
+ ExtractionStatus,
+ Field,
FieldGroup,
- FieldItem,
- FieldSpec,
FieldType,
+ FieldValidation,
+ FieldValidationError,
+ FileInput,
+ FileSummary,
+ JudgeOutcome,
+ JudgeStatus,
+ LlmTransformation,
+ PipelineError,
+ PipelineMeta,
+ PostProcessing,
+ PostProcessingStatus,
RuleFieldParent,
RuleOutputSpec,
+ RuleParent,
+ RuleResult,
RuleRuleParent,
RuleSpec,
RuleValidatorParent,
StageToggles,
StandardFormat,
- StandardValidatorSpec,
- StandardValidatorType,
+ SubmitExtractionRequest,
+ TraceEntry,
+ Transformation,
TransformationScope,
- ValidatorsSpec,
- VisualValidatorSpec,
- entity_resolution,
- llm_transformation,
+ UsageBreakdown,
+ ValidationResponse,
+ ValidationRule,
+ ValidatorSpec,
+ ValidatorType,
+ VersionInfo,
+ VisualCheck,
+ VisualCheckResult,
)
from flydocs_sdk.webhooks import WebhookVerificationError, WebhookVerifier
__all__ = [
"__version__",
+ # ------------------------------------------------------------------
# Clients
- "AsyncFlydocsClient",
- "FlydocsClient",
+ # ------------------------------------------------------------------
+ "AsyncClient",
+ "AsyncExtractionsResource",
+ "Client",
+ "ExtractionsResource",
+ # ------------------------------------------------------------------
# Errors
+ # ------------------------------------------------------------------
"FlydocsAPIError",
"FlydocsClientError",
"FlydocsError",
"FlydocsHTTPError",
+ "FlydocsHttpError",
"FlydocsTimeoutError",
- # Wire response / shared models
- "DocumentInput",
- "ExtractionRequest",
- "ExtractionResult",
- "JobListResponse",
- "JobResult",
- "JobStatus",
- "JobStatusResponse",
- "JobWebhookPayload",
- "SubmitJobRequest",
- "SubmitJobResponse",
- "VersionInfo",
- # Typed request schema
- "DocSpec",
- "DocType",
+ "ProblemDetails",
+ # ------------------------------------------------------------------
+ # Event-type constants
+ # ------------------------------------------------------------------
+ "ALL_EVENT_TYPES",
+ "EVENT_TYPE_EXTRACTION_COMPLETED",
+ "EVENT_TYPE_EXTRACTION_POST_PROCESSING_COMPLETED",
+ "EVENT_TYPE_EXTRACTION_POST_PROCESSING_REQUESTED",
+ "EVENT_TYPE_EXTRACTION_SUBMITTED",
+ # ------------------------------------------------------------------
+ # Wire models -- request side
+ # ------------------------------------------------------------------
+ "DocumentTypeSpec",
+ "EntityResolutionTransformation",
+ "EscalationConfig",
"ExtractionOptions",
+ "ExtractionRequest",
+ "Field",
"FieldGroup",
- "FieldItem",
- "FieldSpec",
"FieldType",
+ "FileInput",
+ "LlmTransformation",
"RuleFieldParent",
"RuleOutputSpec",
+ "RuleParent",
"RuleRuleParent",
"RuleSpec",
"RuleValidatorParent",
"StageToggles",
"StandardFormat",
- "StandardValidatorSpec",
- "StandardValidatorType",
+ "SubmitExtractionRequest",
+ "Transformation",
"TransformationScope",
- "ValidatorsSpec",
- "VisualValidatorSpec",
- "entity_resolution",
- "llm_transformation",
+ "ValidatorSpec",
+ "ValidatorType",
+ "VisualCheck",
+ # ------------------------------------------------------------------
+ # Wire models -- response side
+ # ------------------------------------------------------------------
+ "BboxQuality",
+ "BboxSource",
+ "BoundingBox",
+ "CheckStatus",
+ "ClassificationInfo",
+ "ContentAuthenticity",
+ "ContentCoherenceCheck",
+ "ContentIntegrityStatus",
+ "Document",
+ "DocumentAuthenticity",
+ "EscalationInfo",
+ "ExtractedField",
+ "ExtractedFieldGroup",
+ "ExtractionResult",
+ "FieldValidation",
+ "FieldValidationError",
+ "FileSummary",
+ "JudgeOutcome",
+ "JudgeStatus",
+ "PipelineError",
+ "PipelineMeta",
+ "RuleResult",
+ "TraceEntry",
+ "UsageBreakdown",
+ "ValidationResponse",
+ "ValidationRule",
+ "VisualCheckResult",
+ # ------------------------------------------------------------------
+ # Wire models -- extraction lifecycle
+ # ------------------------------------------------------------------
+ "BboxRefinementInfo",
+ "Extraction",
+ "ExtractionError",
+ "ExtractionListQuery",
+ "ExtractionListResponse",
+ "ExtractionResultEnvelope",
+ "ExtractionStatus",
+ "PostProcessing",
+ "PostProcessingStatus",
+ # ------------------------------------------------------------------
+ # Wire models -- identity + events
+ # ------------------------------------------------------------------
+ "EventEnvelope",
+ "VersionInfo",
+ # ------------------------------------------------------------------
# Webhooks
+ # ------------------------------------------------------------------
"WebhookVerificationError",
"WebhookVerifier",
]
diff --git a/sdks/python/src/flydocs_sdk/_transport.py b/sdks/python/src/flydocs_sdk/_transport.py
index e182522..b4f49a4 100644
--- a/sdks/python/src/flydocs_sdk/_transport.py
+++ b/sdks/python/src/flydocs_sdk/_transport.py
@@ -27,7 +27,7 @@
import httpx
from flydocs_sdk._version import __version__
-from flydocs_sdk.errors import FlydocsHTTPError, FlydocsTimeoutError
+from flydocs_sdk.errors import FlydocsHttpError, FlydocsTimeoutError
DEFAULT_USER_AGENT = f"flydocs-sdk-python/{__version__}"
@@ -35,20 +35,23 @@
def build_headers(
*,
extra: dict[str, str] | None,
+ api_key: str | None = None,
idempotency_key: str | None = None,
correlation_id: str | None = None,
) -> dict[str, str]:
"""Compose the headers a request will go out with.
- ``Accept`` and ``User-Agent`` are always set. The client's default
- headers (configured at construction time) layer underneath any
- per-call ``extra`` and the two well-known optional headers, which
- win.
+ ``Accept`` and ``User-Agent`` are always set. ``Authorization`` is
+ added when ``api_key`` is non-empty. The client's default headers
+ (configured at construction time) layer underneath any per-call
+ ``extra`` and the two well-known optional headers, which win.
"""
headers: dict[str, str] = {
"Accept": "application/json",
"User-Agent": DEFAULT_USER_AGENT,
}
+ if api_key:
+ headers["Authorization"] = f"Bearer {api_key}"
if extra:
for name, value in extra.items():
if value:
@@ -60,22 +63,26 @@ def build_headers(
return headers
-def decode_problem_detail(response: httpx.Response) -> FlydocsHTTPError:
- """Turn a non-2xx :class:`httpx.Response` into a typed :class:`FlydocsHTTPError`.
+def decode_problem_detail(response: httpx.Response) -> FlydocsHttpError:
+ """Turn a non-2xx :class:`httpx.Response` into a typed :class:`FlydocsHttpError`.
- The flydocs ``ExceptionAdvice`` always emits an RFC 7807-ish body
- with ``code`` / ``title`` / ``detail`` keys. We try hard to extract
- those, fall back to raw text when the body is not JSON, and never
- let a decode failure mask the underlying HTTP error.
+ The flydocs ``ExceptionAdvice`` emits an RFC 7807 body with
+ ``type`` / ``title`` / ``status`` / ``code`` / ``detail`` /
+ ``instance`` / ``extensions`` keys. FastAPI's ``HTTPException``
+ wrapper occasionally nests it under ``detail``; we walk both and
+ take the first match per field.
"""
code: str | None = None
title: str | None = None
detail: str | None = None
+ type_: str | None = None
+ instance: str | None = None
+ extensions: dict[str, Any] | None = None
payload: dict[str, Any] = {}
raw_text = ""
try:
raw_text = response.text
- except Exception: # noqa: BLE001 -- httpx can raise here for streaming responses
+ except Exception: # noqa: BLE001 -- httpx can raise for streaming responses
raw_text = ""
try:
data = response.json()
@@ -83,9 +90,6 @@ def decode_problem_detail(response: httpx.Response) -> FlydocsHTTPError:
data = None
if isinstance(data, dict):
payload = data
- # FastAPI's HTTPException wrapper nests the dict under ``detail``.
- # ExceptionAdvice in flydocs emits ``code`` at the top level OR
- # under ``detail`` -- handle both.
nested = data.get("detail") if isinstance(data.get("detail"), dict) else None
sources: tuple[dict[str, Any], ...] = (data, nested) if nested else (data,)
for src in sources:
@@ -95,11 +99,20 @@ def decode_problem_detail(response: httpx.Response) -> FlydocsHTTPError:
title = src["title"]
if detail is None and isinstance(src.get("detail"), str):
detail = src["detail"]
- return FlydocsHTTPError(
+ if type_ is None and isinstance(src.get("type"), str):
+ type_ = src["type"]
+ if instance is None and isinstance(src.get("instance"), str):
+ instance = src["instance"]
+ if extensions is None and isinstance(src.get("extensions"), dict):
+ extensions = src["extensions"]
+ return FlydocsHttpError(
status_code=response.status_code,
code=code,
title=title,
detail=detail,
+ type=type_,
+ instance=instance,
+ extensions=extensions,
payload=payload,
raw_text=raw_text,
)
@@ -109,8 +122,6 @@ def map_transport_error(exc: httpx.RequestError) -> Exception:
"""Translate an httpx transport failure into our typed hierarchy."""
if isinstance(exc, httpx.TimeoutException):
return FlydocsTimeoutError(str(exc) or "request timed out")
- # Importing FlydocsClientError lazily would buy nothing here -- the
- # symbol is already in the import chain via the package init.
from flydocs_sdk.errors import FlydocsClientError
return FlydocsClientError(str(exc) or type(exc).__name__)
diff --git a/sdks/python/src/flydocs_sdk/_version.py b/sdks/python/src/flydocs_sdk/_version.py
index 992987f..abe3665 100644
--- a/sdks/python/src/flydocs_sdk/_version.py
+++ b/sdks/python/src/flydocs_sdk/_version.py
@@ -19,4 +19,4 @@
imports and so the build pipeline can read it without parsing TOML.
"""
-__version__ = "26.5.2"
+__version__ = "26.6.0"
diff --git a/sdks/python/src/flydocs_sdk/async_client.py b/sdks/python/src/flydocs_sdk/async_client.py
index d1f3922..51e3659 100644
--- a/sdks/python/src/flydocs_sdk/async_client.py
+++ b/sdks/python/src/flydocs_sdk/async_client.py
@@ -14,42 +14,51 @@
"""Async client over httpx.
-The sync :class:`flydocs_sdk.FlydocsClient` wraps this class -- there
+The sync :class:`flydocs_sdk.client.Client` wraps this class -- there
is no separate sync implementation. Adding an endpoint means adding
one method here.
+
+API surface (mirrors the spec §12.1):
+
+* ``client.extract(req)`` -- POST /api/v1/extract
+* ``client.validate(req)`` -- POST /api/v1/extract:validate
+* ``client.extractions.create(req, idempotency_key=...)`` -- POST /api/v1/extractions
+* ``client.extractions.list(...)`` -- GET /api/v1/extractions
+* ``client.extractions.get(id)`` -- GET /api/v1/extractions/{id}
+* ``client.extractions.get_result(id, wait_for_bboxes=, timeout=)`` -- GET /api/v1/extractions/{id}/result
+* ``client.extractions.cancel(id)`` -- DELETE /api/v1/extractions/{id}
"""
from __future__ import annotations
+import asyncio
from datetime import datetime
from types import TracebackType
-from typing import Any
+from typing import Any, BinaryIO
import httpx
-from pydantic import BaseModel, TypeAdapter
+from pydantic import BaseModel
from flydocs_sdk._transport import build_headers, decode_problem_detail, map_transport_error
from flydocs_sdk.errors import FlydocsClientError
from flydocs_sdk.models import (
+ EventEnvelope,
+ Extraction,
+ ExtractionListResponse,
ExtractionRequest,
ExtractionResult,
- JobListResponse,
- JobResult,
- JobStatusResponse,
- SubmitJobRequest,
- SubmitJobResponse,
+ ExtractionResultEnvelope,
+ ExtractionStatus,
+ PostProcessingStatus,
+ SubmitExtractionRequest,
+ ValidationResponse,
VersionInfo,
)
DEFAULT_TIMEOUT_S = 60.0
-#: Statuses that mean the worker is done -- success or failure -- and
-#: the SDK's :meth:`AsyncFlydocsClient.wait_for_completion` polling
-#: loop can stop.
-TERMINAL_JOB_STATUSES = frozenset({"SUCCEEDED", "PARTIAL_SUCCEEDED", "FAILED", "CANCELLED"})
-
-class AsyncFlydocsClient:
+class AsyncClient:
"""Async client for the flydocs HTTP API.
Construct once per logical caller; the underlying
@@ -58,7 +67,7 @@ class AsyncFlydocsClient:
share a connection pool with the rest of your app -- the SDK will
not close transports it did not create.
- async with AsyncFlydocsClient("http://localhost:8400") as flydocs:
+ async with AsyncClient("http://localhost:8400") as flydocs:
result = await flydocs.extract(ExtractionRequest(...))
"""
@@ -66,12 +75,14 @@ def __init__(
self,
base_url: str,
*,
+ api_key: str | None = None,
timeout: float = DEFAULT_TIMEOUT_S,
default_headers: dict[str, str] | None = None,
transport: httpx.AsyncBaseTransport | None = None,
http_client: httpx.AsyncClient | None = None,
) -> None:
self._base_url = base_url.rstrip("/")
+ self._api_key = api_key
self._default_headers = dict(default_headers or {})
if http_client is not None:
self._http = http_client
@@ -83,12 +94,13 @@ def __init__(
transport=transport,
)
self._owns_http = True
+ self._extractions = AsyncExtractionsResource(self)
# ------------------------------------------------------------------
# Lifecycle
# ------------------------------------------------------------------
- async def __aenter__(self) -> AsyncFlydocsClient:
+ async def __aenter__(self) -> AsyncClient:
return self
async def __aexit__(
@@ -100,15 +112,12 @@ async def __aexit__(
await self.aclose()
async def aclose(self) -> None:
- """Close the underlying transport if the SDK owns it.
-
- Idempotent. Safe to call from cleanup hooks.
- """
+ """Close the underlying transport if the SDK owns it (idempotent)."""
if self._owns_http:
await self._http.aclose()
# ------------------------------------------------------------------
- # Identity / health
+ # Identity / health (kept for operational integrations)
# ------------------------------------------------------------------
async def version(self) -> VersionInfo:
@@ -132,171 +141,102 @@ async def health(self, probe: str = "readiness") -> dict[str, Any]:
# Sync extraction
# ------------------------------------------------------------------
- async def validate(self, request: ExtractionRequest | dict[str, Any]) -> dict[str, Any]:
+ async def validate(self, request: ExtractionRequest | dict[str, Any]) -> ValidationResponse:
"""``POST /api/v1/extract:validate`` -- dry-run the semantic validator.
- Always returns a dict with ``ok`` / ``error_count`` /
- ``warning_count`` / ``errors`` / ``warnings``. Never raises on
- validation failure -- that is a normal outcome of this
- endpoint.
+ Always returns 200 with a :class:`ValidationResponse` body.
+ Inspect ``response.ok`` to decide whether to submit.
"""
payload = _to_jsonable(request)
data = await self._request_json("POST", "/api/v1/extract:validate", json=payload)
- if not isinstance(data, dict):
- raise FlydocsClientError(f"unexpected validate response: {data!r}")
- return data
+ return ValidationResponse.model_validate(data)
async def extract(
self,
request: ExtractionRequest | dict[str, Any],
*,
+ files: list[BinaryIO] | None = None,
idempotency_key: str | None = None,
correlation_id: str | None = None,
) -> ExtractionResult:
"""``POST /api/v1/extract`` -- run the full pipeline synchronously.
- Raises :class:`FlydocsHTTPError` (status 408) when the service
- signals an extraction timeout; the caller should fall back to
- :meth:`submit_job` for long-running workloads.
+ When ``files`` is provided, the SDK switches to multipart upload:
+ the file binaries ride as ``files`` parts, the JSON body (with
+ ``files`` removed) rides under the ``request`` part. Otherwise
+ the JSON body is posted as ``application/json``.
+
+ Raises :class:`FlydocsHttpError(408, code='timeout')` when the
+ service signals an extraction timeout; fall back to
+ :meth:`AsyncExtractionsResource.create` for long-running workloads.
"""
- payload = _to_jsonable(request)
- data = await self._request_json(
- "POST",
+ return await self._post_for_result(
"/api/v1/extract",
- json=payload,
+ request=request,
+ files=files,
idempotency_key=idempotency_key,
correlation_id=correlation_id,
+ model_cls=ExtractionResult,
)
- return ExtractionResult.model_validate(data)
+
+ @property
+ def extractions(self) -> AsyncExtractionsResource:
+ """Sub-resource handle covering the async extraction endpoints."""
+ return self._extractions
# ------------------------------------------------------------------
- # Async-job lifecycle
+ # Convenience: webhook deserialization
# ------------------------------------------------------------------
- async def submit_job(
- self,
- request: SubmitJobRequest | dict[str, Any],
- *,
- idempotency_key: str | None = None,
- correlation_id: str | None = None,
- ) -> SubmitJobResponse:
- """``POST /api/v1/jobs`` -- enqueue an extraction job."""
- payload = _to_jsonable(request)
- data = await self._request_json(
- "POST",
- "/api/v1/jobs",
- json=payload,
- idempotency_key=idempotency_key,
- correlation_id=correlation_id,
- )
- return SubmitJobResponse.model_validate(data)
-
- async def get_job(self, job_id: str) -> JobStatusResponse:
- """``GET /api/v1/jobs/{job_id}`` -- read the current status."""
- data = await self._request_json("GET", f"/api/v1/jobs/{job_id}")
- return JobStatusResponse.model_validate(data)
+ @staticmethod
+ def parse_event(raw_body: bytes) -> EventEnvelope:
+ """Deserialise a raw webhook body into a typed :class:`EventEnvelope`.
- async def get_job_result(
- self,
- job_id: str,
- *,
- wait_for_bboxes: bool = False,
- timeout: float = 60.0,
- ) -> JobResult:
- """``GET /api/v1/jobs/{job_id}/result`` -- fetch the result.
-
- ``wait_for_bboxes=True`` long-polls until the bbox refiner
- finishes or ``timeout`` seconds elapse.
+ Convenience that pairs with :class:`flydocs_sdk.WebhookVerifier`.
"""
- params = {
- "wait_for_bboxes": "true" if wait_for_bboxes else "false",
- "timeout": str(timeout),
- }
- data = await self._request_json("GET", f"/api/v1/jobs/{job_id}/result", params=params)
- return JobResult.model_validate(data)
+ return EventEnvelope.model_validate_json(raw_body)
- async def list_jobs(
- self,
- *,
- status: list[str] | str | None = None,
- bbox_refine_status: list[str] | str | None = None,
- idempotency_key: str | None = None,
- created_after: datetime | str | None = None,
- created_before: datetime | str | None = None,
- limit: int = 50,
- offset: int = 0,
- ) -> JobListResponse:
- """``GET /api/v1/jobs`` -- paginated listing with filters.
-
- ``status`` and ``bbox_refine_status`` accept either a single
- string or a list -- a list is joined with commas to match the
- controller's CSV decoding.
- """
- params: dict[str, str] = {"limit": str(limit), "offset": str(offset)}
- if status is not None:
- params["status"] = ",".join(status) if isinstance(status, list) else status
- if bbox_refine_status is not None:
- params["bbox_refine_status"] = (
- ",".join(bbox_refine_status) if isinstance(bbox_refine_status, list) else bbox_refine_status
- )
- if idempotency_key:
- params["idempotency_key"] = idempotency_key
- if created_after is not None:
- params["created_after"] = _to_iso(created_after)
- if created_before is not None:
- params["created_before"] = _to_iso(created_before)
- data = await self._request_json("GET", "/api/v1/jobs", params=params)
- return JobListResponse.model_validate(data)
+ # ------------------------------------------------------------------
+ # Polling helper
+ # ------------------------------------------------------------------
async def wait_for_completion(
self,
- job_id: str,
+ extraction_id: str,
*,
poll_interval: float = 2.0,
timeout: float = 600.0,
- ) -> JobStatusResponse:
- """Poll a job until it reaches a terminal status, then return.
+ ) -> Extraction:
+ """Poll an extraction until it reaches a terminal status, then return.
Waits at most ``timeout`` seconds, polling every
- ``poll_interval`` seconds. Returns the final
- :class:`JobStatusResponse` whether the job succeeded or failed
- -- inspect ``.status`` to decide what to do next. Raises
- :class:`TimeoutError` if the deadline elapses before the
- worker finishes.
-
- async with AsyncFlydocsClient("http://localhost:8400") as flydocs:
- submit = await flydocs.submit_job(req)
- final = await flydocs.wait_for_completion(submit.job_id)
- if final.status == JobStatus.SUCCEEDED:
- result = await flydocs.get_job_result(submit.job_id)
+ ``poll_interval`` seconds. Returns the final :class:`Extraction`
+ whether it succeeded, failed, or was cancelled. Raises
+ :class:`TimeoutError` if the deadline elapses while the worker
+ is still in flight.
+
+ async with AsyncClient("http://localhost:8400") as flydocs:
+ ext = await flydocs.extractions.create(req)
+ final = await flydocs.wait_for_completion(ext.id)
+ if final.status == ExtractionStatus.SUCCEEDED:
+ envelope = await flydocs.extractions.get_result(ext.id)
"""
- import asyncio
-
loop = asyncio.get_event_loop()
deadline = loop.time() + max(0.0, float(timeout))
while True:
- status = await self.get_job(job_id)
- if str(status.status) in TERMINAL_JOB_STATUSES:
+ status = await self.extractions.get(extraction_id)
+ if status.status.is_terminal:
return status
remaining = deadline - loop.time()
if remaining <= 0:
raise TimeoutError(
- f"job {job_id!r} did not reach a terminal status within "
+ f"extraction {extraction_id!r} did not reach a terminal status within "
f"{timeout}s (last status: {status.status!s})"
)
await asyncio.sleep(min(poll_interval, max(remaining, 0.01)))
- async def cancel_job(self, job_id: str) -> JobStatusResponse:
- """``DELETE /api/v1/jobs/{job_id}`` -- cancel a queued job.
-
- Raises :class:`FlydocsHTTPError(409, code='job_not_cancellable')`
- once the worker has started the job.
- """
- data = await self._request_json("DELETE", f"/api/v1/jobs/{job_id}")
- return JobStatusResponse.model_validate(data)
-
# ------------------------------------------------------------------
- # Low-level transport
+ # Internal: low-level transport
# ------------------------------------------------------------------
async def _request_json(
@@ -311,6 +251,7 @@ async def _request_json(
) -> Any:
headers = build_headers(
extra=self._default_headers,
+ api_key=self._api_key,
idempotency_key=idempotency_key,
correlation_id=correlation_id,
)
@@ -335,20 +276,199 @@ async def _request_json(
f"expected JSON response, got {response.headers.get('content-type', 'unknown')}"
) from exc
+ async def _request_multipart_json(
+ self,
+ method: str,
+ path: str,
+ *,
+ json_part: dict[str, Any],
+ files: list[BinaryIO],
+ idempotency_key: str | None = None,
+ correlation_id: str | None = None,
+ ) -> Any:
+ import json as _json
+
+ headers = build_headers(
+ extra=self._default_headers,
+ api_key=self._api_key,
+ idempotency_key=idempotency_key,
+ correlation_id=correlation_id,
+ )
+ multipart: list[tuple[str, tuple[str | None, Any, str]]] = [
+ ("request", (None, _json.dumps(json_part), "application/json"))
+ ]
+ for f in files:
+ name = getattr(f, "name", "upload")
+ multipart.append(("files", (str(name).rsplit("/", 1)[-1], f, "application/octet-stream")))
+ try:
+ response = await self._http.request(
+ method,
+ path,
+ files=multipart,
+ headers=headers,
+ )
+ except httpx.RequestError as exc:
+ raise map_transport_error(exc) from exc
+ if response.status_code >= 400:
+ raise decode_problem_detail(response)
+ if response.status_code == 204 or not response.content:
+ return None
+ try:
+ return response.json()
+ except ValueError as exc:
+ raise FlydocsClientError(
+ f"expected JSON response, got {response.headers.get('content-type', 'unknown')}"
+ ) from exc
+
+ async def _post_for_result(
+ self,
+ path: str,
+ *,
+ request: BaseModel | dict[str, Any],
+ files: list[BinaryIO] | None,
+ idempotency_key: str | None,
+ correlation_id: str | None,
+ model_cls: type[BaseModel],
+ ) -> Any:
+ payload = _to_jsonable(request)
+ if files:
+ # Strip ``files`` from the JSON part so the binaries don't show up
+ # twice on the wire; the multipart ``files`` field carries the bytes.
+ payload = {k: v for k, v in payload.items() if k != "files"}
+ data = await self._request_multipart_json(
+ "POST",
+ path,
+ json_part=payload,
+ files=files,
+ idempotency_key=idempotency_key,
+ correlation_id=correlation_id,
+ )
+ else:
+ data = await self._request_json(
+ "POST",
+ path,
+ json=payload,
+ idempotency_key=idempotency_key,
+ correlation_id=correlation_id,
+ )
+ return model_cls.model_validate(data)
+
+
+# ---------------------------------------------------------------------------
+# Sub-resource: extractions
+# ---------------------------------------------------------------------------
+
+
+class AsyncExtractionsResource:
+ """Async sub-resource for the ``/api/v1/extractions`` endpoint family."""
+
+ def __init__(self, client: AsyncClient) -> None:
+ self._client = client
+
+ async def create(
+ self,
+ request: SubmitExtractionRequest | dict[str, Any],
+ *,
+ files: list[BinaryIO] | None = None,
+ idempotency_key: str | None = None,
+ correlation_id: str | None = None,
+ ) -> Extraction:
+ """``POST /api/v1/extractions`` -- enqueue an extraction.
+
+ Same multipart semantics as :meth:`AsyncClient.extract` when
+ ``files`` is non-empty.
+ """
+ return await self._client._post_for_result(
+ "/api/v1/extractions",
+ request=request,
+ files=files,
+ idempotency_key=idempotency_key,
+ correlation_id=correlation_id,
+ model_cls=Extraction,
+ )
+
+ async def get(self, extraction_id: str) -> Extraction:
+ """``GET /api/v1/extractions/{id}`` -- read the current status."""
+ data = await self._client._request_json("GET", f"/api/v1/extractions/{extraction_id}")
+ return Extraction.model_validate(data)
+
+ async def get_result(
+ self,
+ extraction_id: str,
+ *,
+ wait_for_bboxes: bool = False,
+ timeout: float = 60.0,
+ ) -> ExtractionResultEnvelope:
+ """``GET /api/v1/extractions/{id}/result`` -- fetch the result envelope.
+
+ ``wait_for_bboxes=True`` long-polls until the bbox refiner
+ finishes or ``timeout`` seconds elapse. The server's query
+ parameter name is ``wait_for_post_processing`` (the bbox leg is
+ the only post-processing case today); the SDK keeps the more
+ intuitive ``wait_for_bboxes`` kwarg.
+ """
+ params = {
+ "wait_for_post_processing": "true" if wait_for_bboxes else "false",
+ "timeout": str(timeout),
+ }
+ data = await self._client._request_json(
+ "GET",
+ f"/api/v1/extractions/{extraction_id}/result",
+ params=params,
+ )
+ return ExtractionResultEnvelope.model_validate(data)
+
+ async def cancel(self, extraction_id: str) -> Extraction:
+ """``DELETE /api/v1/extractions/{id}`` -- cancel a queued extraction.
+
+ Raises :class:`FlydocsHttpError(409, code='not_cancellable')`
+ once the worker has started the extraction.
+ """
+ data = await self._client._request_json("DELETE", f"/api/v1/extractions/{extraction_id}")
+ return Extraction.model_validate(data)
+
+ async def list(
+ self,
+ *,
+ status: list[ExtractionStatus | str] | ExtractionStatus | str | None = None,
+ post_processing_status: (list[PostProcessingStatus | str] | PostProcessingStatus | str | None) = None,
+ idempotency_key: str | None = None,
+ created_after: datetime | str | None = None,
+ created_before: datetime | str | None = None,
+ limit: int = 50,
+ offset: int = 0,
+ ) -> ExtractionListResponse:
+ """``GET /api/v1/extractions`` -- paginated listing with filters.
+
+ ``status`` and ``post_processing_status`` accept either a single
+ value or a list; a list is joined with commas to match the
+ controller's CSV decoding.
+ """
+ params: dict[str, str] = {"limit": str(limit), "offset": str(offset)}
+ if status is not None:
+ params["status"] = _csv(status)
+ if post_processing_status is not None:
+ params["post_processing_status"] = _csv(post_processing_status)
+ if idempotency_key:
+ params["idempotency_key"] = idempotency_key
+ if created_after is not None:
+ params["created_after"] = _to_iso(created_after)
+ if created_before is not None:
+ params["created_before"] = _to_iso(created_before)
+ data = await self._client._request_json("GET", "/api/v1/extractions", params=params)
+ return ExtractionListResponse.model_validate(data)
+
# ---------------------------------------------------------------------------
# Module helpers
# ---------------------------------------------------------------------------
-def _to_jsonable(value: BaseModel | dict[str, Any]) -> Any:
- """Pydantic instance -> dict via JSON serialisation (preserves field aliases)."""
+def _to_jsonable(value: BaseModel | dict[str, Any]) -> dict[str, Any]:
+ """Pydantic instance -> dict via JSON serialisation (preserves aliases)."""
if isinstance(value, BaseModel):
- # mode="json" makes UUID / datetime serialise to their JSON form
- # instead of leaving them as Python objects, which httpx would
- # then have to handle via its default encoder.
- return value.model_dump(mode="json", by_alias=True)
- return value
+ return value.model_dump(mode="json", by_alias=True, exclude_none=False)
+ return dict(value)
def _to_iso(value: datetime | str) -> str:
@@ -357,8 +477,10 @@ def _to_iso(value: datetime | str) -> str:
return value
-# ``TypeAdapter`` is only imported above so that downstream type-checkers
-# can resolve it when this module is read in isolation; nothing in the
-# class body uses it at runtime.
-__all__ = ["AsyncFlydocsClient", "DEFAULT_TIMEOUT_S"]
-_ = TypeAdapter # noqa: F841 -- silence unused-import in older linters
+def _csv(value: list[Any] | Any) -> str:
+ if isinstance(value, list | tuple):
+ return ",".join(str(v) for v in value)
+ return str(value)
+
+
+__all__ = ["AsyncClient", "AsyncExtractionsResource", "DEFAULT_TIMEOUT_S"]
diff --git a/sdks/python/src/flydocs_sdk/client.py b/sdks/python/src/flydocs_sdk/client.py
index bdde0ef..9697014 100644
--- a/sdks/python/src/flydocs_sdk/client.py
+++ b/sdks/python/src/flydocs_sdk/client.py
@@ -14,18 +14,10 @@
"""Synchronous client.
-A thin wrapper around :class:`flydocs_sdk.AsyncFlydocsClient` that
-drives it with a per-instance asyncio event loop. The async client is
-the source of truth for endpoint signatures; this class only delegates
-and exposes a sync surface.
-
-Why this design rather than two parallel implementations:
-
-* The endpoint table grows; keeping it in one place means new endpoints
- show up in both APIs the moment they ship.
-* httpx already supports both flavours, but each ``httpx.Client``
- carries its own connection pool. Sharing one async client + one
- dedicated loop is cheaper than juggling two transports.
+A thin wrapper around :class:`flydocs_sdk.AsyncClient` that drives it
+with a per-instance asyncio event loop. The async client is the source
+of truth for endpoint signatures; this class only delegates and
+exposes a sync surface.
Caveat: instances are not safe to share across threads. Construct one
per thread (or use the async client directly if you are already
@@ -39,28 +31,34 @@
import threading
from datetime import datetime
from types import TracebackType
-from typing import Any
+from typing import Any, BinaryIO
import httpx
-from flydocs_sdk.async_client import DEFAULT_TIMEOUT_S, AsyncFlydocsClient
+from flydocs_sdk.async_client import (
+ DEFAULT_TIMEOUT_S,
+ AsyncClient,
+ AsyncExtractionsResource,
+)
from flydocs_sdk.models import (
+ Extraction,
+ ExtractionListResponse,
ExtractionRequest,
ExtractionResult,
- JobListResponse,
- JobResult,
- JobStatusResponse,
- SubmitJobRequest,
- SubmitJobResponse,
+ ExtractionResultEnvelope,
+ ExtractionStatus,
+ PostProcessingStatus,
+ SubmitExtractionRequest,
+ ValidationResponse,
VersionInfo,
)
-class FlydocsClient:
- """Synchronous client over the same endpoint set as :class:`AsyncFlydocsClient`.
+class Client:
+ """Synchronous client over the same endpoint set as :class:`AsyncClient`.
- with FlydocsClient("http://localhost:8400") as flydocs:
- print(flydocs.version().version)
+ with Client("http://localhost:8400") as flydocs:
+ result = flydocs.extract(request)
Calling :meth:`close` (or using the context manager) shuts the
background event loop down cleanly. After ``close()`` the instance
@@ -71,25 +69,28 @@ def __init__(
self,
base_url: str,
*,
+ api_key: str | None = None,
timeout: float = DEFAULT_TIMEOUT_S,
default_headers: dict[str, str] | None = None,
transport: httpx.AsyncBaseTransport | None = None,
) -> None:
self._loop = asyncio.new_event_loop()
self._lock = threading.Lock()
- self._inner = AsyncFlydocsClient(
+ self._inner = AsyncClient(
base_url,
+ api_key=api_key,
timeout=timeout,
default_headers=default_headers,
transport=transport,
)
self._closed = False
+ self._extractions = ExtractionsResource(self)
# ------------------------------------------------------------------
# Lifecycle
# ------------------------------------------------------------------
- def __enter__(self) -> FlydocsClient:
+ def __enter__(self) -> Client:
return self
def __exit__(
@@ -101,10 +102,7 @@ def __exit__(
self.close()
def close(self) -> None:
- """Close the underlying transport and tear down the event loop.
-
- Idempotent. After close the instance must not be used.
- """
+ """Close the underlying transport and tear down the event loop."""
if self._closed:
return
self._closed = True
@@ -118,75 +116,138 @@ def __del__(self) -> None: # pragma: no cover -- best-effort cleanup
self.close()
# ------------------------------------------------------------------
- # Public API mirror
+ # Public API
# ------------------------------------------------------------------
def version(self) -> VersionInfo:
+ """``GET /api/v1/version``."""
return self._run(self._inner.version())
def health(self, probe: str = "readiness") -> dict[str, Any]:
+ """``GET /actuator/health/{probe}``."""
return self._run(self._inner.health(probe))
- def validate(self, request: ExtractionRequest | dict[str, Any]) -> dict[str, Any]:
+ def validate(self, request: ExtractionRequest | dict[str, Any]) -> ValidationResponse:
+ """``POST /api/v1/extract:validate`` -- dry-run the semantic validator."""
return self._run(self._inner.validate(request))
def extract(
self,
request: ExtractionRequest | dict[str, Any],
*,
+ files: list[BinaryIO] | None = None,
idempotency_key: str | None = None,
correlation_id: str | None = None,
) -> ExtractionResult:
+ """``POST /api/v1/extract`` -- run the full pipeline synchronously.
+
+ Pass ``files=[...]`` to switch to multipart upload; see
+ :meth:`AsyncClient.extract` for the contract.
+ """
return self._run(
self._inner.extract(
request,
+ files=files,
idempotency_key=idempotency_key,
correlation_id=correlation_id,
)
)
- def submit_job(
+ @property
+ def extractions(self) -> ExtractionsResource:
+ """Sub-resource handle covering the async extraction endpoints."""
+ return self._extractions
+
+ def wait_for_completion(
self,
- request: SubmitJobRequest | dict[str, Any],
+ extraction_id: str,
*,
+ poll_interval: float = 2.0,
+ timeout: float = 600.0,
+ ) -> Extraction:
+ """Synchronous wrapper around :meth:`AsyncClient.wait_for_completion`."""
+ return self._run(
+ self._inner.wait_for_completion(
+ extraction_id,
+ poll_interval=poll_interval,
+ timeout=timeout,
+ )
+ )
+
+ # ------------------------------------------------------------------
+ # Internal: drive coroutines on the dedicated loop
+ # ------------------------------------------------------------------
+
+ def _run(self, coro: Any) -> Any:
+ if self._closed:
+ raise RuntimeError("Client is closed; construct a new instance")
+ with self._lock:
+ return self._loop.run_until_complete(coro)
+
+ @property
+ def _async_extractions(self) -> AsyncExtractionsResource:
+ return self._inner.extractions
+
+
+class ExtractionsResource:
+ """Synchronous sub-resource for ``/api/v1/extractions``."""
+
+ def __init__(self, client: Client) -> None:
+ self._client = client
+
+ def create(
+ self,
+ request: SubmitExtractionRequest | dict[str, Any],
+ *,
+ files: list[BinaryIO] | None = None,
idempotency_key: str | None = None,
correlation_id: str | None = None,
- ) -> SubmitJobResponse:
- return self._run(
- self._inner.submit_job(
+ ) -> Extraction:
+ return self._client._run(
+ self._client._async_extractions.create(
request,
+ files=files,
idempotency_key=idempotency_key,
correlation_id=correlation_id,
)
)
- def get_job(self, job_id: str) -> JobStatusResponse:
- return self._run(self._inner.get_job(job_id))
+ def get(self, extraction_id: str) -> Extraction:
+ return self._client._run(self._client._async_extractions.get(extraction_id))
- def get_job_result(
+ def get_result(
self,
- job_id: str,
+ extraction_id: str,
*,
wait_for_bboxes: bool = False,
timeout: float = 60.0,
- ) -> JobResult:
- return self._run(self._inner.get_job_result(job_id, wait_for_bboxes=wait_for_bboxes, timeout=timeout))
+ ) -> ExtractionResultEnvelope:
+ return self._client._run(
+ self._client._async_extractions.get_result(
+ extraction_id,
+ wait_for_bboxes=wait_for_bboxes,
+ timeout=timeout,
+ )
+ )
+
+ def cancel(self, extraction_id: str) -> Extraction:
+ return self._client._run(self._client._async_extractions.cancel(extraction_id))
- def list_jobs(
+ def list(
self,
*,
- status: list[str] | str | None = None,
- bbox_refine_status: list[str] | str | None = None,
+ status: list[ExtractionStatus | str] | ExtractionStatus | str | None = None,
+ post_processing_status: (list[PostProcessingStatus | str] | PostProcessingStatus | str | None) = None,
idempotency_key: str | None = None,
created_after: datetime | str | None = None,
created_before: datetime | str | None = None,
limit: int = 50,
offset: int = 0,
- ) -> JobListResponse:
- return self._run(
- self._inner.list_jobs(
+ ) -> ExtractionListResponse:
+ return self._client._run(
+ self._client._async_extractions.list(
status=status,
- bbox_refine_status=bbox_refine_status,
+ post_processing_status=post_processing_status,
idempotency_key=idempotency_key,
created_after=created_after,
created_before=created_before,
@@ -195,31 +256,5 @@ def list_jobs(
)
)
- def cancel_job(self, job_id: str) -> JobStatusResponse:
- return self._run(self._inner.cancel_job(job_id))
- def wait_for_completion(
- self,
- job_id: str,
- *,
- poll_interval: float = 2.0,
- timeout: float = 600.0,
- ) -> JobStatusResponse:
- """Synchronous wrapper around :meth:`AsyncFlydocsClient.wait_for_completion`."""
- return self._run(
- self._inner.wait_for_completion(
- job_id,
- poll_interval=poll_interval,
- timeout=timeout,
- )
- )
-
- # ------------------------------------------------------------------
- # Internal: drive coroutines on the dedicated loop
- # ------------------------------------------------------------------
-
- def _run(self, coro: Any) -> Any:
- if self._closed:
- raise RuntimeError("FlydocsClient is closed; construct a new instance")
- with self._lock:
- return self._loop.run_until_complete(coro)
+__all__ = ["Client", "ExtractionsResource"]
diff --git a/sdks/python/src/flydocs_sdk/errors.py b/sdks/python/src/flydocs_sdk/errors.py
index 87306e3..7a161b3 100644
--- a/sdks/python/src/flydocs_sdk/errors.py
+++ b/sdks/python/src/flydocs_sdk/errors.py
@@ -12,34 +12,42 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-"""Typed exception hierarchy.
+"""Typed exception hierarchy + RFC 7807 :class:`ProblemDetails` model.
Every error the SDK raises subclasses :class:`FlydocsError`. We split
along two axes the caller actually wants to branch on:
-* **Transport vs. application** -- ``FlydocsClientError`` (timeouts,
- network) vs. ``FlydocsHTTPError`` (the service answered with a 4xx/5xx).
-* **Problem code** -- the RFC 7807-ish ``code`` field that the service's
- ``ExceptionAdvice`` puts on every error response (``extraction_timeout``,
- ``document_too_large``, ``invalid_base64``, ``invalid_request``,
- ``job_not_ready``, ``job_not_cancellable``, ``JOB_NOT_FOUND``, ...).
-
-``FlydocsAPIError`` is an alias for :class:`FlydocsHTTPError` kept for
-readability at call-sites where the caller wants to discriminate on the
-``code``.
+* **Transport vs. application** -- :class:`FlydocsClientError` (timeouts,
+ network) vs. :class:`FlydocsHttpError` (the service answered with a
+ 4xx/5xx).
+* **Problem code** -- the RFC 7807 ``code`` field on the response body.
+ In v1 the server emits ``not_found``, ``not_ready``, ``not_cancellable``,
+ ``timeout``, ``file_too_large``, ``unsupported_file``,
+ ``validation_failed``, ``invalid_base64``, ``invalid_request``,
+ ``encrypted_pdf``, ``office_conversion_failed``,
+ ``archive_extraction_failed``, ``image_conversion_failed``,
+ ``unauthorized``. The SDK doesn't pin to that set; it just exposes
+ whatever the server sends as :attr:`FlydocsHttpError.code`.
+
+:class:`FlydocsHttpError` carries every field of :class:`ProblemDetails`
+(``type``, ``title``, ``status``, ``code``, ``detail``, ``instance``,
+``extensions``) plus the raw response text for forensic debugging.
"""
from __future__ import annotations
from typing import Any
+from pydantic import BaseModel, ConfigDict
+from pydantic import Field as _F # noqa: N814 -- private alias avoids shadowing
+
class FlydocsError(Exception):
"""Root exception for everything this SDK raises."""
class FlydocsClientError(FlydocsError):
- """Transport-level failure: network unreachable, DNS, connect timeout.
+ """Transport-level failure: network unreachable, DNS, connect failure.
The HTTP request never completed in a way the service could answer.
Retrying with backoff is almost always the right move.
@@ -49,18 +57,49 @@ class FlydocsClientError(FlydocsError):
class FlydocsTimeoutError(FlydocsClientError):
"""The HTTP request exceeded the configured timeout.
- Distinct from :class:`FlydocsHTTPError(408)` -- the latter is the
+ Distinct from :class:`FlydocsHttpError(408)` -- the latter is the
service telling the SDK that the extraction pipeline itself timed
out and the caller should retry through the async API. This one
means the HTTP request did not complete on the wire.
"""
-class FlydocsHTTPError(FlydocsError):
+class ProblemDetails(BaseModel):
+ """RFC 7807 ``application/problem+json`` body.
+
+ Mirrors :class:`flydocs.interfaces.dtos.error.ProblemDetails`. The
+ ``extra="allow"`` setting lets callers read any forward-compatible
+ fields the service might add.
+ """
+
+ model_config = ConfigDict(extra="allow", populate_by_name=True)
+
+ type: str = "about:blank"
+ title: str = ""
+ status: int = 0
+ detail: str | None = None
+ instance: str | None = None
+ code: str | None = None
+ extensions: dict[str, Any] | None = _F(default=None)
+
+
+class FlydocsHttpError(FlydocsError):
"""The service returned a non-2xx response.
- Holds the HTTP status, the parsed RFC 7807 problem-detail body when
- one was returned, and the raw response text as a fallback.
+ Carries the HTTP status, the parsed RFC 7807 problem-detail body
+ (when one was returned), and the raw response text as a fallback.
+
+ Exposed attributes mirror :class:`ProblemDetails`:
+
+ * :attr:`status_code` -- the HTTP status (int).
+ * :attr:`code` -- the application error code (snake_case string), or empty.
+ * :attr:`title` -- short human-readable summary, or empty.
+ * :attr:`detail` -- longer human-readable explanation, or empty.
+ * :attr:`type` -- problem-type URI, defaulting to ``about:blank``.
+ * :attr:`instance` -- problem-instance URI, when the server sets one.
+ * :attr:`extensions` -- per-occurrence extension dict, when present.
+ * :attr:`payload` -- the full raw decoded JSON body (or ``{}``).
+ * :attr:`raw_text` -- the response text (for non-JSON bodies).
"""
def __init__(
@@ -70,6 +109,9 @@ def __init__(
code: str | None = None,
title: str | None = None,
detail: str | None = None,
+ type: str | None = None,
+ instance: str | None = None,
+ extensions: dict[str, Any] | None = None,
payload: dict[str, Any] | None = None,
raw_text: str = "",
) -> None:
@@ -77,6 +119,9 @@ def __init__(
self.code = code or ""
self.title = title or ""
self.detail = detail or ""
+ self.type = type or "about:blank"
+ self.instance = instance or ""
+ self.extensions = extensions or {}
self.payload = payload or {}
self.raw_text = raw_text
message = f"HTTP {status_code}"
@@ -88,6 +133,22 @@ def __init__(
message += f": {self.title}"
super().__init__(message)
-
-# Readability alias -- some call-sites prefer the "API" framing.
-FlydocsAPIError = FlydocsHTTPError
+ def as_problem_details(self) -> ProblemDetails:
+ """Return the typed :class:`ProblemDetails` view of this error."""
+ return ProblemDetails(
+ type=self.type,
+ title=self.title,
+ status=self.status_code,
+ detail=self.detail or None,
+ instance=self.instance or None,
+ code=self.code or None,
+ extensions=self.extensions or None,
+ )
+
+
+# Legacy aliases kept so v0 callers' ``except FlydocsHTTPError`` / ``except
+# FlydocsAPIError`` lines do not break. The canonical class name in v1 is
+# :class:`FlydocsHttpError` (lowercase ``ttp``) to match standard Python
+# naming style and the docs.
+FlydocsHTTPError = FlydocsHttpError
+FlydocsAPIError = FlydocsHttpError
diff --git a/sdks/python/src/flydocs_sdk/models.py b/sdks/python/src/flydocs_sdk/models.py
index a7c7ae1..07e675c 100644
--- a/sdks/python/src/flydocs_sdk/models.py
+++ b/sdks/python/src/flydocs_sdk/models.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-"""Wire-level Pydantic models for the flydocs HTTP API.
+"""Wire-level Pydantic models for the flydocs v1 HTTP API.
The SDK ships its own copies of the request / response shapes rather
than re-exporting :mod:`flydocs.interfaces` from the service package.
@@ -27,26 +27,23 @@
new fields out of ``model_extra`` even when the SDK has no typed
attribute for them.
-For deeply-nested shapes that vary by caller schema -- e.g. the inside
-of ``DocSpec``, ``ExtractedFieldGroup``, ``RuleSpec``,
-``Transformation`` -- the SDK keeps them as opaque dicts. Callers that
-want fully-typed traversal should depend on the ``flydocs`` service
-package directly and use its ``flydocs.interfaces.dtos`` modules; the
-SDK is intentionally lighter.
+The v1 contract is snake_case everywhere on the wire. This module
+mirrors :mod:`flydocs.interfaces.dtos` on the service side, but every
+model declares ``ConfigDict(extra="allow", populate_by_name=True)`` for
+forward compatibility.
"""
from __future__ import annotations
import base64
import uuid
-from datetime import datetime
+from datetime import UTC, datetime
from enum import StrEnum
from pathlib import Path
-from typing import Any
+from typing import Annotated, Any, Literal
-from pydantic import BaseModel, ConfigDict, Field, field_validator
-
-from flydocs_sdk.request import DocSpec, ExtractionOptions, RuleSpec
+from pydantic import BaseModel, ConfigDict, field_validator
+from pydantic import Field as _F # noqa: N814 -- private alias keeps pydantic.Field unshadowed
# ---------------------------------------------------------------------------
# Permissive base
@@ -59,9 +56,7 @@ class _WireBase(BaseModel):
* ``extra="allow"`` -- the SDK doesn't have to be updated in
lockstep with new fields the service starts emitting.
* ``populate_by_name=True`` -- callers can construct models with
- either Python-style snake_case or the JSON alias the service uses
- (some fields like ``fieldName`` / ``fieldValueFound`` are
- camelCase on the wire).
+ either Python-style snake_case or any explicit JSON alias.
"""
model_config = ConfigDict(extra="allow", populate_by_name=True)
@@ -72,47 +67,441 @@ class _WireBase(BaseModel):
# ---------------------------------------------------------------------------
-class JobStatus(StrEnum):
- """Lifecycle states a job can occupy.
+class ExtractionStatus(StrEnum):
+ """Main lifecycle states for an async extraction job.
+
+ Mirrors :class:`flydocs.interfaces.enums.extraction_status.ExtractionStatus`.
+ Values are lowercase snake_case in v1 (``queued``, ``running``, ...).
+ """
+
+ QUEUED = "queued"
+ RUNNING = "running"
+ SUCCEEDED = "succeeded"
+ FAILED = "failed"
+ CANCELLED = "cancelled"
+
+ @property
+ def is_terminal(self) -> bool:
+ """True when no further state transition is expected."""
+ return self in (
+ ExtractionStatus.SUCCEEDED,
+ ExtractionStatus.FAILED,
+ ExtractionStatus.CANCELLED,
+ )
+
+
+class PostProcessingStatus(StrEnum):
+ """Sub-state for additive post-processing legs (bbox refinement today)."""
+
+ PENDING = "pending"
+ RUNNING = "running"
+ SUCCEEDED = "succeeded"
+ FAILED = "failed"
+
+ @property
+ def is_terminal(self) -> bool:
+ return self in (PostProcessingStatus.SUCCEEDED, PostProcessingStatus.FAILED)
+
+
+class FieldType(StrEnum):
+ """JSON-Schema-aligned primitive set for the public Field model."""
+
+ STRING = "string"
+ NUMBER = "number"
+ INTEGER = "integer"
+ BOOLEAN = "boolean"
+ ARRAY = "array"
+ OBJECT = "object"
+
+
+class StandardFormat(StrEnum):
+ """Standard format hints applied to typed field values at validation time."""
+
+ DATE = "date"
+ DATE_TIME = "date-time"
+ TIME = "time"
+ EMAIL = "email"
+ URI = "uri"
+ UUID = "uuid"
+ CURRENCY = "currency"
+
- Mirrors :class:`flydocs.interfaces.enums.job_status.JobStatus`. We
- keep this as :class:`StrEnum` so unknown future values from the
- service still serialise/deserialise as strings instead of failing
- parsing -- :class:`_WireBase` extras pick those up.
+class ValidatorType(StrEnum):
+ """Built-in validator catalogue applied to extracted field values.
+
+ Mirrors :class:`flydocs.interfaces.enums.validator.ValidatorType`.
"""
- QUEUED = "QUEUED"
- RUNNING = "RUNNING"
- SUCCEEDED = "SUCCEEDED"
- PARTIAL_SUCCEEDED = "PARTIAL_SUCCEEDED"
- REFINING_BBOXES = "REFINING_BBOXES"
- FAILED = "FAILED"
- CANCELLED = "CANCELLED"
+ # Network / web
+ EMAIL = "email"
+ URI = "uri"
+ URL = "url"
+ DOMAIN = "domain"
+ SLUG = "slug"
+ IPV4 = "ipv4"
+ IPV6 = "ipv6"
+
+ # Temporal
+ DATE = "date"
+ DATETIME = "datetime"
+ TIME = "time"
+ ISO_8601 = "iso_8601"
+
+ # Identifiers
+ UUID = "uuid"
+ JSON = "json"
+ HEX_COLOR = "hex_color"
+
+ # Finance
+ IBAN = "iban"
+ BIC = "bic"
+ CREDIT_CARD = "credit_card"
+ CURRENCY_CODE = "currency_code"
+ AMOUNT = "amount"
+
+ # Telephony
+ PHONE_E164 = "phone_e164"
+
+ # Geographic
+ COUNTRY_CODE = "country_code"
+ LANGUAGE_CODE = "language_code"
+ POSTAL_CODE = "postal_code"
+ LATITUDE = "latitude"
+ LONGITUDE = "longitude"
+
+ # National IDs
+ NIF = "nif"
+ NIE = "nie"
+ CIF = "cif"
+ VAT_ID = "vat_id"
+ SSN = "ssn"
+ PASSPORT_NUMBER = "passport_number"
+
+
+class ValidationRule(StrEnum):
+ """Which validation check produced a given error."""
+
+ TYPE = "type"
+ PATTERN = "pattern"
+ FORMAT = "format"
+ ENUM = "enum"
+ MINIMUM = "minimum"
+ MAXIMUM = "maximum"
+ VALIDATOR = "validator"
+
+
+class JudgeStatus(StrEnum):
+ PASS = "pass"
+ FAIL = "fail"
+ UNCERTAIN = "uncertain"
+
+
+class ContentIntegrityStatus(StrEnum):
+ VALID = "valid"
+ INVALID = "invalid"
+ UNCERTAIN = "uncertain"
+
+
+class CheckStatus(StrEnum):
+ PASS = "pass"
+ FAIL = "fail"
+ UNCERTAIN = "uncertain"
+
+
+class BboxQuality(StrEnum):
+ """Coarse-grained verdict on whether a bbox is trustworthy."""
+
+ GOOD = "good"
+ POOR = "poor"
+ SUSPICIOUS = "suspicious"
+ INVALID = "invalid"
+
+
+class BboxSource(StrEnum):
+ """How the coordinates on this bbox were produced."""
+
+ LLM = "llm"
+ PDF_TEXT = "pdf_text"
+ OCR = "ocr"
+
+
+class TransformationScope(StrEnum):
+ """Whether a transformation applies per-document or across the whole request."""
+
+ TASK = "task"
+ REQUEST = "request"
+
+
+# ---------------------------------------------------------------------------
+# Bounding box
+# ---------------------------------------------------------------------------
+
+
+class BoundingBox(_WireBase):
+ """Normalised rectangle on a single page (coordinates in [0, 1])."""
+
+ xmin: float = _F(..., ge=0.0, le=1.0)
+ ymin: float = _F(..., ge=0.0, le=1.0)
+ xmax: float = _F(..., ge=0.0, le=1.0)
+ ymax: float = _F(..., ge=0.0, le=1.0)
+ quality: BboxQuality | None = None
+ quality_score: float = _F(default=0.0, ge=0.0, le=1.0)
+ source: BboxSource | None = None
+ refinement_confidence: float | None = _F(default=None, ge=0.0, le=1.0)
# ---------------------------------------------------------------------------
-# Document input
+# Validators (request side)
# ---------------------------------------------------------------------------
-class DocumentInput(_WireBase):
- """One input file for an extraction request."""
+class ValidatorSpec(_WireBase):
+ """One built-in validator applied to a field.
+
+ Replaces the v0 ``StandardValidatorSpec``. Dispatch key is ``name``
+ (not ``type``) so it doesn't collide with :attr:`Field.type` when
+ both appear on the same parent envelope.
+
+ Examples::
+
+ ValidatorSpec(name=ValidatorType.IBAN)
+ ValidatorSpec(name="phone_e164", params={"country": "ES"})
+ ValidatorSpec(name="vat_id", params={"country": "ES"}, severity="warning")
+ """
+
+ name: ValidatorType
+ params: dict[str, Any] = _F(default_factory=dict)
+ severity: Literal["error", "warning"] = "error"
+
+
+# ---------------------------------------------------------------------------
+# Recursive Field schema (request side)
+# ---------------------------------------------------------------------------
+
+
+class Field_(_WireBase): # noqa: N801 -- public alias ``Field`` exported below
+ """One field in a schema. Recursive for arrays and objects.
+
+ * Primitives: any of ``string`` / ``number`` / ``integer`` / ``boolean``;
+ ``items`` and ``fields`` MUST be ``None``.
+ * Array: ``type=array`` + a single ``items`` (a :class:`Field` describing
+ the row shape, typically of type ``object``).
+ * Object: ``type=object`` + a non-empty ``fields`` list of :class:`Field`.
+
+ Exposed publicly as ``Field``; the trailing underscore avoids clashing
+ with :func:`pydantic.Field` inside this module.
+ """
+
+ name: str = _F(..., min_length=1)
+ description: str | None = None
+ type: FieldType = FieldType.STRING
+ required: bool = False
+ pattern: str | None = None
+ format: StandardFormat | None = None
+ enum: list[Any] | None = None
+ minimum: float | None = None
+ maximum: float | None = None
+ items: Field_ | None = None
+ fields: list[Field_] | None = None
+ validators: list[ValidatorSpec] = _F(default_factory=list)
- filename: str = Field(..., min_length=1)
- content_base64: str = Field(
- ...,
- description=(
- "Base64-encoded document bytes. Accepts a bare base64 string or a "
- "``data:;base64,...`` data URL (the prefix is stripped "
- "server-side)."
- ),
+
+# Public name. The internal class is :class:`Field_` (trailing underscore
+# only to avoid shadowing :func:`pydantic.Field` inside this module).
+Field = Field_
+
+
+class FieldGroup(_WireBase):
+ """A named bundle of fields the service should extract together."""
+
+ name: str = _F(..., min_length=1)
+ description: str | None = None
+ fields: list[Field_] = _F(..., min_length=1)
+
+
+Field_.model_rebuild()
+FieldGroup.model_rebuild()
+
+
+# ---------------------------------------------------------------------------
+# Document type (request side)
+# ---------------------------------------------------------------------------
+
+
+class VisualCheck(_WireBase):
+ """One visual check to run against the document (signature, watermark, ...)."""
+
+ name: str = _F(..., min_length=1)
+ description: str
+
+
+class DocumentTypeSpec(_WireBase):
+ """One expected document type the caller is submitting fields for.
+
+ Replaces the v0 ``DocSpec`` / ``DocType`` pair. ``id`` is the stable
+ identifier (e.g. ``"invoice"``); ``description`` / ``country`` are the
+ flattened metadata that used to live under ``DocType``.
+ """
+
+ id: str = _F(..., min_length=1)
+ description: str | None = None
+ country: str | None = None
+ field_groups: list[FieldGroup] = _F(..., min_length=1)
+ visual_checks: list[VisualCheck] = _F(default_factory=list)
+
+
+# ---------------------------------------------------------------------------
+# Rules
+# ---------------------------------------------------------------------------
+
+
+class _BaseRuleParent(_WireBase):
+ pass
+
+
+class RuleFieldParent(_BaseRuleParent):
+ kind: Literal["field"] = "field"
+ document_type: str
+ fields: list[str] = _F(..., min_length=1)
+
+
+class RuleValidatorParent(_BaseRuleParent):
+ kind: Literal["validator"] = "validator"
+ document_type: str
+ validator: str
+
+
+class RuleRuleParent(_BaseRuleParent):
+ kind: Literal["rule"] = "rule"
+ rule: str
+
+
+RuleParent = Annotated[
+ RuleFieldParent | RuleValidatorParent | RuleRuleParent,
+ _F(discriminator="kind"),
+]
+
+
+class RuleOutputSpec(_WireBase):
+ """How the rule's output is interpreted."""
+
+ type: str = "boolean"
+ valid_outputs: list[str] | None = None
+
+
+class RuleSpec(_WireBase):
+ """One business rule expressed as a natural-language predicate.
+
+ RuleSpec(
+ id="invoice_total_matches",
+ predicate="Total equals the sum of line items",
+ parents=[RuleFieldParent(document_type="invoice", fields=["total", "line_items"])],
)
+ """
+
+ id: str = _F(..., min_length=1)
+ predicate: str = _F(..., min_length=1)
+ parents: list[RuleParent] = _F(default_factory=list)
+ output: RuleOutputSpec = _F(default_factory=RuleOutputSpec)
+
+
+# ---------------------------------------------------------------------------
+# Transformations
+# ---------------------------------------------------------------------------
+
+
+class _BaseTransformation(_WireBase):
+ id: str = _F(default_factory=lambda: str(uuid.uuid4()))
+ target_group: str = _F(..., min_length=1)
+ output_group: str | None = None
+ scope: TransformationScope = TransformationScope.TASK
+
+
+class EntityResolutionTransformation(_BaseTransformation):
+ """Deterministic deduplication of an array field group's rows."""
+
+ type: Literal["entity_resolution"] = "entity_resolution"
+ match_by: list[str] = _F(..., min_length=1)
+ min_shared_tokens: int = _F(default=2, ge=1)
+
+
+class LlmTransformation(_BaseTransformation):
+ """Free-form LLM transformation of an array field group's rows."""
+
+ type: Literal["llm"] = "llm"
+ intention: str = _F(..., min_length=10)
+ prompt_id: str | None = None
+
+
+Transformation = Annotated[
+ EntityResolutionTransformation | LlmTransformation,
+ _F(discriminator="type"),
+]
+
+
+# ---------------------------------------------------------------------------
+# Pipeline options
+# ---------------------------------------------------------------------------
+
+
+class StageToggles(_WireBase):
+ """Opt-in switches for every optional pipeline stage."""
+
+ splitter: bool = False
+ classifier: bool = True
+ field_validation: bool = True
+ visual_authenticity: bool = False
+ content_authenticity: bool = False
+ judge: bool = False
+ judge_escalation: bool = False
+ bbox_refine: bool = False
+ transform: bool = False
+ rule_engine: bool = False
+
+
+class EscalationConfig(_WireBase):
+ """Configuration for the judge_escalation stage."""
+
+ threshold: float = _F(..., ge=0.0, le=1.0)
+ model: str = _F(..., min_length=1)
+
+
+class ExtractionOptions(_WireBase):
+ """Per-request pipeline knobs."""
+
+ model: str | None = None
+ language_hint: str | None = _F(default=None, max_length=16)
+ return_bboxes: bool = True
+ declared_media_type: str | None = None
+ stages: StageToggles = _F(default_factory=StageToggles)
+ escalation: EscalationConfig | None = None
+ transformations: list[Transformation | dict[str, Any]] = _F(default_factory=list)
+
+
+# ---------------------------------------------------------------------------
+# Input file
+# ---------------------------------------------------------------------------
+
+
+class FileInput(_WireBase):
+ """One input file for an extraction request.
+
+ Replaces the v0 ``DocumentInput``. JSON mode: caller sets
+ ``content_base64``. Multipart mode: the binary rides in a separate
+ file part; ``content_base64`` is absent and ``filename`` /
+ ``content_type`` come from the part headers.
+ """
+
+ filename: str = _F(..., min_length=1)
+ content_base64: str | None = _F(default=None)
content_type: str | None = None
- document_type: str | None = None
+ expected_type: str | None = None
@field_validator("content_base64")
@classmethod
- def _strip_data_url_prefix(cls, value: str) -> str:
+ def _strip_data_url_prefix(cls, value: str | None) -> str | None:
+ if value is None:
+ return None
if value.startswith("data:") and "," in value:
return value.split(",", 1)[1]
return value
@@ -124,14 +513,14 @@ def from_bytes(
*,
filename: str,
content_type: str | None = None,
- document_type: str | None = None,
- ) -> DocumentInput:
- """Build a :class:`DocumentInput` from raw bytes (encodes to base64)."""
+ expected_type: str | None = None,
+ ) -> FileInput:
+ """Build a :class:`FileInput` from raw bytes (encodes to base64)."""
return cls(
filename=filename,
content_base64=base64.b64encode(data).decode("ascii"),
content_type=content_type,
- document_type=document_type,
+ expected_type=expected_type,
)
@classmethod
@@ -140,153 +529,352 @@ def from_path(
path: str | Path,
*,
content_type: str | None = None,
- document_type: str | None = None,
- ) -> DocumentInput:
- """Read a file off disk and produce a :class:`DocumentInput`."""
+ expected_type: str | None = None,
+ ) -> FileInput:
+ """Read a file off disk and produce a :class:`FileInput`."""
path = Path(path)
return cls.from_bytes(
path.read_bytes(),
filename=path.name,
content_type=content_type,
- document_type=document_type,
+ expected_type=expected_type,
)
# ---------------------------------------------------------------------------
-# Extraction request / response
+# Extraction request
# ---------------------------------------------------------------------------
class ExtractionRequest(_WireBase):
- """Request body for ``POST /api/v1/extract`` and ``POST /api/v1/jobs``.
+ """Request body for ``POST /api/v1/extract`` and ``POST /api/v1/extract:validate``."""
+
+ intention: str = "Extract structured data from the document."
+ files: list[FileInput] = _F(..., min_length=1)
+ document_types: list[DocumentTypeSpec | dict[str, Any]] = _F(..., min_length=1)
+ rules: list[RuleSpec | dict[str, Any]] = _F(default_factory=list)
+ options: ExtractionOptions | dict[str, Any] = _F(default_factory=ExtractionOptions)
- ``docs`` / ``rules`` / ``options`` accept either the typed models
- from :mod:`flydocs_sdk.request` or plain dicts -- typed instances
- give autocomplete + validation, dicts give forward-compatibility
- against new service-side fields the SDK has not surfaced yet.
+
+class SubmitExtractionRequest(ExtractionRequest):
+ """Request body for ``POST /api/v1/extractions``.
+
+ Superset of :class:`ExtractionRequest` plus the async-only
+ ``callback_url`` and ``metadata`` fields.
"""
- request_id: uuid.UUID = Field(default_factory=uuid.uuid4)
- intention: str = "Extract structured data from the document."
- documents: list[DocumentInput] = Field(..., min_length=1)
- docs: list[DocSpec | dict[str, Any]] = Field(..., min_length=1)
- rules: list[RuleSpec | dict[str, Any]] = Field(default_factory=list)
- options: ExtractionOptions | dict[str, Any] = Field(default_factory=ExtractionOptions)
+ callback_url: str | None = None
+ metadata: dict[str, Any] = _F(default_factory=dict)
-class ExtractionResult(_WireBase):
- """Response body for ``POST /api/v1/extract`` and the ``result`` field
- of an async job.
+# ---------------------------------------------------------------------------
+# Response side -- field validation
+# ---------------------------------------------------------------------------
+
- Top-level scalar identity is typed; the inside of each ``documents``
- entry is kept as a permissive dict so the SDK keeps working when
- the service adds new per-field fields.
+class FieldValidationError(_WireBase):
+ rule: ValidationRule
+ message: str
+
+
+class FieldValidation(_WireBase):
+ valid: bool = True
+ errors: list[FieldValidationError] = _F(default_factory=list)
+
+
+class JudgeOutcome(_WireBase):
+ status: JudgeStatus = JudgeStatus.UNCERTAIN
+ confidence: float = _F(default=0.0, ge=0.0, le=1.0)
+ evidence: str | None = None
+ notes: str | None = None
+ flag_for_review: bool = False
+
+
+class ExtractedField(_WireBase):
+ """One extracted field. Recursive for arrays and objects.
+
+ Replaces the v0 ``ExtractedField`` (which had camelCase aliases and
+ a separate ``ExtractedArrayField`` shape). Canonical keys in v1 are
+ ``name``, ``value``, ``pages``, ``confidence``, ``bbox``,
+ ``validation``, ``judge``, ``notes``.
"""
- request_id: uuid.UUID
- files: list[dict[str, Any]] = Field(default_factory=list)
- documents: list[dict[str, Any]] = Field(default_factory=list)
- additional_documents: list[dict[str, Any]] = Field(default_factory=list)
- rule_results: list[dict[str, Any]] = Field(default_factory=list)
- request_transformations: list[dict[str, Any]] = Field(default_factory=list)
- model: str
- latency_ms: int = Field(..., ge=0)
- pipeline_errors: list[dict[str, Any]] = Field(default_factory=list)
- escalation: dict[str, Any] | None = None
- usage: dict[str, Any] | None = None
- trace: list[dict[str, Any]] = Field(default_factory=list)
+ name: str
+ value: str | int | float | bool | list[ExtractedField] | None = None
+ pages: list[int] = _F(default_factory=list)
+ confidence: float = _F(default=0.0, ge=0.0, le=1.0)
+ bbox: BoundingBox | None = None
+ validation: FieldValidation = _F(default_factory=FieldValidation)
+ judge: JudgeOutcome = _F(default_factory=JudgeOutcome)
+ notes: str | None = None
+
+
+ExtractedField.model_rebuild()
+
+
+class ExtractedFieldGroup(_WireBase):
+ name: str
+ fields: list[ExtractedField]
+
+
+# ---------------------------------------------------------------------------
+# Response side -- authenticity
+# ---------------------------------------------------------------------------
+
+
+class VisualCheckResult(_WireBase):
+ name: str
+ passed: bool
+ confidence: float = _F(default=0.0, ge=0.0, le=1.0)
+ notes: str | None = None
+
+
+class ContentCoherenceCheck(_WireBase):
+ name: str
+ description: str
+ status: CheckStatus
+ evidence: str | None = None
+ reasoning: str | None = None
+
+
+class ContentAuthenticity(_WireBase):
+ overall_integrity_status: ContentIntegrityStatus = ContentIntegrityStatus.UNCERTAIN
+ checks: list[ContentCoherenceCheck] = _F(default_factory=list)
+
+
+class DocumentAuthenticity(_WireBase):
+ visual: list[VisualCheckResult] = _F(default_factory=list)
+ content: ContentAuthenticity | None = None
# ---------------------------------------------------------------------------
-# Async-job DTOs
+# Response side -- top-level
# ---------------------------------------------------------------------------
-class SubmitJobRequest(_WireBase):
- """Request body for ``POST /api/v1/jobs``.
+class ClassificationInfo(_WireBase):
+ """Per-file classifier verdict."""
- A superset of :class:`ExtractionRequest` -- adds the optional
- ``callback_url`` (for webhook delivery on terminal status) and a
- free-form ``metadata`` dict that the service echoes back on the
- webhook payload.
+ document_type: str
+ matched: bool = True
+ confidence: float = _F(default=0.0, ge=0.0, le=1.0)
+ description: str | None = None
+ notes: str | None = None
+
+
+class FileSummary(_WireBase):
+ """Summary of one input file in the response.
+
+ Replaces v0 ``DocumentInfo``. ``matched_type`` replaces the v0
+ ``document_type`` field; it carries the caller's pinned
+ ``expected_type`` when set, the classifier's verdict otherwise.
"""
- intention: str = "Extract structured data from the document."
- documents: list[DocumentInput] = Field(..., min_length=1)
- docs: list[DocSpec | dict[str, Any]] = Field(..., min_length=1)
- rules: list[RuleSpec | dict[str, Any]] = Field(default_factory=list)
- options: ExtractionOptions | dict[str, Any] = Field(default_factory=ExtractionOptions)
- callback_url: str | None = None
- metadata: dict[str, Any] = Field(default_factory=dict)
+ filename: str
+ media_type: str
+ page_count: int
+ bytes: int
+ matched_type: str | None = None
+ classification: ClassificationInfo | None = None
-class SubmitJobResponse(_WireBase):
- job_id: str
- status: JobStatus
- submitted_at: datetime
+class Document(_WireBase):
+ """Result for one extracted document instance.
+
+ Replaces v0 ``ExtractedDocument``. ``type`` replaces v0
+ ``document_type``; ``field_groups`` replaces v0 ``fields``.
+ """
+ type: str
+ source_file: str | None = None
+ missing: bool = False
+ pages: list[int] = _F(default_factory=list)
+ confidence: float = _F(default=0.0, ge=0.0, le=1.0)
+ description: str | None = None
+ notes: str | None = None
+ field_groups: list[ExtractedFieldGroup] = _F(default_factory=list)
+ authenticity: DocumentAuthenticity = _F(default_factory=DocumentAuthenticity)
+
+
+class TraceEntry(_WireBase):
+ node: str
+ started_at: datetime
+ completed_at: datetime
+ latency_ms: float
+ status: Literal["success", "failed", "skipped"]
+
+
+class PipelineError(_WireBase):
+ node: str
+ code: str
+ message: str
+
+
+class EscalationInfo(_WireBase):
+ triggered: bool = False
+ primary_model: str | None = None
+ escalation_model: str | None = None
+ primary_fail_rate: float = _F(default=0.0, ge=0.0, le=1.0)
+ escalation_fail_rate: float = _F(default=0.0, ge=0.0, le=1.0)
+ accepted: bool = False
+
+
+class UsageBreakdown(_WireBase):
+ """Aggregated token usage and cost across every LLM call of one request."""
+
+ total_input_tokens: int = 0
+ total_output_tokens: int = 0
+ total_tokens: int = 0
+ total_cost_usd: float = 0.0
+ total_requests: int = 0
+ total_latency_ms: float = 0.0
+ record_count: int = 0
+ cache_creation_tokens: int = 0
+ cache_read_tokens: int = 0
+ by_agent: dict[str, dict[str, Any]] = _F(default_factory=dict)
+ by_model: dict[str, dict[str, Any]] = _F(default_factory=dict)
+
+
+class PipelineMeta(_WireBase):
+ """Pipeline-level instrumentation metadata.
+
+ Replaces the top-level ``model``/``latency_ms``/``trace``/
+ ``pipeline_errors``/``escalation``/``usage`` fields from the v0
+ ``ExtractionResult`` shape. v1 nests them all under one block.
+ """
+
+ model: str
+ latency_ms: int = _F(..., ge=0)
+ trace: list[TraceEntry] = _F(default_factory=list)
+ errors: list[PipelineError] = _F(default_factory=list)
+ escalation: EscalationInfo | None = None
+ usage: UsageBreakdown | None = None
+
+
+class RuleResult(_WireBase):
+ """Per-rule outcome returned in the response.
+
+ Both ``summary`` and ``human_revision`` are optional in v1.
+ """
-class JobStatusResponse(_WireBase):
- job_id: str
- status: JobStatus
+ rule_id: str
+ predicate: str
+ output: str = ""
+ summary: str | None = None
+ notes: list[str] = _F(default_factory=list)
+ human_revision: str | None = None
+
+
+class ExtractionResult(_WireBase):
+ """Top-level response shape (sync ``/extract`` + async result envelope).
+
+ Replaces v0 ``ExtractionResult``. ``id`` replaces ``request_id``;
+ ``discovered_documents`` replaces ``additional_documents``;
+ ``pipeline`` collapses the v0 top-level meta fields into one block.
+ """
+
+ id: str
+ status: Literal["success", "partial"] = "success"
+ files: list[FileSummary] = _F(default_factory=list)
+ documents: list[Document] = _F(default_factory=list)
+ discovered_documents: list[Document] = _F(default_factory=list)
+ rule_results: list[RuleResult] = _F(default_factory=list)
+ request_transformations: list[ExtractedFieldGroup] = _F(default_factory=list)
+ pipeline: PipelineMeta
+
+
+# ---------------------------------------------------------------------------
+# Extraction lifecycle (async)
+# ---------------------------------------------------------------------------
+
+
+class ExtractionError(_WireBase):
+ """Terminal-state error info for a failed extraction."""
+
+ code: str
+ message: str
+
+
+class BboxRefinementInfo(_WireBase):
+ """Lifecycle info for the bbox-refinement post-processing leg."""
+
+ status: PostProcessingStatus
+ started_at: datetime | None = None
+ finished_at: datetime | None = None
+ attempts: int = 0
+ error: ExtractionError | None = None
+
+
+class PostProcessing(_WireBase):
+ """Container for post-processing legs attached to a succeeded extraction."""
+
+ bbox_refinement: BboxRefinementInfo | None = None
+
+
+class Extraction(_WireBase):
+ """Current state snapshot of an async extraction job.
+
+ Replaces v0 ``JobStatusResponse`` and ``SubmitJobResponse`` (both are
+ the same shape in v1).
+ """
+
+ id: str
+ status: ExtractionStatus
submitted_at: datetime
started_at: datetime | None = None
finished_at: datetime | None = None
attempts: int = 0
- error_code: str | None = None
- error_message: str | None = None
- bbox_refine_status: str | None = None
- bbox_refine_attempts: int = 0
- bbox_refine_started_at: datetime | None = None
- bbox_refine_finished_at: datetime | None = None
- bbox_refine_error_code: str | None = None
- bbox_refine_error_message: str | None = None
+ error: ExtractionError | None = None
+ post_processing: PostProcessing | None = None
-class JobResult(_WireBase):
- """Response body for ``GET /api/v1/jobs/{id}/result``."""
+class ExtractionResultEnvelope(_WireBase):
+ """Response body for ``GET /api/v1/extractions/{id}/result``.
- job_id: str
+ Replaces v0 ``JobResult``.
+ """
+
+ id: str
result: ExtractionResult
-class JobListResponse(_WireBase):
- """Response body for ``GET /api/v1/jobs``."""
+class ExtractionListQuery(_WireBase):
+ """Query parameters for ``GET /api/v1/extractions``.
+
+ Carried as the typed input to :meth:`flydocs_sdk.client.ExtractionsResource.list`.
+ """
+
+ statuses: list[ExtractionStatus] = _F(default_factory=list)
+ post_processing_statuses: list[PostProcessingStatus] = _F(default_factory=list)
+ created_after: datetime | None = None
+ created_before: datetime | None = None
+ idempotency_key: str | None = None
+ limit: int = _F(default=50, ge=1, le=500)
+ offset: int = _F(default=0, ge=0)
+
- items: list[JobStatusResponse]
+class ExtractionListResponse(_WireBase):
+ """Paginated list response."""
+
+ items: list[Extraction]
total: int
limit: int
offset: int
# ---------------------------------------------------------------------------
-# Webhook payload
+# Validation response (POST /api/v1/extract:validate)
# ---------------------------------------------------------------------------
-class JobWebhookPayload(_WireBase):
- """Body the service POSTs to ``callback_url`` on terminal status.
-
- Signed with HMAC-SHA256 in the ``X-Flydocs-Signature`` header when
- ``FLYDOCS_WEBHOOK_HMAC_SECRET`` is configured on the service. Use
- :class:`flydocs_sdk.WebhookVerifier` to verify the signature.
- """
+class ValidationResponse(_WireBase):
+ """Dry-run validator output. Always returned with 200 OK."""
- event_id: str
- event_type: str = "IDPJobCompleted"
- version: str = "1.0.0"
- job_id: str
- status: JobStatus
- occurred_at: datetime
- started_at: datetime | None = None
- finished_at: datetime | None = None
- attempts: int = 1
- correlation_id: str | None = None
- tenant_id: str | None = None
- metadata: dict[str, Any] = Field(default_factory=dict)
- result: ExtractionResult | None = None
- error_code: str | None = None
- error_message: str | None = None
+ ok: bool
+ error_count: int = 0
+ warning_count: int = 0
+ errors: list[dict[str, Any]] = _F(default_factory=list)
+ warnings: list[dict[str, Any]] = _F(default_factory=list)
# ---------------------------------------------------------------------------
@@ -302,3 +890,49 @@ class VersionInfo(_WireBase):
model: str
fallback_model: str = ""
eda_adapter: str
+
+
+# ---------------------------------------------------------------------------
+# Event envelope (EDA + webhooks)
+# ---------------------------------------------------------------------------
+
+
+EVENT_TYPE_EXTRACTION_SUBMITTED = "extraction.submitted"
+EVENT_TYPE_EXTRACTION_COMPLETED = "extraction.completed"
+EVENT_TYPE_EXTRACTION_POST_PROCESSING_REQUESTED = "extraction.post_processing.requested"
+EVENT_TYPE_EXTRACTION_POST_PROCESSING_COMPLETED = "extraction.post_processing.completed"
+
+ALL_EVENT_TYPES = (
+ EVENT_TYPE_EXTRACTION_SUBMITTED,
+ EVENT_TYPE_EXTRACTION_COMPLETED,
+ EVENT_TYPE_EXTRACTION_POST_PROCESSING_REQUESTED,
+ EVENT_TYPE_EXTRACTION_POST_PROCESSING_COMPLETED,
+)
+
+
+def _now_utc() -> datetime:
+ return datetime.now(UTC)
+
+
+def _new_event_id() -> str:
+ return str(uuid.uuid4())
+
+
+class EventEnvelope(_WireBase):
+ """Shared envelope for EDA events and webhook deliveries.
+
+ Replaces v0 ``JobWebhookPayload``. ``extraction`` carries a current-
+ state snapshot of the resource. ``result`` is populated only on
+ ``extraction.completed`` events when the terminal status is
+ ``succeeded``; null otherwise.
+ """
+
+ event_id: str = _F(default_factory=_new_event_id)
+ event_type: str
+ version: str = "1.0.0"
+ occurred_at: datetime = _F(default_factory=_now_utc)
+ correlation_id: str | None = None
+ tenant_id: str | None = None
+ extraction: Extraction
+ result: ExtractionResult | None = None
+ metadata: dict[str, Any] = _F(default_factory=dict)
diff --git a/sdks/python/src/flydocs_sdk/request.py b/sdks/python/src/flydocs_sdk/request.py
deleted file mode 100644
index 585fb6e..0000000
--- a/sdks/python/src/flydocs_sdk/request.py
+++ /dev/null
@@ -1,486 +0,0 @@
-# Copyright 2026 Firefly Software Solutions Inc
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Typed request-side models for the flydocs API.
-
-These are the SDK's parallel of the service-side ``flydocs.interfaces.dtos``
-tree, kept independent so installing ``flydocs-sdk`` does not pull in
-the service runtime. The shapes are pinned to the on-wire JSON
-contract — fields are named to match the keys the service expects,
-camelCase keys (``fieldName``, ``fieldGroupFields``) are reproduced
-verbatim, and snake_case keys (``content_base64``, ``submitted_at``)
-are accepted via Pydantic aliases.
-
-Forward-compatibility: every model declares ``extra="allow"`` so an
-older SDK keeps round-tripping payloads that carry fields it does
-not know about yet.
-"""
-
-from __future__ import annotations
-
-from enum import StrEnum
-from typing import Annotated, Any, Literal
-
-from pydantic import BaseModel, ConfigDict, Field
-
-# ---------------------------------------------------------------------------
-# Common base
-# ---------------------------------------------------------------------------
-
-
-class _RequestBase(BaseModel):
- """Common config for every request-side model in the SDK.
-
- * ``extra="allow"`` — tolerate unknown fields.
- * ``populate_by_name=True`` — accept both Python snake_case and the
- JSON aliases (``fieldName`` / ``fieldGroupFields`` / …) that the
- service expects on the wire.
- """
-
- model_config = ConfigDict(extra="allow", populate_by_name=True)
-
-
-# ---------------------------------------------------------------------------
-# Enums
-# ---------------------------------------------------------------------------
-
-
-class FieldType(StrEnum):
- """Supported primitives for a :class:`FieldSpec`.
-
- Mirrors ``flydocs.interfaces.enums.field_type.FieldType``.
- """
-
- STRING = "string"
- NUMBER = "number"
- INTEGER = "integer"
- BOOLEAN = "boolean"
- ARRAY = "array"
-
-
-class StandardFormat(StrEnum):
- """JSON-Schema-style format hints applied by the field validator."""
-
- DATE = "date"
- DATE_TIME = "date-time"
- EMAIL = "email"
- URI = "uri"
- UUID = "uuid"
-
-
-class StandardValidatorType(StrEnum):
- """Built-in field validators the service ships with.
-
- Mirrors ``flydocs.interfaces.enums.standard_validator.StandardValidatorType``
- member-for-member. New validators added on the service side will
- show up as unknown :class:`StrEnum` values; pass them as raw strings
- to :class:`StandardValidatorSpec.type` until the SDK ships a release
- that exposes them as enum members.
- """
-
- # --- network / web ---------------------------------------------------
- EMAIL = "email"
- URI = "uri"
- URL = "url"
- IPV4 = "ipv4"
- IPV6 = "ipv6"
- DOMAIN = "domain"
- SLUG = "slug"
-
- # --- temporal --------------------------------------------------------
- DATE = "date"
- DATETIME = "datetime"
- TIME = "time"
- ISO_8601 = "iso_8601"
-
- # --- identifiers -----------------------------------------------------
- UUID = "uuid"
- JSON = "json"
- HEX_COLOR = "hex_color"
-
- # --- finance ---------------------------------------------------------
- IBAN = "iban"
- BIC = "bic"
- CREDIT_CARD = "credit_card"
- CURRENCY_CODE = "currency_code"
- AMOUNT = "amount"
-
- # --- telephony -------------------------------------------------------
- PHONE_E164 = "phone_e164"
-
- # --- geographic ------------------------------------------------------
- COUNTRY_CODE = "country_code"
- LANGUAGE_CODE = "language_code"
- POSTAL_CODE = "postal_code"
- LATITUDE = "latitude"
- LONGITUDE = "longitude"
-
- # --- national identifiers -------------------------------------------
- NIF = "nif"
- NIE = "nie"
- CIF = "cif"
- VAT_ID = "vat_id"
- SSN = "ssn"
- PASSPORT_NUMBER = "passport_number"
-
-
-# ---------------------------------------------------------------------------
-# Standard validator
-# ---------------------------------------------------------------------------
-
-
-class StandardValidatorSpec(_RequestBase):
- """One built-in validator declaration attached to a :class:`FieldSpec`.
-
- StandardValidatorSpec(type=StandardValidatorType.IBAN)
- StandardValidatorSpec(type="phone_e164", params={"country": "ES"})
- StandardValidatorSpec(type="vat_id", params={"country": "ES"}, severity="warning")
- """
-
- type: StandardValidatorType
- params: dict[str, Any] = Field(default_factory=dict)
- severity: Literal["error", "warning"] = "error"
-
-
-# ---------------------------------------------------------------------------
-# Pipeline options
-# ---------------------------------------------------------------------------
-
-
-class StageToggles(_RequestBase):
- """Opt-in switches for every optional pipeline stage.
-
- The multimodal extractor is always on; everything else is opt-in.
- Defaults match the service-side defaults so an empty
- :class:`StageToggles` produces the same behaviour as omitting
- the field.
- """
-
- splitter: bool = False
- classifier: bool = True
- field_validation: bool = True
- visual_authenticity: bool = False
- content_authenticity: bool = False
- judge: bool = False
- rule_engine: bool = False
- judge_escalation: bool = False
- bbox_refine: bool = False
- transform: bool = False
-
-
-class ExtractionOptions(_RequestBase):
- """Per-request knobs.
-
- ``transformations`` and ``model``/``escalation_model`` are strings
- on purpose: model ids are arbitrary provider-specific tokens the
- service routes to the right backend.
- """
-
- return_bboxes: bool = True
- language_hint: str | None = Field(default=None, max_length=16)
- model: str | None = None
- declared_media_type: str | None = None
- stages: StageToggles = Field(default_factory=StageToggles)
- escalation_threshold: float | None = Field(default=None, ge=0.0, le=1.0)
- escalation_model: str | None = None
- transformations: list[dict[str, Any]] = Field(
- default_factory=list,
- description=(
- "Post-extraction transformations applied by the ``transform`` "
- "stage. Each entry is the discriminated union from "
- "``flydocs.interfaces.dtos.transformation`` — kept as raw "
- "dicts so callers can pick the right shape without the SDK "
- "shipping the full transformation tree."
- ),
- )
-
-
-# ---------------------------------------------------------------------------
-# Field schema
-# ---------------------------------------------------------------------------
-
-
-class FieldItem(_RequestBase):
- """One sub-field declared inside an array field's ``items`` list."""
-
- field_name: str = Field(..., alias="fieldName", min_length=1)
- field_description: str = Field(default="", alias="fieldDescription")
- field_type: FieldType = Field(default=FieldType.STRING, alias="fieldType")
- pattern: str | None = None
- format: StandardFormat | None = None
- enum: list[Any] | None = None
- minimum: float | None = None
- maximum: float | None = None
- standard_validators: list[StandardValidatorSpec] = Field(default_factory=list)
-
-
-class FieldSpec(_RequestBase):
- """One field the caller wants extracted.
-
- For ``field_type == FieldType.ARRAY`` use ``items`` to describe
- the repeating row's columns. The Pydantic aliases match the
- service's camelCase JSON keys so the dump round-trips with the
- ``flydocs.interfaces`` DTOs on the server side.
-
- FieldSpec(name="total_amount", type=FieldType.NUMBER, required=True)
- FieldSpec(
- name="line_items",
- type=FieldType.ARRAY,
- items=[
- FieldItem(field_name="description", field_type=FieldType.STRING),
- FieldItem(field_name="amount", field_type=FieldType.NUMBER),
- ],
- )
- """
-
- field_name: str = Field(..., alias="name", min_length=1)
- field_description: str = Field(default="", alias="description")
- field_type: FieldType = Field(default=FieldType.STRING, alias="type")
- required: bool = False
- pattern: str | None = None
- format: StandardFormat | None = None
- enum: list[Any] | None = None
- minimum: float | None = None
- maximum: float | None = None
- items: list[FieldItem] | None = None
- standard_validators: list[StandardValidatorSpec] = Field(default_factory=list)
-
-
-class FieldGroup(_RequestBase):
- """A named bundle of fields the service should extract together.
-
- FieldGroup(
- name="totals",
- fields=[
- FieldSpec(name="total_amount", type=FieldType.NUMBER, required=True),
- FieldSpec(name="currency", type=FieldType.STRING),
- ],
- )
- """
-
- field_group_name: str = Field(..., alias="fieldGroupName", min_length=1)
- field_group_desc: str = Field(default="", alias="fieldGroupDesc")
- field_group_fields: list[FieldSpec] = Field(..., alias="fieldGroupFields", min_length=1)
-
- @classmethod
- def of(cls, name: str, *fields: FieldSpec, description: str = "") -> FieldGroup:
- """Concise factory: ``FieldGroup.of("totals", FieldSpec(...), FieldSpec(...))``."""
- return cls(
- field_group_name=name,
- field_group_desc=description,
- field_group_fields=list(fields),
- )
-
-
-# ---------------------------------------------------------------------------
-# Doc spec
-# ---------------------------------------------------------------------------
-
-
-class DocType(_RequestBase):
- document_type: str = Field(..., alias="documentType", min_length=1)
- description: str = ""
- country: str = Field(default="", max_length=2)
-
-
-class VisualValidatorSpec(_RequestBase):
- """One visual check the service should run (e.g. signature presence)."""
-
- name: str = Field(..., min_length=1)
- description: str
-
-
-class ValidatorsSpec(_RequestBase):
- visual: list[VisualValidatorSpec] = Field(default_factory=list)
-
-
-class DocSpec(_RequestBase):
- """One expected document type plus its field schema.
-
- DocSpec(
- doc_type=DocType(document_type="invoice", description="Vendor invoice"),
- field_groups=[FieldGroup.of("totals", ...)],
- )
- """
-
- doc_type: DocType = Field(..., alias="docType")
- field_groups: list[FieldGroup] = Field(..., alias="fieldGroups", min_length=1)
- validators: ValidatorsSpec = Field(default_factory=ValidatorsSpec)
-
-
-# ---------------------------------------------------------------------------
-# Rule schema
-# ---------------------------------------------------------------------------
-
-
-class RuleFieldParent(_RequestBase):
- parent_type: Literal["field"] = Field(default="field", alias="parentType")
- document_type: str = Field(..., alias="documentType")
- field_names: list[str] = Field(..., alias="fieldNames", min_length=1)
-
-
-class RuleValidatorParent(_RequestBase):
- parent_type: Literal["validator"] = Field(default="validator", alias="parentType")
- document_type: str = Field(..., alias="documentType")
- validator_name: str = Field(..., alias="validatorName")
-
-
-class RuleRuleParent(_RequestBase):
- parent_type: Literal["rule"] = Field(default="rule", alias="parentType")
- rule_id: str = Field(..., alias="ruleId")
-
-
-RuleParent = Annotated[
- RuleFieldParent | RuleValidatorParent | RuleRuleParent,
- Field(discriminator="parent_type"),
-]
-
-
-class RuleOutputSpec(_RequestBase):
- type: str = "boolean"
- valid_outputs: list[str] | None = None
-
-
-class RuleSpec(_RequestBase):
- """One business rule expressed as a natural-language predicate over its parents.
-
- RuleSpec(
- id="invoice_total_matches",
- predicate="Total equals the sum of line items",
- parents=[RuleFieldParent(document_type="invoice", field_names=["total_amount", "line_items"])],
- )
- """
-
- id: str = Field(..., min_length=1)
- predicate: str = Field(..., min_length=1)
- parents: list[RuleParent] = Field(default_factory=list)
- output: RuleOutputSpec = Field(default_factory=RuleOutputSpec)
-
-
-# ---------------------------------------------------------------------------
-# Transformations
-# ---------------------------------------------------------------------------
-
-
-class TransformationScope(StrEnum):
- """Whether a transformation applies per-document or across the whole request.
-
- Mirrors ``flydocs.interfaces.dtos.transformation.TransformationScope``.
- """
-
- TASK = "task"
- """One pass per ``(segment, DocSpec)`` task. Use for single-document
- transformations (format normalisation, single-doc dedup)."""
-
- REQUEST = "request"
- """Groups with the matching ``target_group`` are concatenated across
- every task, the transformation runs once over the consolidated
- rows, and the result is emitted under
- ``ExtractionResult.request_transformations``."""
-
-
-def entity_resolution(
- *,
- target_group: str,
- match_by: list[str],
- min_shared_tokens: int = 2,
- output_group: str | None = None,
- scope: TransformationScope = TransformationScope.TASK,
- id: str | None = None,
-) -> dict[str, Any]:
- """Build an ``entity_resolution`` transformation payload.
-
- The matcher dedupes rows of an array field group by (1) exact-match
- on a normalised key (typically a DNI / VAT id) and (2) token-subset
- matching on names. See
- ``flydocs.interfaces.dtos.transformation.EntityResolutionTransformation``
- for the full algorithm.
-
- Returns a plain :class:`dict` ready for
- :attr:`ExtractionOptions.transformations`.
- """
- payload: dict[str, Any] = {
- "type": "entity_resolution",
- "target_group": target_group,
- "match_by": list(match_by),
- "min_shared_tokens": int(min_shared_tokens),
- "scope": scope.value,
- }
- if output_group is not None:
- payload["output_group"] = output_group
- if id is not None:
- payload["id"] = id
- return payload
-
-
-def llm_transformation(
- *,
- target_group: str,
- intention: str,
- prompt_id: str | None = None,
- output_group: str | None = None,
- scope: TransformationScope = TransformationScope.TASK,
- id: str | None = None,
-) -> dict[str, Any]:
- """Build a free-form ``llm`` transformation payload.
-
- ``intention`` is a one-sentence goal in any language describing what
- the LLM should do to the target group's rows.
- """
- if len(intention) < 10:
- raise ValueError("intention must be at least 10 characters")
- payload: dict[str, Any] = {
- "type": "llm",
- "target_group": target_group,
- "intention": intention,
- "scope": scope.value,
- }
- if prompt_id is not None:
- payload["prompt_id"] = prompt_id
- if output_group is not None:
- payload["output_group"] = output_group
- if id is not None:
- payload["id"] = id
- return payload
-
-
-# ---------------------------------------------------------------------------
-# Re-exports for callers that want one import line
-# ---------------------------------------------------------------------------
-
-
-__all__ = [
- "DocSpec",
- "DocType",
- "ExtractionOptions",
- "FieldGroup",
- "FieldItem",
- "FieldSpec",
- "FieldType",
- "RuleFieldParent",
- "RuleOutputSpec",
- "RuleParent",
- "RuleRuleParent",
- "RuleSpec",
- "RuleValidatorParent",
- "StageToggles",
- "StandardFormat",
- "StandardValidatorSpec",
- "StandardValidatorType",
- "TransformationScope",
- "ValidatorsSpec",
- "VisualValidatorSpec",
- "entity_resolution",
- "llm_transformation",
-]
diff --git a/sdks/python/src/flydocs_sdk/webhooks.py b/sdks/python/src/flydocs_sdk/webhooks.py
index b7d8927..6049803 100644
--- a/sdks/python/src/flydocs_sdk/webhooks.py
+++ b/sdks/python/src/flydocs_sdk/webhooks.py
@@ -28,6 +28,7 @@
* The raw request body must be passed in unchanged -- decoding /
re-encoding the JSON before verifying will change the bytes and
break the signature.
+* On success, the body is parsed into a typed :class:`EventEnvelope`.
"""
from __future__ import annotations
@@ -35,17 +36,20 @@
import hashlib
import hmac
+from flydocs_sdk.errors import FlydocsError
+from flydocs_sdk.models import EventEnvelope
-class WebhookVerificationError(Exception):
+
+class WebhookVerificationError(FlydocsError):
"""Raised when a webhook signature does not match the body."""
class WebhookVerifier:
- """Verify ``X-Flydocs-Signature`` HMACs.
+ """Verify ``X-Flydocs-Signature`` HMACs and parse the body.
verifier = WebhookVerifier(secret="...")
try:
- payload = verifier.verify(raw_body, signature_header)
+ envelope = verifier.verify(raw_body, signature_header)
except WebhookVerificationError:
return 403
@@ -70,11 +74,14 @@ def sign(self, body: bytes) -> str:
digest = hmac.new(self._secret, body, hashlib.sha256).hexdigest()
return f"{self._scheme}={digest}"
- def verify(self, body: bytes, signature_header: str) -> bytes:
- """Return ``body`` if the signature matches; else raise.
+ def verify(self, body: bytes, signature_header: str) -> EventEnvelope:
+ """Verify the signature and parse the body into an :class:`EventEnvelope`.
Both ``sha256=<hex>`` and a bare ``<hex>`` are accepted, since
some intermediate proxies strip the scheme prefix.
+
+ Raises :class:`WebhookVerificationError` on signature mismatch, missing header, or
+ unsupported scheme; errors from ``EventEnvelope.model_validate_json`` are not wrapped.
"""
if not signature_header:
raise WebhookVerificationError("signature header missing")
@@ -88,4 +95,7 @@ def verify(self, body: bytes, signature_header: str) -> bytes:
expected = hmac.new(self._secret, body, hashlib.sha256).hexdigest()
if not hmac.compare_digest(candidate, expected):
raise WebhookVerificationError("signature mismatch")
- return body
+ return EventEnvelope.model_validate_json(body)
+
+
+__all__ = ["WebhookVerificationError", "WebhookVerifier"]
diff --git a/sdks/python/tests/conftest.py b/sdks/python/tests/conftest.py
index e1eb773..002fa80 100644
--- a/sdks/python/tests/conftest.py
+++ b/sdks/python/tests/conftest.py
@@ -27,18 +27,18 @@
import pytest
import pytest_asyncio
-from flydocs_sdk import AsyncFlydocsClient, FlydocsClient
+from flydocs_sdk import AsyncClient, Client
-BASE_URL = "http://flydocs.test"
+BASE_URL = "https://flydocs.test"
@pytest_asyncio.fixture
-async def async_client() -> AsyncIterator[AsyncFlydocsClient]:
- async with AsyncFlydocsClient(BASE_URL) as client:
+async def async_client() -> AsyncIterator[AsyncClient]:
+ async with AsyncClient(BASE_URL) as client:
yield client
@pytest.fixture
-def sync_client() -> Iterator[FlydocsClient]:
- with FlydocsClient(BASE_URL) as client:
+def sync_client() -> Iterator[Client]:
+ with Client(BASE_URL) as client:
yield client
diff --git a/sdks/python/tests/test_async_client.py b/sdks/python/tests/test_async_client.py
deleted file mode 100644
index c1c9bb4..0000000
--- a/sdks/python/tests/test_async_client.py
+++ /dev/null
@@ -1,331 +0,0 @@
-# Copyright 2026 Firefly Software Solutions Inc
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""End-to-end mock tests for the async client.
-
-Each test stands up a respx route that mimics what the real service
-would return, calls the SDK, and asserts both halves:
-
-* the request the SDK put on the wire matches the controller's
- contract (path, query, headers, body),
-* the response the SDK decoded into a model has the values from the
- mocked body.
-
-Together this exercises the URL builder, header assembly, problem-
-detail decoding, and the typed model layer in one pass.
-"""
-
-from __future__ import annotations
-
-import json
-from datetime import UTC, datetime
-
-import httpx
-import pytest
-import respx
-
-from flydocs_sdk import (
- AsyncFlydocsClient,
- DocumentInput,
- ExtractionRequest,
- FlydocsHTTPError,
- JobStatus,
- SubmitJobRequest,
-)
-
-BASE_URL = "http://flydocs.test"
-
-
-def _now_iso() -> str:
- return datetime(2026, 5, 17, 10, 0, 0, tzinfo=UTC).isoformat()
-
-
-# ---------------------------------------------------------------------------
-# Identity / health
-# ---------------------------------------------------------------------------
-
-
-@respx.mock
-async def test_version(async_client: AsyncFlydocsClient) -> None:
- respx.get(f"{BASE_URL}/api/v1/version").mock(
- return_value=httpx.Response(
- 200,
- json={
- "service": "flydocs",
- "version": "26.5.1",
- "model": "anthropic:claude-sonnet-4-6",
- "fallback_model": "",
- "eda_adapter": "postgres",
- },
- )
- )
- info = await async_client.version()
- assert info.service == "flydocs"
- assert info.eda_adapter == "postgres"
-
-
-@respx.mock
-async def test_health(async_client: AsyncFlydocsClient) -> None:
- respx.get(f"{BASE_URL}/actuator/health/readiness").mock(
- return_value=httpx.Response(200, json={"status": "UP", "components": {"db": "UP"}})
- )
- payload = await async_client.health()
- assert payload["status"] == "UP"
-
-
-# ---------------------------------------------------------------------------
-# Sync extraction
-# ---------------------------------------------------------------------------
-
-
-@respx.mock
-async def test_validate_returns_report(async_client: AsyncFlydocsClient) -> None:
- respx.post(f"{BASE_URL}/api/v1/extract:validate").mock(
- return_value=httpx.Response(
- 200,
- json={"ok": True, "error_count": 0, "warning_count": 0, "errors": [], "warnings": []},
- )
- )
- request = ExtractionRequest(
- documents=[DocumentInput.from_bytes(b"%PDF-1.4", filename="x.pdf")],
- docs=[{"docType": {"documentType": "invoice"}}],
- )
- report = await async_client.validate(request)
- assert report["ok"] is True
-
-
-@respx.mock
-async def test_extract_decodes_result_and_sends_idempotency_header(
- async_client: AsyncFlydocsClient,
-) -> None:
- captured = {}
-
- def _handler(request: httpx.Request) -> httpx.Response:
- captured["headers"] = dict(request.headers)
- captured["body"] = json.loads(request.content)
- return httpx.Response(
- 200,
- json={
- "request_id": "00000000-0000-0000-0000-000000000001",
- "model": "anthropic:claude-sonnet-4-6",
- "latency_ms": 4321,
- "documents": [],
- },
- )
-
- respx.post(f"{BASE_URL}/api/v1/extract").mock(side_effect=_handler)
- result = await async_client.extract(
- ExtractionRequest(
- documents=[DocumentInput.from_bytes(b"%PDF-1.4", filename="x.pdf")],
- docs=[{"docType": {"documentType": "invoice"}}],
- ),
- idempotency_key="abc-123",
- correlation_id="corr-1",
- )
- assert result.model == "anthropic:claude-sonnet-4-6"
- assert result.latency_ms == 4321
- assert captured["headers"]["idempotency-key"] == "abc-123"
- assert captured["headers"]["x-correlation-id"] == "corr-1"
- # The body should be a JSON object that includes our document.
- assert captured["body"]["documents"][0]["filename"] == "x.pdf"
-
-
-@respx.mock
-async def test_extract_timeout_maps_to_typed_error(
- async_client: AsyncFlydocsClient,
-) -> None:
- respx.post(f"{BASE_URL}/api/v1/extract").mock(
- return_value=httpx.Response(
- 408,
- json={
- "detail": {
- "code": "extraction_timeout",
- "title": "Extraction timed out",
- "detail": "Pipeline exceeded 60s sync ceiling",
- }
- },
- )
- )
- with pytest.raises(FlydocsHTTPError) as excinfo:
- await async_client.extract(
- ExtractionRequest(
- documents=[DocumentInput.from_bytes(b"%PDF-1.4", filename="x.pdf")],
- docs=[{"docType": {"documentType": "invoice"}}],
- )
- )
- err = excinfo.value
- assert err.status_code == 408
- assert err.code == "extraction_timeout"
- assert "Pipeline exceeded" in err.detail
-
-
-@respx.mock
-async def test_extract_problem_detail_at_top_level(
- async_client: AsyncFlydocsClient,
-) -> None:
- # Some flydocs error paths emit ``code`` at the top level rather
- # than nested under ``detail``. Decoder should handle both.
- respx.post(f"{BASE_URL}/api/v1/extract").mock(
- return_value=httpx.Response(
- 413,
- json={
- "code": "document_too_large",
- "title": "Document too large",
- "detail": "x.pdf is 50000000 bytes",
- },
- )
- )
- with pytest.raises(FlydocsHTTPError) as excinfo:
- await async_client.extract(
- ExtractionRequest(
- documents=[DocumentInput.from_bytes(b"%PDF-1.4", filename="x.pdf")],
- docs=[{"docType": {"documentType": "invoice"}}],
- )
- )
- assert excinfo.value.code == "document_too_large"
-
-
-# ---------------------------------------------------------------------------
-# Async-job lifecycle
-# ---------------------------------------------------------------------------
-
-
-@respx.mock
-async def test_submit_job_returns_queued(async_client: AsyncFlydocsClient) -> None:
- respx.post(f"{BASE_URL}/api/v1/jobs").mock(
- return_value=httpx.Response(
- 202,
- json={"job_id": "job-1", "status": "QUEUED", "submitted_at": _now_iso()},
- )
- )
- resp = await async_client.submit_job(
- SubmitJobRequest(
- documents=[DocumentInput.from_bytes(b"%PDF-1.4", filename="x.pdf")],
- docs=[{"docType": {"documentType": "invoice"}}],
- callback_url="https://example.com/webhook",
- metadata={"caller": "test"},
- ),
- idempotency_key="submit-once",
- )
- assert resp.job_id == "job-1"
- assert resp.status is JobStatus.QUEUED
-
-
-@respx.mock
-async def test_get_job_status(async_client: AsyncFlydocsClient) -> None:
- respx.get(f"{BASE_URL}/api/v1/jobs/job-1").mock(
- return_value=httpx.Response(
- 200,
- json={
- "job_id": "job-1",
- "status": "SUCCEEDED",
- "submitted_at": _now_iso(),
- "finished_at": _now_iso(),
- },
- )
- )
- status = await async_client.get_job("job-1")
- assert status.status is JobStatus.SUCCEEDED
- assert status.finished_at is not None
-
-
-@respx.mock
-async def test_get_job_result(async_client: AsyncFlydocsClient) -> None:
- respx.get(f"{BASE_URL}/api/v1/jobs/job-1/result").mock(
- return_value=httpx.Response(
- 200,
- json={
- "job_id": "job-1",
- "result": {
- "request_id": "00000000-0000-0000-0000-000000000002",
- "model": "anthropic:claude-sonnet-4-6",
- "latency_ms": 1500,
- "documents": [],
- },
- },
- )
- )
- result = await async_client.get_job_result("job-1", wait_for_bboxes=True, timeout=10.0)
- assert result.job_id == "job-1"
- assert result.result.model == "anthropic:claude-sonnet-4-6"
- # Verify the long-poll query params went on the wire.
- call = respx.calls.last
- assert "wait_for_bboxes=true" in str(call.request.url)
- assert "timeout=10.0" in str(call.request.url)
-
-
-@respx.mock
-async def test_list_jobs_csv_filters(async_client: AsyncFlydocsClient) -> None:
- captured = {}
-
- def _handler(request: httpx.Request) -> httpx.Response:
- captured["url"] = str(request.url)
- return httpx.Response(
- 200,
- json={
- "items": [
- {
- "job_id": "job-1",
- "status": "SUCCEEDED",
- "submitted_at": _now_iso(),
- }
- ],
- "total": 1,
- "limit": 25,
- "offset": 0,
- },
- )
-
- respx.get(f"{BASE_URL}/api/v1/jobs").mock(side_effect=_handler)
- resp = await async_client.list_jobs(
- status=["SUCCEEDED", "PARTIAL_SUCCEEDED"],
- limit=25,
- )
- assert resp.total == 1
- # The list-of-statuses argument joins with comma to match the
- # controller's CSV splitter.
- assert "status=SUCCEEDED%2CPARTIAL_SUCCEEDED" in captured["url"]
- assert "limit=25" in captured["url"]
-
-
-@respx.mock
-async def test_cancel_job_returns_status(async_client: AsyncFlydocsClient) -> None:
- respx.delete(f"{BASE_URL}/api/v1/jobs/job-1").mock(
- return_value=httpx.Response(
- 200,
- json={"job_id": "job-1", "status": "CANCELLED", "submitted_at": _now_iso()},
- )
- )
- resp = await async_client.cancel_job("job-1")
- assert resp.status is JobStatus.CANCELLED
-
-
-@respx.mock
-async def test_cancel_job_not_cancellable(async_client: AsyncFlydocsClient) -> None:
- respx.delete(f"{BASE_URL}/api/v1/jobs/job-1").mock(
- return_value=httpx.Response(
- 409,
- json={
- "detail": {
- "code": "job_not_cancellable",
- "title": "Job cannot be cancelled",
- "detail": "Job is RUNNING",
- }
- },
- )
- )
- with pytest.raises(FlydocsHTTPError) as excinfo:
- await async_client.cancel_job("job-1")
- assert excinfo.value.status_code == 409
- assert excinfo.value.code == "job_not_cancellable"
diff --git a/sdks/python/tests/test_client_async.py b/sdks/python/tests/test_client_async.py
new file mode 100644
index 0000000..3fa63d1
--- /dev/null
+++ b/sdks/python/tests/test_client_async.py
@@ -0,0 +1,410 @@
+# Copyright 2026 Firefly Software Solutions Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""End-to-end mock tests for the async :class:`flydocs_sdk.AsyncClient`.
+
+Each test stands up a respx route that mimics what the v1 service
+returns, calls the SDK, and asserts both halves:
+
+* the request the SDK put on the wire matches the controller's
+ contract (path, method, headers, body),
+* the response the SDK decoded into a model has the values from the
+ mocked body.
+"""
+
+from __future__ import annotations
+
+import base64
+import json
+
+import httpx
+import pytest
+import respx
+
+from flydocs_sdk import (
+ AsyncClient,
+ DocumentTypeSpec,
+ ExtractionRequest,
+ ExtractionStatus,
+ Field,
+ FieldGroup,
+ FieldType,
+ FileInput,
+ FlydocsHttpError,
+ PostProcessingStatus,
+ SubmitExtractionRequest,
+)
+
+BASE_URL = "https://flydocs.test"
+
+PDF_B64 = base64.b64encode(b"%PDF-1.4\n").decode()
+
+
+def _now_iso() -> str:
+ return "2026-05-26T10:00:00+00:00"
+
+
+def _bare_doc_type() -> DocumentTypeSpec:
+ return DocumentTypeSpec(
+ id="invoice",
+ field_groups=[
+ FieldGroup(
+ name="totals",
+ fields=[Field(name="total", type=FieldType.NUMBER)],
+ )
+ ],
+ )
+
+
+def _bare_request() -> ExtractionRequest:
+ return ExtractionRequest(
+ files=[FileInput.from_bytes(b"%PDF-1.4", filename="x.pdf")],
+ document_types=[_bare_doc_type()],
+ )
+
+
+def _bare_submit_request() -> SubmitExtractionRequest:
+ return SubmitExtractionRequest(
+ files=[FileInput.from_bytes(b"%PDF-1.4", filename="x.pdf")],
+ document_types=[_bare_doc_type()],
+ )
+
+
+def _stub_extraction_result() -> dict:
+ return {
+ "id": "ext_1",
+ "status": "success",
+ "files": [],
+ "documents": [],
+ "discovered_documents": [],
+ "rule_results": [],
+ "request_transformations": [],
+ "pipeline": {"model": "anthropic:claude-sonnet-4-6", "latency_ms": 1234},
+ }
+
+
+# ---------------------------------------------------------------------------
+# Identity / health
+# ---------------------------------------------------------------------------
+
+
+@respx.mock
+async def test_version(async_client: AsyncClient) -> None:
+ respx.get(f"{BASE_URL}/api/v1/version").mock(
+ return_value=httpx.Response(
+ 200,
+ json={
+ "service": "flydocs",
+ "version": "26.6.0",
+ "model": "anthropic:claude-sonnet-4-6",
+ "fallback_model": "",
+ "eda_adapter": "postgres",
+ },
+ )
+ )
+ info = await async_client.version()
+ assert info.service == "flydocs"
+ assert info.eda_adapter == "postgres"
+
+
+@respx.mock
+async def test_health(async_client: AsyncClient) -> None:
+ respx.get(f"{BASE_URL}/actuator/health/readiness").mock(
+ return_value=httpx.Response(200, json={"status": "UP"})
+ )
+ payload = await async_client.health()
+ assert payload["status"] == "UP"
+
+
+# ---------------------------------------------------------------------------
+# Sync extraction
+# ---------------------------------------------------------------------------
+
+
+@respx.mock
+async def test_validate_returns_typed_response(async_client: AsyncClient) -> None:
+ respx.post(f"{BASE_URL}/api/v1/extract:validate").mock(
+ return_value=httpx.Response(
+ 200,
+ json={"ok": True, "error_count": 0, "warning_count": 0, "errors": [], "warnings": []},
+ )
+ )
+ report = await async_client.validate(_bare_request())
+ assert report.ok is True
+ assert report.error_count == 0
+
+
+@respx.mock
+async def test_extract_posts_v1_body(async_client: AsyncClient) -> None:
+ captured: dict = {}
+
+ def _handler(request: httpx.Request) -> httpx.Response:
+ captured["headers"] = dict(request.headers)
+ captured["body"] = json.loads(request.content)
+ return httpx.Response(200, json=_stub_extraction_result())
+
+ respx.post(f"{BASE_URL}/api/v1/extract").mock(side_effect=_handler)
+ result = await async_client.extract(
+ _bare_request(),
+ idempotency_key="idem-1",
+ correlation_id="corr-1",
+ )
+ assert result.id == "ext_1"
+ assert result.pipeline.model == "anthropic:claude-sonnet-4-6"
+ assert captured["headers"]["idempotency-key"] == "idem-1"
+ assert captured["headers"]["x-correlation-id"] == "corr-1"
+ # v1 body uses ``files`` / ``document_types`` keys.
+ assert "files" in captured["body"]
+ assert "document_types" in captured["body"]
+ assert "documents" not in captured["body"]
+ assert "docs" not in captured["body"]
+
+
+@respx.mock
+async def test_extract_timeout_maps_to_typed_error(async_client: AsyncClient) -> None:
+ respx.post(f"{BASE_URL}/api/v1/extract").mock(
+ return_value=httpx.Response(
+ 408,
+ json={
+ "type": "about:blank",
+ "title": "Extraction timed out",
+ "status": 408,
+ "code": "timeout",
+ "detail": "Pipeline exceeded 60s sync ceiling",
+ },
+ )
+ )
+ with pytest.raises(FlydocsHttpError) as excinfo:
+ await async_client.extract(_bare_request())
+ err = excinfo.value
+ assert err.status_code == 408
+ assert err.code == "timeout"
+ assert "Pipeline exceeded" in err.detail
+
+
+@respx.mock
+async def test_extract_413_file_too_large(async_client: AsyncClient) -> None:
+ respx.post(f"{BASE_URL}/api/v1/extract").mock(
+ return_value=httpx.Response(
+ 413,
+ json={
+ "code": "file_too_large",
+ "title": "File too large",
+ "status": 413,
+ "detail": "x.pdf is 5000000 bytes",
+ },
+ )
+ )
+ with pytest.raises(FlydocsHttpError) as excinfo:
+ await async_client.extract(_bare_request())
+ assert excinfo.value.code == "file_too_large"
+
+
+@respx.mock
+async def test_extract_supports_multipart_upload(async_client: AsyncClient) -> None:
+ """Posting ``files=[...]`` should switch the wire format to multipart."""
+ captured: dict = {}
+
+ def _handler(request: httpx.Request) -> httpx.Response:
+ captured["content_type"] = request.headers.get("content-type", "")
+ captured["body_size"] = len(request.content)
+ return httpx.Response(200, json=_stub_extraction_result())
+
+ respx.post(f"{BASE_URL}/api/v1/extract").mock(side_effect=_handler)
+ import io
+
+ buf = io.BytesIO(b"%PDF-1.4-binary-bytes")
+ buf.name = "invoice.pdf"
+ result = await async_client.extract(_bare_request(), files=[buf])
+ assert result.id == "ext_1"
+ assert captured["content_type"].startswith("multipart/form-data")
+
+
+# ---------------------------------------------------------------------------
+# Async extraction lifecycle
+# ---------------------------------------------------------------------------
+
+
+@respx.mock
+async def test_extractions_create_returns_extraction(async_client: AsyncClient) -> None:
+ respx.post(f"{BASE_URL}/api/v1/extractions").mock(
+ return_value=httpx.Response(
+ 202,
+ json={"id": "ext_1", "status": "queued", "submitted_at": _now_iso()},
+ )
+ )
+ ext = await async_client.extractions.create(_bare_submit_request(), idempotency_key="submit-1")
+ assert ext.id == "ext_1"
+ assert ext.status is ExtractionStatus.QUEUED
+
+
+@respx.mock
+async def test_extractions_get_returns_status(async_client: AsyncClient) -> None:
+ respx.get(f"{BASE_URL}/api/v1/extractions/ext_1").mock(
+ return_value=httpx.Response(
+ 200,
+ json={
+ "id": "ext_1",
+ "status": "succeeded",
+ "submitted_at": _now_iso(),
+ "finished_at": _now_iso(),
+ },
+ )
+ )
+ ext = await async_client.extractions.get("ext_1")
+ assert ext.status is ExtractionStatus.SUCCEEDED
+ assert ext.finished_at is not None
+
+
+@respx.mock
+async def test_extractions_get_result_envelope(async_client: AsyncClient) -> None:
+ respx.get(f"{BASE_URL}/api/v1/extractions/ext_1/result").mock(
+ return_value=httpx.Response(
+ 200,
+ json={"id": "ext_1", "result": _stub_extraction_result()},
+ )
+ )
+ env = await async_client.extractions.get_result("ext_1", wait_for_bboxes=True, timeout=10.0)
+ assert env.id == "ext_1"
+ assert env.result.id == "ext_1"
+ # The SDK's ``wait_for_bboxes`` flag goes on the wire as ``wait_for_post_processing``.
+ call = respx.calls.last
+ assert "wait_for_post_processing=true" in str(call.request.url)
+ assert "timeout=10.0" in str(call.request.url)
+
+
+@respx.mock
+async def test_extractions_cancel_returns_status(async_client: AsyncClient) -> None:
+ respx.delete(f"{BASE_URL}/api/v1/extractions/ext_1").mock(
+ return_value=httpx.Response(
+ 200,
+ json={"id": "ext_1", "status": "cancelled", "submitted_at": _now_iso()},
+ )
+ )
+ ext = await async_client.extractions.cancel("ext_1")
+ assert ext.status is ExtractionStatus.CANCELLED
+
+
+@respx.mock
+async def test_extractions_cancel_not_cancellable_raises(async_client: AsyncClient) -> None:
+ respx.delete(f"{BASE_URL}/api/v1/extractions/ext_1").mock(
+ return_value=httpx.Response(
+ 409,
+ json={
+ "code": "not_cancellable",
+ "title": "Extraction cannot be cancelled",
+ "status": 409,
+ "detail": "Extraction is already running",
+ },
+ )
+ )
+ with pytest.raises(FlydocsHttpError) as excinfo:
+ await async_client.extractions.cancel("ext_1")
+ assert excinfo.value.status_code == 409
+ assert excinfo.value.code == "not_cancellable"
+
+
+@respx.mock
+async def test_extractions_list_csv_filters(async_client: AsyncClient) -> None:
+ captured: dict = {}
+
+ def _handler(request: httpx.Request) -> httpx.Response:
+ captured["url"] = str(request.url)
+ return httpx.Response(
+ 200,
+ json={
+ "items": [{"id": "ext_1", "status": "succeeded", "submitted_at": _now_iso()}],
+ "total": 1,
+ "limit": 25,
+ "offset": 0,
+ },
+ )
+
+ respx.get(f"{BASE_URL}/api/v1/extractions").mock(side_effect=_handler)
+ resp = await async_client.extractions.list(
+ status=[ExtractionStatus.SUCCEEDED, ExtractionStatus.FAILED],
+ post_processing_status=[PostProcessingStatus.PENDING],
+ limit=25,
+ )
+ assert resp.total == 1
+ assert resp.items[0].id == "ext_1"
+ # List params are comma-joined on the wire (the comma is percent-encoded as ``%2C``).
+ assert "status=succeeded%2Cfailed" in captured["url"]
+ assert "post_processing_status=pending" in captured["url"]
+ assert "limit=25" in captured["url"]
+
+
+# ---------------------------------------------------------------------------
+# Polling helper
+# ---------------------------------------------------------------------------
+
+
+@respx.mock
+async def test_wait_for_completion_loops_until_terminal(async_client: AsyncClient) -> None:
+ respx.get(f"{BASE_URL}/api/v1/extractions/ext_1").mock(
+ side_effect=[
+ httpx.Response(200, json={"id": "ext_1", "status": "queued", "submitted_at": _now_iso()}),
+ httpx.Response(200, json={"id": "ext_1", "status": "running", "submitted_at": _now_iso()}),
+ httpx.Response(
+ 200,
+ json={"id": "ext_1", "status": "succeeded", "submitted_at": _now_iso()},
+ ),
+ ]
+ )
+ final = await async_client.wait_for_completion("ext_1", poll_interval=0.001, timeout=5.0)
+ assert final.status is ExtractionStatus.SUCCEEDED
+
+
+@respx.mock
+async def test_wait_for_completion_times_out(async_client: AsyncClient) -> None:
+ respx.get(f"{BASE_URL}/api/v1/extractions/ext_1").mock(
+ return_value=httpx.Response(
+ 200, json={"id": "ext_1", "status": "running", "submitted_at": _now_iso()}
+ )
+ )
+ with pytest.raises(TimeoutError):
+ await async_client.wait_for_completion("ext_1", poll_interval=0.001, timeout=0.05)
+
+
+# ---------------------------------------------------------------------------
+# API key
+# ---------------------------------------------------------------------------
+
+
+@respx.mock
+async def test_api_key_sets_bearer_header() -> None:
+ captured: dict = {}
+
+ def _handler(request: httpx.Request) -> httpx.Response:
+ captured["auth"] = request.headers.get("authorization", "")
+ return httpx.Response(
+ 200,
+ json={
+ "service": "flydocs",
+ "version": "26.6.0",
+ "model": "m",
+ "fallback_model": "",
+ "eda_adapter": "memory",
+ },
+ )
+
+ respx.get(f"{BASE_URL}/api/v1/version").mock(side_effect=_handler)
+ async with AsyncClient(BASE_URL, api_key="topsecret") as client:
+ await client.version()
+ assert captured["auth"] == "Bearer topsecret"
+
+
+def test_pdf_b64_helper_is_valid() -> None:
+ """Sanity: the test fixture's base64 decodes back to the PDF magic bytes."""
+ assert base64.b64decode(PDF_B64).startswith(b"%PDF-1.4")
diff --git a/sdks/python/tests/test_client_sync.py b/sdks/python/tests/test_client_sync.py
new file mode 100644
index 0000000..9fbc120
--- /dev/null
+++ b/sdks/python/tests/test_client_sync.py
@@ -0,0 +1,214 @@
+# Copyright 2026 Firefly Software Solutions Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Smoke tests for the synchronous :class:`flydocs_sdk.Client`.
+
+The sync client wraps the async one; full coverage of the endpoint
+shapes lives in :mod:`tests.test_client_async`. The job of these tests
+is to prove the sync wrapper plumbs everything through correctly --
+each endpoint reachable, each sub-resource accessor wired up, the
+context manager closing cleanly.
+"""
+
+from __future__ import annotations
+
+import base64
+
+import httpx
+import respx
+
+from flydocs_sdk import (
+ Client,
+ DocumentTypeSpec,
+ ExtractionRequest,
+ ExtractionStatus,
+ Field,
+ FieldGroup,
+ FieldType,
+ FileInput,
+ SubmitExtractionRequest,
+)
+
+BASE_URL = "https://flydocs.test"
+
+PDF_B64 = base64.b64encode(b"%PDF-1.4\n").decode()
+
+
+def _doc_type() -> DocumentTypeSpec:
+ return DocumentTypeSpec(
+ id="invoice",
+ field_groups=[
+ FieldGroup(
+ name="g",
+ fields=[Field(name="x", type=FieldType.STRING)],
+ )
+ ],
+ )
+
+
+def _stub_result() -> dict:
+ return {
+ "id": "ext_1",
+ "status": "success",
+ "files": [],
+ "documents": [],
+ "discovered_documents": [],
+ "rule_results": [],
+ "request_transformations": [],
+ "pipeline": {"model": "m", "latency_ms": 1},
+ }
+
+
+# ---------------------------------------------------------------------------
+# Identity
+# ---------------------------------------------------------------------------
+
+
+@respx.mock
+def test_sync_version() -> None:
+ respx.get(f"{BASE_URL}/api/v1/version").mock(
+ return_value=httpx.Response(
+ 200,
+ json={
+ "service": "flydocs",
+ "version": "26.6.0",
+ "model": "anthropic:claude-sonnet-4-6",
+ "fallback_model": "",
+ "eda_adapter": "postgres",
+ },
+ )
+ )
+ with Client(BASE_URL) as client:
+ info = client.version()
+ assert info.service == "flydocs"
+
+
+# ---------------------------------------------------------------------------
+# Sync extraction
+# ---------------------------------------------------------------------------
+
+
+@respx.mock
+def test_sync_validate_returns_typed_response() -> None:
+ respx.post(f"{BASE_URL}/api/v1/extract:validate").mock(
+ return_value=httpx.Response(
+ 200,
+ json={"ok": True, "error_count": 0, "warning_count": 0, "errors": [], "warnings": []},
+ )
+ )
+ req = ExtractionRequest(
+ files=[FileInput.from_bytes(b"%PDF-1.4", filename="x.pdf")],
+ document_types=[_doc_type()],
+ )
+ with Client(BASE_URL) as client:
+ result = client.validate(req)
+ assert result.ok is True
+
+
+@respx.mock
+def test_sync_extract_returns_result() -> None:
+ respx.post(f"{BASE_URL}/api/v1/extract").mock(return_value=httpx.Response(200, json=_stub_result()))
+ req = ExtractionRequest(
+ files=[FileInput.from_bytes(b"%PDF-1.4", filename="x.pdf")],
+ document_types=[_doc_type()],
+ )
+ with Client(BASE_URL) as client:
+ result = client.extract(req)
+ assert result.id == "ext_1"
+ assert result.pipeline.model == "m"
+
+
+# ---------------------------------------------------------------------------
+# Extractions sub-resource
+# ---------------------------------------------------------------------------
+
+
+@respx.mock
+def test_sync_extractions_create() -> None:
+ respx.post(f"{BASE_URL}/api/v1/extractions").mock(
+ return_value=httpx.Response(
+ 202,
+ json={"id": "ext_1", "status": "queued", "submitted_at": "2026-05-26T10:00:00+00:00"},
+ )
+ )
+ req = SubmitExtractionRequest(
+ files=[FileInput.from_bytes(b"%PDF-1.4", filename="x.pdf")],
+ document_types=[_doc_type()],
+ )
+ with Client(BASE_URL) as client:
+ ext = client.extractions.create(req, idempotency_key="k")
+ assert ext.status is ExtractionStatus.QUEUED
+
+
+@respx.mock
+def test_sync_extractions_get_result() -> None:
+ respx.get(f"{BASE_URL}/api/v1/extractions/ext_1/result").mock(
+ return_value=httpx.Response(200, json={"id": "ext_1", "result": _stub_result()})
+ )
+ with Client(BASE_URL) as client:
+ env = client.extractions.get_result("ext_1", wait_for_bboxes=True, timeout=10.0)
+ assert env.id == "ext_1"
+ assert env.result.id == "ext_1"
+
+
+@respx.mock
+def test_sync_extractions_get_status() -> None:
+ respx.get(f"{BASE_URL}/api/v1/extractions/ext_1").mock(
+ return_value=httpx.Response(
+ 200,
+ json={"id": "ext_1", "status": "running", "submitted_at": "2026-05-26T10:00:00+00:00"},
+ )
+ )
+ with Client(BASE_URL) as client:
+ ext = client.extractions.get("ext_1")
+ assert ext.status is ExtractionStatus.RUNNING
+
+
+@respx.mock
+def test_sync_extractions_cancel() -> None:
+ respx.delete(f"{BASE_URL}/api/v1/extractions/ext_1").mock(
+ return_value=httpx.Response(
+ 200,
+ json={
+ "id": "ext_1",
+ "status": "cancelled",
+ "submitted_at": "2026-05-26T10:00:00+00:00",
+ },
+ )
+ )
+ with Client(BASE_URL) as client:
+ ext = client.extractions.cancel("ext_1")
+ assert ext.status is ExtractionStatus.CANCELLED
+
+
+@respx.mock
+def test_sync_extractions_list() -> None:
+ respx.get(f"{BASE_URL}/api/v1/extractions").mock(
+ return_value=httpx.Response(200, json={"items": [], "total": 0, "limit": 50, "offset": 0})
+ )
+ with Client(BASE_URL) as client:
+ listing = client.extractions.list(limit=50)
+ assert listing.total == 0
+
+
+# ---------------------------------------------------------------------------
+# Lifecycle
+# ---------------------------------------------------------------------------
+
+
+def test_sync_client_close_idempotent() -> None:
+ client = Client(BASE_URL)
+ client.close()
+ # Second close is a no-op.
+ client.close()
diff --git a/sdks/python/tests/test_imports.py b/sdks/python/tests/test_imports.py
new file mode 100644
index 0000000..baf19c8
--- /dev/null
+++ b/sdks/python/tests/test_imports.py
@@ -0,0 +1,176 @@
+# Copyright 2026 Firefly Software Solutions Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Smoke import test for the full v1 SDK surface.
+
+If a symbol disappears from ``__all__`` (or its module) this test
+fails noisily before the package is shipped. Each name maps to one
+of the four surface families described in the spec:
+
+* clients
+* errors
+* event-type constants + envelope
+* wire-level Pydantic models + the recursive Field shape
+"""
+
+from __future__ import annotations
+
+import pytest
+
+import flydocs_sdk
+
+EXPECTED_EXPORTS: tuple[str, ...] = (
+ # Version
+ "__version__",
+ # Clients
+ "AsyncClient",
+ "AsyncExtractionsResource",
+ "Client",
+ "ExtractionsResource",
+ # Errors
+ "FlydocsAPIError",
+ "FlydocsClientError",
+ "FlydocsError",
+ "FlydocsHTTPError",
+ "FlydocsHttpError",
+ "FlydocsTimeoutError",
+ "ProblemDetails",
+ # Event-type constants
+ "ALL_EVENT_TYPES",
+ "EVENT_TYPE_EXTRACTION_COMPLETED",
+ "EVENT_TYPE_EXTRACTION_POST_PROCESSING_COMPLETED",
+ "EVENT_TYPE_EXTRACTION_POST_PROCESSING_REQUESTED",
+ "EVENT_TYPE_EXTRACTION_SUBMITTED",
+ # Request side
+ "DocumentTypeSpec",
+ "EntityResolutionTransformation",
+ "EscalationConfig",
+ "ExtractionOptions",
+ "ExtractionRequest",
+ "Field",
+ "FieldGroup",
+ "FieldType",
+ "FileInput",
+ "LlmTransformation",
+ "RuleFieldParent",
+ "RuleOutputSpec",
+ "RuleParent",
+ "RuleRuleParent",
+ "RuleSpec",
+ "RuleValidatorParent",
+ "StageToggles",
+ "StandardFormat",
+ "SubmitExtractionRequest",
+ "Transformation",
+ "TransformationScope",
+ "ValidatorSpec",
+ "ValidatorType",
+ "VisualCheck",
+ # Response side
+ "BboxQuality",
+ "BboxSource",
+ "BoundingBox",
+ "CheckStatus",
+ "ClassificationInfo",
+ "ContentAuthenticity",
+ "ContentCoherenceCheck",
+ "ContentIntegrityStatus",
+ "Document",
+ "DocumentAuthenticity",
+ "EscalationInfo",
+ "ExtractedField",
+ "ExtractedFieldGroup",
+ "ExtractionResult",
+ "FieldValidation",
+ "FieldValidationError",
+ "FileSummary",
+ "JudgeOutcome",
+ "JudgeStatus",
+ "PipelineError",
+ "PipelineMeta",
+ "RuleResult",
+ "TraceEntry",
+ "UsageBreakdown",
+ "ValidationResponse",
+ "ValidationRule",
+ # Extraction lifecycle
+ "BboxRefinementInfo",
+ "Extraction",
+ "ExtractionError",
+ "ExtractionListQuery",
+ "ExtractionListResponse",
+ "ExtractionResultEnvelope",
+ "ExtractionStatus",
+ "PostProcessing",
+ "PostProcessingStatus",
+ # Identity + events
+ "EventEnvelope",
+ "VersionInfo",
+ # Webhooks
+ "WebhookVerificationError",
+ "WebhookVerifier",
+)
+
+
+@pytest.mark.parametrize("name", EXPECTED_EXPORTS)
+def test_exported(name: str) -> None:
+ assert hasattr(flydocs_sdk, name), f"flydocs_sdk is missing export {name!r}"
+
+
+def test_version_string() -> None:
+ assert isinstance(flydocs_sdk.__version__, str)
+ assert flydocs_sdk.__version__.count(".") >= 1
+
+
+def test_all_set_matches_expected() -> None:
+ public = {name for name in flydocs_sdk.__all__ if not name.startswith("_")}
+ missing = set(EXPECTED_EXPORTS) - set(flydocs_sdk.__all__) - {"__version__"}
+ extras = public - set(EXPECTED_EXPORTS) - {"__version__"}
+ assert not missing, f"__all__ is missing: {missing}"
+ # ``extras`` is informational only -- extra names in ``__all__`` are allowed for forward compatibility.
+ assert isinstance(extras, set)
+
+
+def test_event_type_constants_are_strings() -> None:
+ assert flydocs_sdk.EVENT_TYPE_EXTRACTION_SUBMITTED == "extraction.submitted"
+ assert flydocs_sdk.EVENT_TYPE_EXTRACTION_COMPLETED == "extraction.completed"
+ assert (
+ flydocs_sdk.EVENT_TYPE_EXTRACTION_POST_PROCESSING_REQUESTED == "extraction.post_processing.requested"
+ )
+ assert (
+ flydocs_sdk.EVENT_TYPE_EXTRACTION_POST_PROCESSING_COMPLETED == "extraction.post_processing.completed"
+ )
+
+
+def test_legacy_v0_names_are_gone() -> None:
+ """The old v0 type names must not appear at the package top level."""
+ for legacy in (
+ "DocumentInput",
+ "DocSpec",
+ "DocType",
+ "FieldItem",
+ "FieldSpec",
+ "JobStatus",
+ "JobStatusResponse",
+ "JobResult",
+ "JobWebhookPayload",
+ "JobListResponse",
+ "SubmitJobRequest",
+ "SubmitJobResponse",
+ "StandardValidatorSpec",
+ "StandardValidatorType",
+ "VisualValidatorSpec",
+ "ValidatorsSpec",
+ ):
+ assert not hasattr(flydocs_sdk, legacy), f"v0 symbol {legacy!r} should not be re-exported"
diff --git a/sdks/python/tests/test_models.py b/sdks/python/tests/test_models.py
index 1ff7712..1d27cc9 100644
--- a/sdks/python/tests/test_models.py
+++ b/sdks/python/tests/test_models.py
@@ -12,12 +12,19 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-"""Unit tests for the wire-level models.
+"""Unit tests for the v1 wire-level models.
-The SDK keeps its own copies of the DTOs and the contract here is that
-they (a) accept the on-wire shape the service emits and (b) stay
-forward-compatible when the service adds new fields. Both halves of
-that contract are pinned down below.
+Three things are pinned down here:
+
+1. **Enum shape.** Every v1 enum value is lowercase snake_case on the
+ wire (``"queued"``, not v0's ``"QUEUED"``), and the legacy v0 members
+ (``PARTIAL_SUCCEEDED``, ``REFINING_BBOXES``) must not be present.
+2. **Recursion.** A :class:`Field` can carry ``items`` (array row) and
+ ``fields`` (object members); a roundtripped JSON dump preserves both
+ levels.
+3. **Forward compat.** Every model declares ``extra="allow"``, so unknown
+ fields the service sends are preserved in ``model_extra`` rather than
+ silently dropped or failing validation.
"""
from __future__ import annotations
@@ -29,135 +36,479 @@
import pytest
from flydocs_sdk import (
- DocumentInput,
+ BboxQuality,
+ BboxSource,
+ BoundingBox,
+ CheckStatus,
+ ContentIntegrityStatus,
+ DocumentTypeSpec,
+ EventEnvelope,
+ Extraction,
ExtractionRequest,
ExtractionResult,
- JobListResponse,
- JobStatus,
- JobStatusResponse,
- JobWebhookPayload,
- SubmitJobResponse,
+ ExtractionStatus,
+ Field,
+ FieldGroup,
+ FieldType,
+ FileInput,
+ JudgeStatus,
+ PostProcessing,
+ PostProcessingStatus,
+ RuleFieldParent,
+ RuleRuleParent,
+ RuleSpec,
+ StandardFormat,
+ SubmitExtractionRequest,
+ ValidationRule,
+ ValidatorSpec,
+ ValidatorType,
+ VisualCheck,
)
+PDF_B64 = base64.b64encode(b"%PDF-1.4\n").decode()
+
+
+# ---------------------------------------------------------------------------
+# Enums
+# ---------------------------------------------------------------------------
+
+
+def test_extraction_status_lowercase_values() -> None:
+ assert ExtractionStatus.QUEUED.value == "queued"
+ assert ExtractionStatus.RUNNING.value == "running"
+ assert ExtractionStatus.SUCCEEDED.value == "succeeded"
+ assert ExtractionStatus.FAILED.value == "failed"
+ assert ExtractionStatus.CANCELLED.value == "cancelled"
+
+
+def test_extraction_status_drops_legacy_members() -> None:
+ members = {m.name for m in ExtractionStatus}
+ assert "PARTIAL_SUCCEEDED" not in members
+ assert "REFINING_BBOXES" not in members
+ assert members == {"QUEUED", "RUNNING", "SUCCEEDED", "FAILED", "CANCELLED"}
+
+
+def test_post_processing_status_lowercase_values() -> None:
+ assert PostProcessingStatus.PENDING.value == "pending"
+ assert PostProcessingStatus.RUNNING.value == "running"
+ assert PostProcessingStatus.SUCCEEDED.value == "succeeded"
+ assert PostProcessingStatus.FAILED.value == "failed"
+
+
+def test_extraction_status_is_terminal_helper() -> None:
+ assert not ExtractionStatus.QUEUED.is_terminal
+ assert not ExtractionStatus.RUNNING.is_terminal
+ assert ExtractionStatus.SUCCEEDED.is_terminal
+ assert ExtractionStatus.FAILED.is_terminal
+ assert ExtractionStatus.CANCELLED.is_terminal
+
+
+def test_judge_status_lowercase() -> None:
+ assert JudgeStatus.PASS.value == "pass"
+ assert JudgeStatus.FAIL.value == "fail"
+ assert JudgeStatus.UNCERTAIN.value == "uncertain"
+
+
+def test_check_status_lowercase() -> None:
+ assert CheckStatus.PASS.value == "pass"
+
+
+def test_content_integrity_status_lowercase() -> None:
+ assert ContentIntegrityStatus.VALID.value == "valid"
+
+
+def test_bbox_source_drops_none() -> None:
+ members = {m.name for m in BboxSource}
+ assert "NONE" not in members
+ assert members == {"LLM", "PDF_TEXT", "OCR"}
+
+
+def test_bbox_quality_drops_empty() -> None:
+ members = {m.name for m in BboxQuality}
+ assert "EMPTY" not in members
+ assert members == {"GOOD", "POOR", "SUSPICIOUS", "INVALID"}
+
+
+def test_field_type_object_added() -> None:
+ assert FieldType.OBJECT.value == "object"
+
+
+def test_standard_format_adds_time_and_currency() -> None:
+ assert StandardFormat.TIME.value == "time"
+ assert StandardFormat.CURRENCY.value == "currency"
+
+
+def test_validation_rule_validator_replaces_standard() -> None:
+ assert ValidationRule.VALIDATOR.value == "validator"
+ members = {m.name for m in ValidationRule}
+ assert "STANDARD" not in members
+
-def test_document_input_from_bytes_roundtrips() -> None:
- doc = DocumentInput.from_bytes(b"hello", filename="hello.txt", content_type="text/plain")
- assert base64.b64decode(doc.content_base64) == b"hello"
- assert doc.filename == "hello.txt"
- assert doc.content_type == "text/plain"
+# ---------------------------------------------------------------------------
+# FileInput
+# ---------------------------------------------------------------------------
-def test_document_input_from_path(tmp_path: Path) -> None:
- f = tmp_path / "x.bin"
- f.write_bytes(b"abc")
- doc = DocumentInput.from_path(f)
- assert doc.filename == "x.bin"
- assert base64.b64decode(doc.content_base64) == b"abc"
+def test_file_input_basic() -> None:
+ f = FileInput(filename="a.pdf", content_base64=PDF_B64, expected_type="invoice")
+ assert f.filename == "a.pdf"
+ assert f.expected_type == "invoice"
-def test_document_input_strips_data_url_prefix() -> None:
- doc = DocumentInput(
- filename="x.pdf",
- content_base64="data:application/pdf;base64,YWJj",
+def test_file_input_from_bytes_roundtrip() -> None:
+ f = FileInput.from_bytes(b"hello", filename="x.txt", content_type="text/plain")
+ assert base64.b64decode(f.content_base64) == b"hello"
+ assert f.filename == "x.txt"
+ assert f.content_type == "text/plain"
+
+
+def test_file_input_from_path(tmp_path: Path) -> None:
+ p = tmp_path / "x.bin"
+ p.write_bytes(b"abc")
+ f = FileInput.from_path(p, expected_type="invoice")
+ assert f.filename == "x.bin"
+ assert base64.b64decode(f.content_base64) == b"abc"
+ assert f.expected_type == "invoice"
+
+
+def test_file_input_strips_data_url_prefix() -> None:
+ f = FileInput(filename="x.pdf", content_base64="data:application/pdf;base64,YWJj")
+ assert f.content_base64 == "YWJj"
+
+
+# ---------------------------------------------------------------------------
+# Recursive Field
+# ---------------------------------------------------------------------------
+
+
+def test_field_primitive() -> None:
+ f = Field(name="total", type=FieldType.NUMBER, required=True, minimum=0.0)
+ assert f.name == "total"
+ assert f.type == FieldType.NUMBER
+ assert f.items is None
+ assert f.fields is None
+
+
+def test_field_array_with_items() -> None:
+ f = Field(
+ name="line_items",
+ type=FieldType.ARRAY,
+ items=Field(
+ name="row",
+ type=FieldType.OBJECT,
+ fields=[
+ Field(name="description", type=FieldType.STRING),
+ Field(name="amount", type=FieldType.NUMBER),
+ ],
+ ),
+ )
+ assert f.items is not None
+ assert f.items.type == FieldType.OBJECT
+ assert f.items.fields is not None
+ assert len(f.items.fields) == 2
+
+
+def test_field_object_with_fields() -> None:
+ f = Field(
+ name="address",
+ type=FieldType.OBJECT,
+ fields=[
+ Field(name="street", type=FieldType.STRING),
+ Field(name="zip", type=FieldType.STRING),
+ ],
)
- assert doc.content_base64 == "YWJj"
+ assert f.type == FieldType.OBJECT
+ assert f.fields is not None
+ assert len(f.fields) == 2
+
+
+def test_field_dumps_recursive_shape() -> None:
+ f = Field(
+ name="rows",
+ type=FieldType.ARRAY,
+ items=Field(
+ name="r",
+ type=FieldType.OBJECT,
+ fields=[Field(name="x", type=FieldType.STRING)],
+ ),
+ )
+ dumped = f.model_dump(mode="json", by_alias=True)
+ assert dumped["type"] == "array"
+ assert dumped["items"]["type"] == "object"
+ assert dumped["items"]["fields"][0]["name"] == "x"
+
+
+def test_field_validators_list() -> None:
+ f = Field(
+ name="iban",
+ validators=[ValidatorSpec(name=ValidatorType.IBAN)],
+ )
+ dumped = f.model_dump(mode="json")
+ # The dispatch key is ``name`` in v1 (not ``type``).
+ assert dumped["validators"][0]["name"] == "iban"
+
+
+# ---------------------------------------------------------------------------
+# DocumentTypeSpec
+# ---------------------------------------------------------------------------
+
+
+def test_document_type_spec_flat() -> None:
+ spec = DocumentTypeSpec(
+ id="invoice",
+ description="Vendor invoice",
+ country="ES",
+ field_groups=[
+ FieldGroup(
+ name="header",
+ fields=[Field(name="invoice_number", type=FieldType.STRING)],
+ )
+ ],
+ visual_checks=[VisualCheck(name="signature_present", description="visible signature")],
+ )
+ dumped = spec.model_dump(mode="json")
+ assert dumped["id"] == "invoice"
+ assert dumped["description"] == "Vendor invoice"
+ assert dumped["country"] == "ES"
+ assert dumped["field_groups"][0]["name"] == "header"
+ assert dumped["visual_checks"][0]["name"] == "signature_present"
+
+
+# ---------------------------------------------------------------------------
+# Rules
+# ---------------------------------------------------------------------------
-def test_extraction_request_accepts_dict_docs() -> None:
- # The SDK deliberately keeps ``docs`` permissive so callers can
- # send the rich shape without depending on the service's DTOs.
+def test_rule_field_parent_uses_kind_discriminator() -> None:
+ parent = RuleFieldParent(document_type="invoice", fields=["total", "currency"])
+ dumped = parent.model_dump(mode="json")
+ assert dumped["kind"] == "field"
+ assert dumped["document_type"] == "invoice"
+ assert dumped["fields"] == ["total", "currency"]
+
+
+def test_rule_rule_parent_field_renamed() -> None:
+ parent = RuleRuleParent(rule="other-rule-id")
+ dumped = parent.model_dump(mode="json")
+ assert dumped["kind"] == "rule"
+ assert dumped["rule"] == "other-rule-id"
+
+
+def test_rule_spec_roundtrip() -> None:
+ rule = RuleSpec(
+ id="totals_ok",
+ predicate="subtotal + tax = total",
+ parents=[RuleFieldParent(document_type="invoice", fields=["subtotal", "tax", "total"])],
+ )
+ dumped = rule.model_dump(mode="json")
+ assert dumped["id"] == "totals_ok"
+ assert dumped["parents"][0]["kind"] == "field"
+
+
+# ---------------------------------------------------------------------------
+# ExtractionRequest envelope
+# ---------------------------------------------------------------------------
+
+
+def test_extraction_request_uses_v1_keys() -> None:
req = ExtractionRequest(
- documents=[DocumentInput.from_bytes(b"x", filename="x.pdf")],
- docs=[
- {
- "docType": {"documentType": "invoice", "description": "Test invoice"},
- "groups": [{"fieldGroupName": "totals", "fieldGroupFields": []}],
- }
+ files=[FileInput.from_bytes(b"%PDF-1.4", filename="x.pdf")],
+ document_types=[
+ DocumentTypeSpec(
+ id="invoice",
+ field_groups=[
+ FieldGroup(
+ name="g",
+ fields=[Field(name="x", type=FieldType.STRING)],
+ )
+ ],
+ )
],
)
dumped = req.model_dump(mode="json")
- assert dumped["docs"][0]["docType"]["documentType"] == "invoice"
- assert "request_id" in dumped # auto-generated UUID
+ assert "files" in dumped
+ assert "documents" not in dumped
+ assert "document_types" in dumped
+ assert "docs" not in dumped
+ assert "request_id" not in dumped
-def test_extraction_result_tolerates_unknown_fields() -> None:
- # Forward-compat: the service can ship new top-level fields without
- # breaking SDK clients pinned to an older version.
- payload = {
- "request_id": "00000000-0000-0000-0000-000000000000",
- "model": "anthropic:claude-sonnet-4-6",
- "latency_ms": 1234,
- "documents": [],
- # New field invented by the service post-SDK-release:
- "future_field": {"shiny": True},
- }
- result = ExtractionResult.model_validate(payload)
- assert result.model == "anthropic:claude-sonnet-4-6"
- # Unknown field is preserved in ``model_extra`` so callers can still read it.
- assert result.model_extra is not None
- assert result.model_extra["future_field"] == {"shiny": True}
+def test_submit_extraction_request_adds_callback_and_metadata() -> None:
+ req = SubmitExtractionRequest(
+ files=[FileInput.from_bytes(b"x", filename="a.pdf")],
+ document_types=[
+ DocumentTypeSpec(
+ id="x",
+ field_groups=[FieldGroup(name="g", fields=[Field(name="a", type=FieldType.STRING)])],
+ )
+ ],
+ callback_url="https://example.com/wh",
+ metadata={"caller": "test"},
+ )
+ dumped = req.model_dump(mode="json")
+ assert dumped["callback_url"] == "https://example.com/wh"
+ assert dumped["metadata"] == {"caller": "test"}
-def test_job_status_response_parses_full_shape() -> None:
- payload = {
- "job_id": "job_123",
- "status": "RUNNING",
- "submitted_at": "2026-05-17T10:00:00+00:00",
- "started_at": "2026-05-17T10:00:01+00:00",
- "attempts": 2,
- "bbox_refine_status": "pending",
- }
- resp = JobStatusResponse.model_validate(payload)
- assert resp.status is JobStatus.RUNNING
- assert resp.attempts == 2
- assert resp.started_at == datetime.fromisoformat("2026-05-17T10:00:01+00:00")
- assert resp.bbox_refine_status == "pending"
+# ---------------------------------------------------------------------------
+# Extraction lifecycle
+# ---------------------------------------------------------------------------
+
+
+def test_extraction_minimal_parse() -> None:
+ ext = Extraction.model_validate(
+ {
+ "id": "ext_1",
+ "status": "queued",
+ "submitted_at": "2026-01-01T00:00:00Z",
+ }
+ )
+ assert ext.id == "ext_1"
+ assert ext.status == ExtractionStatus.QUEUED
-def test_webhook_payload_parses_full_shape() -> None:
- parsed = JobWebhookPayload.model_validate(
+def test_extraction_with_post_processing() -> None:
+ ext = Extraction.model_validate(
{
- "event_id": "evt-1",
- "event_type": "IDPJobCompleted",
- "job_id": "job-1",
- "status": "SUCCEEDED",
- "occurred_at": "2026-05-17T10:00:00+00:00",
- "metadata": {"caller": "test"},
+ "id": "ext_1",
+ "status": "succeeded",
+ "submitted_at": "2026-01-01T00:00:00Z",
+ "finished_at": "2026-01-01T00:01:00Z",
+ "post_processing": {
+ "bbox_refinement": {
+ "status": "running",
+ "started_at": "2026-01-01T00:01:00Z",
+ "attempts": 1,
+ }
+ },
}
)
- assert parsed.status is JobStatus.SUCCEEDED
- assert parsed.metadata == {"caller": "test"}
+ assert ext.post_processing is not None
+ assert ext.post_processing.bbox_refinement is not None
+ assert ext.post_processing.bbox_refinement.status == PostProcessingStatus.RUNNING
-def test_submit_job_response_parses() -> None:
- resp = SubmitJobResponse.model_validate(
+def test_extraction_extra_allow() -> None:
+ # Forward-compat: a new field shows up on the wire and the SDK
+ # surfaces it via ``model_extra`` rather than failing validation.
+ ext = Extraction.model_validate(
{
- "job_id": "job-xyz",
- "status": "QUEUED",
- "submitted_at": "2026-05-17T10:00:00+00:00",
+ "id": "ext_1",
+ "status": "queued",
+ "submitted_at": "2026-01-01T00:00:00Z",
+ "future_field": {"shiny": True},
}
)
- assert resp.status is JobStatus.QUEUED
+ assert ext.model_extra is not None
+ assert ext.model_extra["future_field"] == {"shiny": True}
-def test_job_list_response_parses() -> None:
- resp = JobListResponse.model_validate(
+# ---------------------------------------------------------------------------
+# ExtractionResult
+# ---------------------------------------------------------------------------
+
+
+def test_extraction_result_pipeline_nested() -> None:
+ result = ExtractionResult.model_validate(
{
- "items": [],
- "total": 0,
- "limit": 50,
- "offset": 0,
+ "id": "ext_1",
+ "status": "success",
+ "files": [],
+ "documents": [],
+ "discovered_documents": [],
+ "rule_results": [],
+ "request_transformations": [],
+ "pipeline": {
+ "model": "anthropic:claude-sonnet-4-6",
+ "latency_ms": 1234,
+ "trace": [],
+ "errors": [],
+ },
}
)
- assert resp.total == 0
- assert resp.items == []
-
-
-def test_unknown_job_status_raises_on_known_enum() -> None:
- # StrEnum still rejects unknown values when the field is typed
- # against the enum strictly -- only the webhook ``status`` is
- # permissive because we declared it as a string-mode field above.
- # Here we just sanity-check the typed enum still validates known
- # values:
- with pytest.raises(Exception): # noqa: B017
- JobStatus("ZZZ_UNKNOWN")
+ assert result.id == "ext_1"
+ # In v1 the model + latency live under ``pipeline``, not at top level.
+ assert result.pipeline.model == "anthropic:claude-sonnet-4-6"
+ assert result.pipeline.latency_ms == 1234
+
+
+def test_extraction_result_tolerates_unknown_fields() -> None:
+ payload = {
+ "id": "ext_1",
+ "status": "success",
+ "documents": [],
+ "discovered_documents": [],
+ "pipeline": {"model": "m", "latency_ms": 0},
+ "future_top_level": {"shiny": True},
+ }
+ result = ExtractionResult.model_validate(payload)
+ assert result.model_extra is not None
+ assert result.model_extra["future_top_level"] == {"shiny": True}
+
+
+# ---------------------------------------------------------------------------
+# Bounding box
+# ---------------------------------------------------------------------------
+
+
+def test_bounding_box_construct() -> None:
+ bbox = BoundingBox(xmin=0.1, ymin=0.2, xmax=0.5, ymax=0.6, source=BboxSource.PDF_TEXT)
+ assert bbox.source == BboxSource.PDF_TEXT
+
+
+# ---------------------------------------------------------------------------
+# EventEnvelope
+# ---------------------------------------------------------------------------
+
+
+def test_event_envelope_parses_completed_event() -> None:
+ body = {
+ "event_id": "evt-1",
+ "event_type": "extraction.completed",
+ "version": "1.0.0",
+ "occurred_at": "2026-01-01T00:00:00Z",
+ "extraction": {
+ "id": "ext_1",
+ "status": "succeeded",
+ "submitted_at": "2026-01-01T00:00:00Z",
+ },
+ "metadata": {"caller": "test"},
+ }
+ env = EventEnvelope.model_validate(body)
+ assert env.event_type == "extraction.completed"
+ assert env.extraction.status == ExtractionStatus.SUCCEEDED
+ assert env.metadata == {"caller": "test"}
+
+
+def test_event_envelope_defaults() -> None:
+ # Constructing without explicit event_id / occurred_at fills in defaults.
+ env = EventEnvelope(
+ event_type="extraction.submitted",
+ extraction=Extraction(
+ id="ext_1",
+ status=ExtractionStatus.QUEUED,
+ submitted_at=datetime.fromisoformat("2026-01-01T00:00:00+00:00"),
+ ),
+ )
+ assert env.event_id # auto-generated
+ assert env.occurred_at # auto-generated
+
+
+# ---------------------------------------------------------------------------
+# PostProcessing
+# ---------------------------------------------------------------------------
+
+
+def test_post_processing_optional_bbox_refinement() -> None:
+ pp = PostProcessing()
+ assert pp.bbox_refinement is None
+
+
+# ---------------------------------------------------------------------------
+# Hard-fail enum sanity
+# ---------------------------------------------------------------------------
+
+
+def test_unknown_extraction_status_value_raises() -> None:
+ with pytest.raises(ValueError):
+ ExtractionStatus("running_zzz")
diff --git a/sdks/python/tests/test_request_models.py b/sdks/python/tests/test_request_models.py
deleted file mode 100644
index df0714d..0000000
--- a/sdks/python/tests/test_request_models.py
+++ /dev/null
@@ -1,204 +0,0 @@
-# Copyright 2026 Firefly Software Solutions Inc
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Tests for the typed request-side models.
-
-These verify two things:
-
-1. Constructing a request with the typed sub-models produces the JSON
- the service expects on the wire (camelCase keys where required,
- snake_case for the rest).
-2. The :class:`ExtractionRequest` / :class:`SubmitJobRequest` envelopes
- still accept the legacy dict-based form so callers can mix-and-match
- while migrating.
-"""
-
-from __future__ import annotations
-
-from flydocs_sdk import (
- DocSpec,
- DocType,
- DocumentInput,
- ExtractionOptions,
- ExtractionRequest,
- FieldGroup,
- FieldSpec,
- FieldType,
- RuleFieldParent,
- RuleSpec,
- StageToggles,
- StandardFormat,
- StandardValidatorSpec,
- StandardValidatorType,
- SubmitJobRequest,
-)
-
-
-def test_field_spec_dumps_with_camelcase_keys() -> None:
- field = FieldSpec(
- field_name="total_amount",
- field_description="Total to pay",
- field_type=FieldType.NUMBER,
- required=True,
- minimum=0.0,
- standard_validators=[StandardValidatorSpec(type=StandardValidatorType.IBAN)],
- )
- dumped = field.model_dump(by_alias=True)
- # The service expects camelCase top-level keys for the field schema.
- assert dumped["name"] == "total_amount"
- assert dumped["description"] == "Total to pay"
- assert dumped["type"] == FieldType.NUMBER
- assert dumped["required"] is True
- assert dumped["minimum"] == 0.0
- assert dumped["standard_validators"][0]["type"] == "iban"
-
-
-def test_field_group_factory_constructs_named_fields() -> None:
- group = FieldGroup.of(
- "totals",
- FieldSpec(field_name="total", field_type=FieldType.NUMBER, required=True),
- FieldSpec(field_name="currency", field_type=FieldType.STRING, required=True),
- description="Invoice totals block",
- )
- dumped = group.model_dump(by_alias=True)
- assert dumped["fieldGroupName"] == "totals"
- assert dumped["fieldGroupDesc"] == "Invoice totals block"
- assert len(dumped["fieldGroupFields"]) == 2
-
-
-def test_doc_spec_round_trips() -> None:
- spec = DocSpec(
- doc_type=DocType(document_type="invoice", description="Vendor invoice"),
- field_groups=[
- FieldGroup.of(
- "totals",
- FieldSpec(field_name="total", field_type=FieldType.NUMBER, required=True),
- )
- ],
- )
- dumped = spec.model_dump(by_alias=True)
- assert dumped["docType"]["documentType"] == "invoice"
- assert dumped["fieldGroups"][0]["fieldGroupName"] == "totals"
-
-
-def test_standard_format_round_trips() -> None:
- field = FieldSpec(field_name="dob", field_type=FieldType.STRING, format=StandardFormat.DATE)
- dumped = field.model_dump(by_alias=True)
- assert dumped["format"] == "date"
-
-
-def test_rule_spec_round_trips_field_parent() -> None:
- rule = RuleSpec(
- id="total_matches_lines",
- predicate="Total equals the sum of line items",
- parents=[RuleFieldParent(document_type="invoice", field_names=["total", "line_items"])],
- )
- dumped = rule.model_dump(by_alias=True)
- assert dumped["id"] == "total_matches_lines"
- parent = dumped["parents"][0]
- assert parent["parentType"] == "field"
- assert parent["documentType"] == "invoice"
- assert parent["fieldNames"] == ["total", "line_items"]
-
-
-def test_stage_toggles_defaults_match_service_defaults() -> None:
- s = StageToggles()
- assert s.splitter is False
- assert s.classifier is True
- assert s.field_validation is True
- assert s.judge is False
- assert s.bbox_refine is False
-
-
-def test_extraction_options_typed() -> None:
- opts = ExtractionOptions(
- return_bboxes=True,
- language_hint="es",
- model="anthropic:claude-sonnet-4-6",
- stages=StageToggles(judge=True, bbox_refine=True),
- escalation_threshold=0.25,
- escalation_model="anthropic:claude-opus-4-7",
- )
- dumped = opts.model_dump(by_alias=True)
- assert dumped["language_hint"] == "es"
- assert dumped["stages"]["judge"] is True
- assert dumped["stages"]["bbox_refine"] is True
- assert dumped["escalation_threshold"] == 0.25
-
-
-def test_extraction_request_accepts_typed_models() -> None:
- req = ExtractionRequest(
- documents=[DocumentInput.from_bytes(b"%PDF-1.4", filename="x.pdf")],
- docs=[
- DocSpec(
- doc_type=DocType(document_type="invoice"),
- field_groups=[
- FieldGroup.of(
- "totals",
- FieldSpec(field_name="total", field_type=FieldType.NUMBER, required=True),
- )
- ],
- )
- ],
- rules=[RuleSpec(id="r1", predicate="Total > 0")],
- options=ExtractionOptions(stages=StageToggles(bbox_refine=True)),
- )
- dumped = req.model_dump(by_alias=True, mode="json")
- # JSON keys must match the service's contract.
- assert dumped["documents"][0]["filename"] == "x.pdf"
- assert dumped["docs"][0]["docType"]["documentType"] == "invoice"
- assert dumped["rules"][0]["id"] == "r1"
- assert dumped["options"]["stages"]["bbox_refine"] is True
-
-
-def test_extraction_request_still_accepts_dicts() -> None:
- # Forward-compat for callers who haven't migrated to the typed
- # request models yet.
- req = ExtractionRequest(
- documents=[DocumentInput.from_bytes(b"%PDF-1.4", filename="x.pdf")],
- docs=[
- {
- "docType": {"documentType": "invoice"},
- "fieldGroups": [{"fieldGroupName": "g", "fieldGroupFields": [{"name": "x"}]}],
- }
- ],
- options={"stages": {"bbox_refine": True}},
- )
- dumped = req.model_dump(by_alias=True, mode="json")
- assert dumped["docs"][0]["docType"]["documentType"] == "invoice"
- assert dumped["options"]["stages"]["bbox_refine"] is True
-
-
-def test_submit_job_request_accepts_typed_models() -> None:
- req = SubmitJobRequest(
- documents=[DocumentInput.from_bytes(b"%PDF-1.4", filename="x.pdf")],
- docs=[
- DocSpec(
- doc_type=DocType(document_type="invoice"),
- field_groups=[
- FieldGroup.of(
- "totals",
- FieldSpec(field_name="total", field_type=FieldType.NUMBER, required=True),
- )
- ],
- )
- ],
- options=ExtractionOptions(stages=StageToggles(judge=True)),
- callback_url="https://example.com/webhook",
- metadata={"caller": "test"},
- )
- dumped = req.model_dump(by_alias=True, mode="json")
- assert dumped["callback_url"] == "https://example.com/webhook"
- assert dumped["options"]["stages"]["judge"] is True
- assert dumped["metadata"] == {"caller": "test"}
diff --git a/sdks/python/tests/test_sync_client.py b/sdks/python/tests/test_sync_client.py
deleted file mode 100644
index 448fdbc..0000000
--- a/sdks/python/tests/test_sync_client.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# Copyright 2026 Firefly Software Solutions Inc
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Smoke tests for the synchronous client.
-
-The sync client wraps the async one; full coverage of the endpoint
-shapes lives in :mod:`tests.test_async_client`. The job of these tests
-is to prove the sync wrapper plumbs everything through correctly --
-the loop is created, calls succeed, and the context manager closes
-cleanly.
-"""
-
-from __future__ import annotations
-
-import httpx
-import respx
-
-from flydocs_sdk import FlydocsClient, JobStatus
-
-BASE_URL = "http://flydocs.test"
-
-
-@respx.mock
-def test_sync_version() -> None:
- respx.get(f"{BASE_URL}/api/v1/version").mock(
- return_value=httpx.Response(
- 200,
- json={
- "service": "flydocs",
- "version": "26.5.1",
- "model": "anthropic:claude-sonnet-4-6",
- "fallback_model": "",
- "eda_adapter": "postgres",
- },
- )
- )
- with FlydocsClient(BASE_URL) as client:
- info = client.version()
- assert info.service == "flydocs"
-
-
-@respx.mock
-def test_sync_submit_job_returns_typed_response() -> None:
- respx.post(f"{BASE_URL}/api/v1/jobs").mock(
- return_value=httpx.Response(
- 202,
- json={
- "job_id": "job-sync",
- "status": "QUEUED",
- "submitted_at": "2026-05-17T10:00:00+00:00",
- },
- )
- )
- with FlydocsClient(BASE_URL) as client:
- resp = client.submit_job(
- {
- "documents": [{"filename": "x.pdf", "content_base64": "YWJj"}],
- "docs": [{"docType": {"documentType": "invoice"}}],
- }
- )
- assert resp.status is JobStatus.QUEUED
-
-
-def test_sync_client_closed_after_exit() -> None:
- client = FlydocsClient(BASE_URL)
- client.close()
- # Second close is a no-op.
- client.close()
diff --git a/sdks/python/tests/test_wait_for_completion.py b/sdks/python/tests/test_wait_for_completion.py
deleted file mode 100644
index 686cc21..0000000
--- a/sdks/python/tests/test_wait_for_completion.py
+++ /dev/null
@@ -1,118 +0,0 @@
-# Copyright 2026 Firefly Software Solutions Inc
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Tests for the job-polling convenience helper.
-
-We pin three behaviours:
-
-1. The helper polls ``GET /api/v1/jobs/{id}`` until it sees a terminal
- status, then returns.
-2. It raises :class:`TimeoutError` if the deadline elapses before the
- worker finishes.
-3. The terminal-status set covers ``SUCCEEDED`` / ``PARTIAL_SUCCEEDED``
- / ``FAILED`` / ``CANCELLED``.
-"""
-
-from __future__ import annotations
-
-import httpx
-import pytest
-import respx
-
-from flydocs_sdk import AsyncFlydocsClient, JobStatus
-
-BASE_URL = "http://flydocs.test"
-
-
-def _status_body(status: str) -> dict[str, str]:
- return {
- "job_id": "job-1",
- "status": status,
- "submitted_at": "2026-05-17T10:00:00+00:00",
- }
-
-
-@respx.mock
-async def test_wait_for_completion_succeeds(async_client: AsyncFlydocsClient) -> None:
- # Three polls: QUEUED -> RUNNING -> SUCCEEDED. The helper should
- # return on the third poll without raising.
- respx.get(f"{BASE_URL}/api/v1/jobs/job-1").mock(
- side_effect=[
- httpx.Response(200, json=_status_body("QUEUED")),
- httpx.Response(200, json=_status_body("RUNNING")),
- httpx.Response(200, json=_status_body("SUCCEEDED")),
- ]
- )
- final = await async_client.wait_for_completion("job-1", poll_interval=0.001, timeout=5.0)
- assert final.status is JobStatus.SUCCEEDED
-
-
-@respx.mock
-async def test_wait_for_completion_returns_on_failure(
- async_client: AsyncFlydocsClient,
-) -> None:
- respx.get(f"{BASE_URL}/api/v1/jobs/job-1").mock(
- return_value=httpx.Response(200, json=_status_body("FAILED"))
- )
- final = await async_client.wait_for_completion("job-1", poll_interval=0.001, timeout=5.0)
- # FAILED is terminal too — caller decides what to do with it; the
- # helper does NOT raise.
- assert final.status is JobStatus.FAILED
-
-
-@respx.mock
-async def test_wait_for_completion_returns_on_cancelled(
- async_client: AsyncFlydocsClient,
-) -> None:
- respx.get(f"{BASE_URL}/api/v1/jobs/job-1").mock(
- return_value=httpx.Response(200, json=_status_body("CANCELLED"))
- )
- final = await async_client.wait_for_completion("job-1", poll_interval=0.001, timeout=5.0)
- assert final.status is JobStatus.CANCELLED
-
-
-@respx.mock
-async def test_wait_for_completion_returns_on_partial_succeeded(
- async_client: AsyncFlydocsClient,
-) -> None:
- respx.get(f"{BASE_URL}/api/v1/jobs/job-1").mock(
- return_value=httpx.Response(200, json=_status_body("PARTIAL_SUCCEEDED"))
- )
- final = await async_client.wait_for_completion("job-1", poll_interval=0.001, timeout=5.0)
- assert final.status is JobStatus.PARTIAL_SUCCEEDED
-
-
-@respx.mock
-async def test_wait_for_completion_times_out(async_client: AsyncFlydocsClient) -> None:
- respx.get(f"{BASE_URL}/api/v1/jobs/job-1").mock(
- return_value=httpx.Response(200, json=_status_body("RUNNING"))
- )
- with pytest.raises(TimeoutError, match="did not reach a terminal status"):
- await async_client.wait_for_completion("job-1", poll_interval=0.001, timeout=0.05)
-
-
-@respx.mock
-async def test_wait_for_completion_refining_bboxes_is_not_terminal(
- async_client: AsyncFlydocsClient,
-) -> None:
- # REFINING_BBOXES is an intermediate state — the helper should keep
- # polling until the bbox refiner finishes.
- respx.get(f"{BASE_URL}/api/v1/jobs/job-1").mock(
- side_effect=[
- httpx.Response(200, json=_status_body("REFINING_BBOXES")),
- httpx.Response(200, json=_status_body("SUCCEEDED")),
- ]
- )
- final = await async_client.wait_for_completion("job-1", poll_interval=0.001, timeout=5.0)
- assert final.status is JobStatus.SUCCEEDED
diff --git a/sdks/python/tests/test_webhooks.py b/sdks/python/tests/test_webhooks.py
index f24f669..257d849 100644
--- a/sdks/python/tests/test_webhooks.py
+++ b/sdks/python/tests/test_webhooks.py
@@ -12,61 +12,126 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-"""Tests for the HMAC webhook verifier.
+"""Tests for the v1 HMAC webhook verifier.
-Covers the three failure modes the SDK promises to detect: missing
-signature header, wrong scheme prefix, and digest mismatch. Plus a
-roundtrip with :meth:`WebhookVerifier.sign` so the two helpers can't
-drift out of sync.
+Pins three behaviours:
+
+1. A correctly-signed body verifies and is parsed into a typed
+ :class:`EventEnvelope`.
+2. Common failure modes -- missing header, wrong scheme prefix, digest
+ mismatch -- raise :class:`WebhookVerificationError`.
+3. :meth:`WebhookVerifier.sign` produces the canonical
+ ``sha256=`` form so callers can pin parity with the service.
"""
from __future__ import annotations
+import hashlib
+import hmac
+import json
+
import pytest
-from flydocs_sdk import WebhookVerificationError, WebhookVerifier
+from flydocs_sdk import (
+ EVENT_TYPE_EXTRACTION_COMPLETED,
+ EventEnvelope,
+ WebhookVerificationError,
+ WebhookVerifier,
+)
+
+SECRET = "topsecret"
+
+
+def _sign(body: bytes, secret: str = SECRET) -> str:
+ return "sha256=" + hmac.new(secret.encode(), body, hashlib.sha256).hexdigest()
+
+def _envelope_body() -> bytes:
+ return json.dumps(
+ {
+ "event_id": "e1",
+ "event_type": EVENT_TYPE_EXTRACTION_COMPLETED,
+ "version": "1.0.0",
+ "occurred_at": "2026-05-26T00:00:00Z",
+ "extraction": {
+ "id": "ext_1",
+ "status": "succeeded",
+ "submitted_at": "2026-05-26T00:00:00Z",
+ },
+ }
+ ).encode()
-def test_sign_and_verify_roundtrip() -> None:
- verifier = WebhookVerifier("topsecret")
- body = b'{"event_id":"abc","job_id":"job1","status":"SUCCEEDED"}'
- sig = verifier.sign(body)
+
+# ---------------------------------------------------------------------------
+# Round-trip
+# ---------------------------------------------------------------------------
+
+
+def test_sign_and_verify_round_trip() -> None:
+ body = _envelope_body()
+ sig = WebhookVerifier(SECRET).sign(body)
assert sig.startswith("sha256=")
- # round-trips with the scheme prefix
- verified = verifier.verify(body, sig)
- assert verified is body
- # and also accepts a bare hex digest (some proxies strip the scheme)
+ env = WebhookVerifier(SECRET).verify(body, sig)
+ assert isinstance(env, EventEnvelope)
+ assert env.event_type == EVENT_TYPE_EXTRACTION_COMPLETED
+ assert env.extraction.id == "ext_1"
+
+
+def test_verify_accepts_bare_hex() -> None:
+ body = _envelope_body()
+ sig = _sign(body)
bare = sig.split("=", 1)[1]
- assert verifier.verify(body, bare) is body
+ env = WebhookVerifier(SECRET).verify(body, bare)
+ assert isinstance(env, EventEnvelope)
+
+
+def test_sign_is_deterministic() -> None:
+ body = _envelope_body()
+ a = WebhookVerifier(SECRET).sign(body)
+ b = WebhookVerifier(SECRET).sign(body)
+ assert a == b
+
+
+# ---------------------------------------------------------------------------
+# Failure modes
+# ---------------------------------------------------------------------------
def test_verify_missing_header_raises() -> None:
- verifier = WebhookVerifier("topsecret")
with pytest.raises(WebhookVerificationError, match="signature header missing"):
- verifier.verify(b"{}", "")
+ WebhookVerifier(SECRET).verify(_envelope_body(), "")
-def test_verify_bad_scheme_raises() -> None:
- verifier = WebhookVerifier("topsecret")
+def test_verify_wrong_scheme_raises() -> None:
with pytest.raises(WebhookVerificationError, match="unsupported signature scheme"):
- verifier.verify(b"{}", "md5=deadbeef")
+ WebhookVerifier(SECRET).verify(_envelope_body(), "md5=deadbeef")
def test_verify_digest_mismatch_raises() -> None:
- verifier = WebhookVerifier("topsecret")
with pytest.raises(WebhookVerificationError, match="signature mismatch"):
- verifier.verify(b"{}", "sha256=" + "00" * 32)
+ WebhookVerifier(SECRET).verify(_envelope_body(), "sha256=" + "00" * 32)
+
+
+def test_verify_tampered_body_raises() -> None:
+ body = _envelope_body()
+ sig = _sign(body)
+ tampered = body.replace(b'"succeeded"', b'"failed" ')
+ with pytest.raises(WebhookVerificationError):
+ WebhookVerifier(SECRET).verify(tampered, sig)
+
+
+def test_verify_wrong_secret_raises() -> None:
+ body = _envelope_body()
+ sig = _sign(body, secret="someoneelses")
+ with pytest.raises(WebhookVerificationError):
+ WebhookVerifier(SECRET).verify(body, sig)
+
+
+# ---------------------------------------------------------------------------
+# Verifier construction
+# ---------------------------------------------------------------------------
def test_empty_secret_rejected() -> None:
with pytest.raises(ValueError):
WebhookVerifier("")
-
-
-def test_signing_is_deterministic() -> None:
- # Two verifiers with the same secret must produce the same digest
- # for the same body. Acts as a regression for any future change to
- # ``sign``.
- a = WebhookVerifier("s")
- b = WebhookVerifier("s")
- assert a.sign(b"hello") == b.sign(b"hello")
diff --git a/sdks/python/uv.lock b/sdks/python/uv.lock
index a55935d..6d97f9a 100644
--- a/sdks/python/uv.lock
+++ b/sdks/python/uv.lock
@@ -44,7 +44,7 @@ wheels = [
[[package]]
name = "flydocs-sdk"
-version = "26.5.1"
+version = "26.6.0"
source = { editable = "." }
dependencies = [
{ name = "httpx" },
diff --git a/src/flydocs/app.py b/src/flydocs/app.py
index 4b1d76d..8de341b 100644
--- a/src/flydocs/app.py
+++ b/src/flydocs/app.py
@@ -23,8 +23,8 @@
),
scan_packages=[
"flydocs.core", # @configuration class
- "flydocs.core.services.extract", # extract command handler
- "flydocs.core.services.jobs", # job command/query handlers
+ "flydocs.core.services.extract", # sync extract command handler
+ "flydocs.core.services.extractions", # async extraction handlers
"flydocs.web.controllers", # REST controllers
"flydocs.web.advice", # exception advice
],
diff --git a/src/flydocs/cli.py b/src/flydocs/cli.py
index 8dde2f4..3c60126 100644
--- a/src/flydocs/cli.py
+++ b/src/flydocs/cli.py
@@ -53,7 +53,7 @@ def cmd_serve(_: argparse.Namespace) -> int:
def cmd_worker(_: argparse.Namespace) -> int:
- """Boot pyfly, run the :class:`JobWorker` and :class:`JobReaper` together.
+ """Boot pyfly, run the :class:`ExtractionWorker` and :class:`ExtractionReaper`.
The reaper is colocated with the worker so a single container fulfils
both responsibilities: drain the EDA outbox AND revive orphans whose
@@ -71,31 +71,31 @@ async def _run() -> None:
from flydocs.config import IDPSettings
from flydocs.core.services.pipeline import PipelineOrchestrator
from flydocs.core.services.webhook import WebhookPublisher
- from flydocs.core.services.workers.job_reaper import JobReaper
- from flydocs.core.services.workers.job_worker import JobWorker
- from flydocs.models.repositories import ExtractionJobRepository
+ from flydocs.core.services.workers.job_reaper import ExtractionReaper
+ from flydocs.core.services.workers.job_worker import ExtractionWorker
+ from flydocs.models.repositories import ExtractionRepository
pyfly_app = PyFlyApplication(FlydocsApplication)
await pyfly_app.startup()
- worker: JobWorker | None = None
- reaper: JobReaper | None = None
+ worker: ExtractionWorker | None = None
+ reaper: ExtractionReaper | None = None
try:
container = pyfly_app.context.container
settings = container.resolve(IDPSettings)
- worker = JobWorker(
+ worker = ExtractionWorker(
orchestrator=container.resolve(PipelineOrchestrator),
- repository=container.resolve(ExtractionJobRepository),
+ repository=container.resolve(ExtractionRepository),
event_publisher=container.resolve(EventPublisher),
webhook=container.resolve(WebhookPublisher),
settings=settings,
)
- reaper = JobReaper(
- repository=container.resolve(ExtractionJobRepository),
+ reaper = ExtractionReaper(
+ repository=container.resolve(ExtractionRepository),
event_publisher=container.resolve(EventPublisher),
settings=settings,
)
- worker_task = asyncio.create_task(worker.run_forever(), name="job-worker")
- reaper_task = asyncio.create_task(reaper.run_forever(), name="job-reaper")
+ worker_task = asyncio.create_task(worker.run_forever(), name="extraction-worker")
+ reaper_task = asyncio.create_task(reaper.run_forever(), name="extraction-reaper")
done, pending = await asyncio.wait(
{worker_task, reaper_task},
return_when=asyncio.FIRST_COMPLETED,
@@ -131,7 +131,7 @@ async def _run() -> None:
from flydocs.core.services.webhook import WebhookPublisher
from flydocs.core.services.workers.bbox_reaper import BboxReaper
from flydocs.core.services.workers.bbox_refine_worker import BboxRefineWorker
- from flydocs.models.repositories import ExtractionJobRepository
+ from flydocs.models.repositories import ExtractionRepository
pyfly_app = PyFlyApplication(FlydocsApplication)
await pyfly_app.startup()
@@ -141,7 +141,7 @@ async def _run() -> None:
container = pyfly_app.context.container
settings = container.resolve(IDPSettings)
worker = BboxRefineWorker(
- repository=container.resolve(ExtractionJobRepository),
+ repository=container.resolve(ExtractionRepository),
event_publisher=container.resolve(EventPublisher),
webhook=container.resolve(WebhookPublisher),
normalizer=container.resolve(BinaryNormalizer),
@@ -149,7 +149,7 @@ async def _run() -> None:
settings=settings,
)
reaper = BboxReaper(
- repository=container.resolve(ExtractionJobRepository),
+ repository=container.resolve(ExtractionRepository),
event_publisher=container.resolve(EventPublisher),
settings=settings,
)
diff --git a/src/flydocs/config.py b/src/flydocs/config.py
index 461d1d7..7c876b5 100644
--- a/src/flydocs/config.py
+++ b/src/flydocs/config.py
@@ -41,15 +41,16 @@ class IDPSettings(BaseSettings):
# Postgres for persistence — no extra broker is required.
eda_adapter: str = Field(default="postgres", description="memory | postgres | redis | kafka")
redis_url: str = "redis://localhost:6379/0"
- jobs_topic: str = "flydocs.jobs"
- jobs_event_type: str = "IDPJobSubmitted"
- jobs_completed_event_type: str = "IDPJobCompleted"
- # Second-stage destination for the out-of-band bbox refiner. Triggered
- # by ``JobWorker`` after main extraction succeeds AND
+ # Main extraction topic. Workers subscribe to the
+ # ``extraction.submitted`` event type (declared as a constant in
+ # ``flydocs.interfaces.dtos.event``); the destination here is the
+ # broker channel the bus publishes / drains on.
+ jobs_topic: str = "flydocs.extractions"
+ # Post-processing topic for the out-of-band bbox refiner. Triggered
+ # by the ``ExtractionWorker`` after main extraction succeeds AND
# ``options.stages.bbox_refine == true``. Consumed by
# ``BboxRefineWorker``.
- bbox_refine_topic: str = "flydocs.bbox.refine"
- bbox_refine_event_type: str = "IDPBboxRefineRequested"
+ bbox_refine_topic: str = "flydocs.extractions.post_processing"
# Retry budget + timeout for the bbox refine leg, independent of the
# main extraction. Refinement is CPU-bound (PyMuPDF / OCR) so the
# default ceiling is generous.
@@ -111,13 +112,15 @@ class IDPSettings(BaseSettings):
bbox_refine_lease_s: int = 660
# ----- Reaper -----------------------------------------------------
# The reaper runs alongside each worker (one task per process) and
- # republishes events for jobs stuck in non-terminal states. It is
- # the only path that revives orphans:
- # * RUNNING whose claimant crashed past its lease;
- # * QUEUED whose submit handler crashed between row INSERT and
+ # republishes events for extractions stuck in non-terminal states.
+ # It is the only path that revives orphans:
+ # * ``running`` whose claimant crashed past its lease;
+ # * ``queued`` whose submit handler crashed between row INSERT and
# outbox PUBLISH (or whose worker died during ``_delayed_publish``);
- # * PARTIAL_SUCCEEDED whose bbox-refine event was never published;
- # * REFINING_BBOXES whose bbox claimant crashed past its lease.
+ # * ``succeeded`` with ``post_processing_bbox_status == pending``
+ # whose bbox-refine event was never published;
+ # * ``succeeded`` with ``post_processing_bbox_status == running``
+ # whose bbox claimant crashed past its lease.
# Each republish is deduped at claim time by the atomic ``mark_*``
# transitions, so running multiple replicas of the reaper is safe
# (it just wastes a few outbox INSERTs per stale job).
diff --git a/src/flydocs/core/configuration.py b/src/flydocs/core/configuration.py
index 7b4cff1..0a87fee 100644
--- a/src/flydocs/core/configuration.py
+++ b/src/flydocs/core/configuration.py
@@ -12,7 +12,7 @@
(``@service``/``@rest_controller``/``@command_handler``/``@query_handler``):
* Configuration: :class:`IDPSettings`
-* Infrastructure: :class:`ExtractionJobRepository`, :class:`WebhookPublisher`.
+* Infrastructure: :class:`ExtractionRepository`, :class:`WebhookPublisher`.
The :class:`pyfly.eda.EventPublisher` is provided upstream by
:class:`pyfly.eda.auto_configuration.EdaAutoConfiguration` (Postgres
outbox by default; see ``pyfly.yaml``).
@@ -74,8 +74,8 @@
from flydocs.core.services.transformations import LlmTransformer, TransformationEngine
from flydocs.core.services.validation import FieldValidator, RequestValidator
from flydocs.core.services.webhook import WebhookPublisher
-from flydocs.core.services.workers.job_worker import JobWorker
-from flydocs.models.repositories import ExtractionJobRepository
+from flydocs.core.services.workers.job_worker import ExtractionWorker
+from flydocs.models.repositories import ExtractionRepository
@configuration
@@ -91,8 +91,8 @@ def settings(self) -> IDPSettings:
return get_settings()
@bean
- def repository(self, settings: IDPSettings) -> ExtractionJobRepository:
- return ExtractionJobRepository.from_url(settings.database_url)
+ def repository(self, settings: IDPSettings) -> ExtractionRepository:
+ return ExtractionRepository.from_url(settings.database_url)
@bean
def webhook(self, settings: IDPSettings) -> WebhookPublisher:
@@ -113,7 +113,7 @@ def webhook(self, settings: IDPSettings) -> WebhookPublisher:
# ------------------------------------------------------------------
@bean(name="database_health")
- def database_health(self, repository: ExtractionJobRepository) -> SqlAlchemyHealthIndicator:
+ def database_health(self, repository: ExtractionRepository) -> SqlAlchemyHealthIndicator:
return SqlAlchemyHealthIndicator(repository.engine)
# ------------------------------------------------------------------
@@ -378,21 +378,21 @@ def orchestrator(
default_model=settings.model,
)
- # ``JobWorker`` is NOT a bean. It depends on the
+ # ``ExtractionWorker`` is NOT a bean. It depends on the
# :class:`EventPublisher` produced by pyfly's auto-configuration,
# which is registered AFTER user @configuration classes are
# processed. The CLI's ``flydocs worker`` command builds the
# worker manually post-startup so the ordering is correct.
- def _build_job_worker( # noqa: PLR0913 - explicit injection for the CLI helper
+ def _build_extraction_worker( # noqa: PLR0913 - explicit injection for the CLI helper
self,
orchestrator: PipelineOrchestrator,
- repository: ExtractionJobRepository,
+ repository: ExtractionRepository,
event_publisher: EventPublisher,
webhook: WebhookPublisher,
settings: IDPSettings,
- ) -> JobWorker:
- return JobWorker(
+ ) -> ExtractionWorker:
+ return ExtractionWorker(
orchestrator=orchestrator,
repository=repository,
event_publisher=event_publisher,
diff --git a/src/flydocs/core/services/authenticity/content_validator.py b/src/flydocs/core/services/authenticity/content_validator.py
index 06d2191..af7f4df 100644
--- a/src/flydocs/core/services/authenticity/content_validator.py
+++ b/src/flydocs/core/services/authenticity/content_validator.py
@@ -21,7 +21,7 @@
ContentAuthenticity,
ContentCoherenceCheck,
)
-from flydocs.interfaces.dtos.doc import DocSpec
+from flydocs.interfaces.dtos.document_type import DocumentTypeSpec
from flydocs.interfaces.enums.status import CheckStatus, ContentIntegrityStatus
logger = logging.getLogger(__name__)
@@ -57,14 +57,14 @@ async def check(
*,
document_bytes: bytes,
media_type: str,
- doc: DocSpec,
+ doc: DocumentTypeSpec,
intention: str,
model: str | None = None,
) -> ContentAuthenticity:
prompt = self._template.render(
- documentType=doc.docType.documentType,
- description=doc.docType.description,
- country=doc.docType.country,
+ documentType=doc.id,
+ description=doc.description,
+ country=doc.country,
intention=intention,
)
agent: FireflyAgent[Any, _ContentOutput] = FireflyAgent(
diff --git a/src/flydocs/core/services/authenticity/visual_validator.py b/src/flydocs/core/services/authenticity/visual_validator.py
index 6fe6016..cebca78 100644
--- a/src/flydocs/core/services/authenticity/visual_validator.py
+++ b/src/flydocs/core/services/authenticity/visual_validator.py
@@ -1,10 +1,10 @@
# Copyright 2026 Firefly Software Solutions Inc
-"""``VisualAuthenticityChecker`` -- runs caller-defined visual validators.
+"""``VisualAuthenticityChecker`` -- runs caller-defined visual checks.
-Each validator is a ``(name, description)`` pair the LLM evaluates
-against the document image; output is a :class:`VisualValidationOutcome`
-per validator with a yes/no verdict, confidence, and free-text notes.
-The prompt template is supplied by the DI container.
+Each check is a ``(name, description)`` pair the LLM evaluates against
+the document image; output is a :class:`VisualCheckResult` per check
+with a yes/no verdict, confidence, and free-text notes. The prompt
+template is supplied by the DI container.
"""
from __future__ import annotations
@@ -19,8 +19,8 @@
from pydantic import BaseModel, Field
from flydocs.core.observability import DEFAULT_MIDDLEWARE, timed_agent_run
-from flydocs.interfaces.dtos.authenticity import VisualValidationOutcome
-from flydocs.interfaces.dtos.doc import DocSpec
+from flydocs.interfaces.dtos.authenticity import VisualCheckResult
+from flydocs.interfaces.dtos.document_type import DocumentTypeSpec
logger = logging.getLogger(__name__)
@@ -53,21 +53,21 @@ async def check(
*,
document_bytes: bytes,
media_type: str,
- doc: DocSpec,
+ doc: DocumentTypeSpec,
intention: str,
model: str | None = None,
- ) -> list[VisualValidationOutcome]:
- if not doc.validators.visual:
+ ) -> list[VisualCheckResult]:
+ if not doc.visual_checks:
return []
validators_json = json.dumps(
- [v.model_dump(mode="json") for v in doc.validators.visual],
+ [v.model_dump(mode="json") for v in doc.visual_checks],
indent=2,
ensure_ascii=False,
)
prompt = self._template.render(
- documentType=doc.docType.documentType,
- country=doc.docType.country,
+ documentType=doc.id,
+ country=doc.country,
intention=intention,
validators_json=validators_json,
)
@@ -87,18 +87,16 @@ async def check(
]
run_result = await timed_agent_run(agent, content, op="visual_auth", model=model or self._model)
raw_by_name = {v.name: v for v in run_result.output.validations}
- outcomes: list[VisualValidationOutcome] = []
- for spec in doc.validators.visual:
+ outcomes: list[VisualCheckResult] = []
+ for spec in doc.visual_checks:
raw = raw_by_name.get(spec.name)
if raw is None:
outcomes.append(
- VisualValidationOutcome(
- name=spec.name, passed=False, confidence=0.0, notes="Not evaluated"
- )
+ VisualCheckResult(name=spec.name, passed=False, confidence=0.0, notes="Not evaluated")
)
continue
outcomes.append(
- VisualValidationOutcome(
+ VisualCheckResult(
name=raw.name, passed=bool(raw.passed), confidence=float(raw.confidence), notes=raw.notes
)
)
diff --git a/src/flydocs/core/services/bbox/bbox_refiner.py b/src/flydocs/core/services/bbox/bbox_refiner.py
index d6fffb3..67b48b9 100644
--- a/src/flydocs/core/services/bbox/bbox_refiner.py
+++ b/src/flydocs/core/services/bbox/bbox_refiner.py
@@ -82,14 +82,13 @@ async def refine(
leaves: list[ExtractedField] = []
targets: list[tuple[str, str, list[int] | None]] = []
for group in groups:
- for field in group.fieldGroupFields:
+ for field in group.fields:
self._collect_leaves(field, leaves)
for idx, field in enumerate(leaves):
- value_str = _value_as_string(field.fieldValueFound)
+ value_str = _value_as_string(field.value)
if not value_str:
- self._stamp_no_source(field.bbox)
continue
- targets.append((str(idx), value_str, field.pagesFound or None))
+ targets.append((str(idx), value_str, field.pages or None))
counters = _Counters()
counters.fields_seen = len(leaves)
@@ -99,7 +98,7 @@ async def refine(
# tokens / OCR CPU and risk replacing a high-confidence
# rectangle with a weaker one. Count those as already-grounded.
for field in leaves:
- src = field.bbox.source
+ src = field.bbox.source if field.bbox is not None else None
if src == BboxSource.PDF_TEXT:
counters.grounded_pdf_text += 1
elif src == BboxSource.OCR:
@@ -107,7 +106,7 @@ async def refine(
already_grounded_ids = {
str(idx)
for idx, field in enumerate(leaves)
- if field.bbox.source in (BboxSource.PDF_TEXT, BboxSource.OCR)
+ if field.bbox is not None and field.bbox.source in (BboxSource.PDF_TEXT, BboxSource.OCR)
}
residual_targets = [t for t in targets if t[0] not in already_grounded_ids]
if residual_targets:
@@ -116,8 +115,9 @@ async def refine(
field = leaves[int(field_id)]
match = results.get(field_id)
if match is None:
- field.bbox.source = BboxSource.LLM
- field.bbox.refinement_confidence = None
+ if field.bbox is not None:
+ field.bbox.source = BboxSource.LLM
+ field.bbox.refinement_confidence = None
counters.kept_llm += 1
continue
page_source = _page_source(match.page, pages)
@@ -150,8 +150,8 @@ async def refine(
def _collect_leaves(self, field: ExtractedField, sink: list[ExtractedField]) -> None:
"""Flatten array parents -- only leaf scalar fields are matched."""
- if isinstance(field.fieldValueFound, list):
- for child in field.fieldValueFound:
+ if isinstance(field.value, list):
+ for child in field.value:
if isinstance(child, ExtractedField):
self._collect_leaves(child, sink)
return
@@ -169,20 +169,14 @@ def _replace_bbox(field: ExtractedField, match: MatchResult, source: BboxSource)
ymin=match.ymin,
xmax=match.xmax,
ymax=match.ymax,
- quality=old.quality,
- quality_score=old.quality_score,
+ quality=old.quality if old is not None else None,
+ quality_score=old.quality_score if old is not None else 0.0,
source=source,
refinement_confidence=match.score,
)
field.bbox = new
- if match.page not in field.pagesFound:
- field.pagesFound = [match.page, *field.pagesFound]
-
- @staticmethod
- def _stamp_no_source(bbox: BoundingBox) -> None:
- # Empty values keep the placeholder bbox but stamp the source.
- if bbox.source is None:
- bbox.source = BboxSource.NONE
+ if match.page not in field.pages:
+ field.pages = [match.page, *field.pages]
@dataclass(slots=True)
diff --git a/src/flydocs/core/services/bbox/bbox_validator.py b/src/flydocs/core/services/bbox/bbox_validator.py
index 1b4c5a9..690d785 100644
--- a/src/flydocs/core/services/bbox/bbox_validator.py
+++ b/src/flydocs/core/services/bbox/bbox_validator.py
@@ -14,8 +14,6 @@
Heuristics, in order of priority (each maps to a verdict and a
weighted contribution to the score):
-* ``empty`` -- the field has no extracted value, or the bbox is the
- zero placeholder ``BoundingBox.empty()``. Score 0.0.
* ``invalid`` -- corners outside ``[0, 1]``, degenerate (xmin >= xmax
or ymin >= ymax), area exactly 0. Score 0.0.
* ``suspicious`` -- area too big to plausibly fence a single value:
@@ -26,6 +24,9 @@
or extreme aspect ratio (height/width > 30 or < 1/30).
* ``good`` -- everything else.
+In v1, fields with no extracted value carry ``bbox = None`` instead of
+a synthetic empty placeholder; the validator skips those entirely.
+
The score combines:
* ``area_score`` (1.0 inside [1e-4, 0.5], decays outside),
@@ -61,27 +62,28 @@ class BboxValidator:
def validate_groups(self, groups: list[ExtractedFieldGroup]) -> None:
"""Mutate every field's ``bbox`` in place with quality + score."""
for group in groups:
- for field in group.fieldGroupFields:
+ for field in group.fields:
self._validate_field(field)
def _validate_field(self, field: ExtractedField) -> None:
- bbox = field.bbox
- if bbox is None:
- return
- # Recurse into nested rows for array fields.
- if isinstance(field.fieldValueFound, list):
- for child in field.fieldValueFound:
+ # Recurse into nested rows for array fields before assessing
+ # this field's own bbox (which may be ``None`` for arrays).
+ if isinstance(field.value, list):
+ for child in field.value:
if isinstance(child, ExtractedField):
self._validate_field(child)
- if isinstance(child.fieldValueFound, list):
- for sub in child.fieldValueFound:
+ if isinstance(child.value, list):
+ for sub in child.value:
if isinstance(sub, ExtractedField):
self._validate_field(sub)
- if field.fieldValueFound is None:
- self._stamp(bbox, BboxQuality.EMPTY, 0.0)
+ bbox = field.bbox
+ if bbox is None:
+ return
+ if field.value is None:
+ field.bbox = None
return
if _is_zero_placeholder(bbox):
- self._stamp(bbox, BboxQuality.EMPTY, 0.0)
+ field.bbox = None
return
verdict, score = _classify(bbox)
self._stamp(bbox, verdict, score)
diff --git a/src/flydocs/core/services/binary/errors.py b/src/flydocs/core/services/binary/errors.py
index aead4bb..a42f6cf 100644
--- a/src/flydocs/core/services/binary/errors.py
+++ b/src/flydocs/core/services/binary/errors.py
@@ -32,7 +32,7 @@ class UnsupportedBinaryError(BinaryNormalizationError):
The caller needs to convert client-side or pick a different file.
"""
- code = "unsupported_binary"
+ code = "unsupported_file"
class EncryptedPdfError(BinaryNormalizationError):
diff --git a/src/flydocs/core/services/classification/classifier.py b/src/flydocs/core/services/classification/classifier.py
index 9ead130..fe9adaf 100644
--- a/src/flydocs/core/services/classification/classifier.py
+++ b/src/flydocs/core/services/classification/classifier.py
@@ -1,11 +1,11 @@
# Copyright 2026 Firefly Software Solutions Inc
-"""``DocumentClassifier`` -- pick which DocSpec a file matches.
+"""``DocumentClassifier`` -- pick which DocumentTypeSpec a file matches.
-Used in multi-file mode when the caller didn't pin a
-``document_type`` on a :class:`DocumentInput`. One LLM call per
-unclassified file -- the model sees the file bytes (multimodal) plus
-the list of candidate DocSpecs and returns the best match (or
-``"unmatched"`` when none fit).
+Used in multi-file mode when the caller didn't pin an ``expected_type``
+on a :class:`FileInput`. One LLM call per unclassified file -- the
+model sees the file bytes (multimodal) plus the list of candidate
+document types and returns the best match (or ``"unmatched"`` when none
+fit).
The service mirrors the design of :class:`MultimodalExtractor` /
:class:`DocumentSplitter`: prompt template injected through the
@@ -28,7 +28,7 @@
from pydantic import BaseModel, Field
from flydocs.core.observability import DEFAULT_MIDDLEWARE, timed_agent_run
-from flydocs.interfaces.dtos.doc import DocSpec
+from flydocs.interfaces.dtos.document_type import DocumentTypeSpec
logger = logging.getLogger(__name__)
@@ -47,7 +47,7 @@ class _ClassifierOutput(BaseModel):
class ClassificationResult:
"""One classifier verdict for a single input file."""
- document_type: str # canonical docType from candidates, or ``unmatched``
+ document_type: str # canonical id from candidates, or ``unmatched``
confidence: float = 0.0
description: str = ""
notes: str = ""
@@ -55,7 +55,7 @@ class ClassificationResult:
class DocumentClassifier:
- """Assign one DocSpec docType to one input file."""
+ """Assign one DocumentTypeSpec id to one input file."""
def __init__(
self,
@@ -74,7 +74,7 @@ async def classify(
document_bytes: bytes,
media_type: str,
filename: str,
- candidates: list[DocSpec],
+ candidates: list[DocumentTypeSpec],
intention: str,
model: str | None = None,
) -> ClassificationResult:
@@ -82,13 +82,13 @@ async def classify(
if not candidates:
return ClassificationResult(document_type=UNMATCHED, matched=False, notes="no candidates")
- known: set[str] = {c.docType.documentType for c in candidates}
+ known: set[str] = {c.id for c in candidates}
targets_json = json.dumps(
[
{
- "documentType": c.docType.documentType,
- "description": c.docType.description,
- "country": c.docType.country,
+ "id": c.id,
+ "description": c.description,
+ "country": c.country,
}
for c in candidates
],
diff --git a/src/flydocs/core/services/escalation/judge_escalator.py b/src/flydocs/core/services/escalation/judge_escalator.py
index 745f55f..d3db200 100644
--- a/src/flydocs/core/services/escalation/judge_escalator.py
+++ b/src/flydocs/core/services/escalation/judge_escalator.py
@@ -158,13 +158,17 @@ async def _re_judge(doc_type: str) -> None:
return info
def _resolve_threshold(self, request: ExtractionRequest) -> float:
- t = request.options.escalation_threshold
+ cfg = request.options.escalation
+ t = cfg.threshold if cfg is not None else None
if t is None:
t = self._default_threshold
return max(0.0, min(1.0, float(t)))
def _resolve_model(self, request: ExtractionRequest) -> str | None:
- return request.options.escalation_model or self._default_model
+ cfg = request.options.escalation
+ if cfg is not None and cfg.model:
+ return cfg.model
+ return self._default_model
def _count_failures(per_doc_extracted: dict[str, list[ExtractedFieldGroup]]) -> tuple[int, int]:
@@ -183,13 +187,13 @@ def _count_failures(per_doc_extracted: dict[str, list[ExtractedFieldGroup]]) ->
total = 0
for groups in per_doc_extracted.values():
for group in groups:
- for field in group.fieldGroupFields:
+ for field in group.fields:
judge = field.judge
if judge is None or not judge.status:
continue
# JudgeStatus may be Enum or str depending on rebuild.
status_value = getattr(judge.status, "value", judge.status)
total += 1
- if status_value == "FAIL" or judge.flag_for_review:
+ if status_value == "fail" or judge.flag_for_review:
fail += 1
return fail, total
diff --git a/src/flydocs/core/services/extract/extract_handler.py b/src/flydocs/core/services/extract/extract_handler.py
index bce4690..0215de5 100644
--- a/src/flydocs/core/services/extract/extract_handler.py
+++ b/src/flydocs/core/services/extract/extract_handler.py
@@ -17,6 +17,21 @@
logger = logging.getLogger(__name__)
+class ExtractionTimedOutError(RuntimeError):
+ """Raised when sync extraction exceeds ``FLYDOCS_SYNC_TIMEOUT_S``.
+
+ Subclasses :class:`RuntimeError` so the pyfly CQRS bus lets it
+ propagate to the controller's exception handler (asyncio's
+ :class:`TimeoutError` extends :class:`OSError` and the bus would
+ otherwise wrap it as a generic ``COMMAND_PROCESSING_ERROR`` with
+ HTTP 400).
+ """
+
+ def __init__(self, timeout_s: int) -> None:
+ super().__init__(f"extraction did not finish within {timeout_s}s")
+ self.timeout_s = timeout_s
+
+
@command_handler
@service
class ExtractHandler(CommandHandler[ExtractCommand, ExtractionResult]):
@@ -33,4 +48,4 @@ async def do_handle(self, command: ExtractCommand) -> ExtractionResult:
)
except TimeoutError as exc:
logger.warning("Sync extraction timed out after %ds", self._settings.sync_timeout_s)
- raise TimeoutError(f"extraction did not finish within {self._settings.sync_timeout_s}s") from exc
+ raise ExtractionTimedOutError(self._settings.sync_timeout_s) from exc
diff --git a/src/flydocs/core/services/extraction/extractor.py b/src/flydocs/core/services/extraction/extractor.py
index 564fd5b..90c85a8 100644
--- a/src/flydocs/core/services/extraction/extractor.py
+++ b/src/flydocs/core/services/extraction/extractor.py
@@ -1,5 +1,5 @@
# Copyright 2026 Firefly Software Solutions Inc
-"""``MultimodalExtractor`` -- one LLM call per :class:`DocSpec`.
+"""``MultimodalExtractor`` -- one LLM call per :class:`DocumentTypeSpec`.
The extractor produces both **fields and bounding boxes** in one shot;
there is no separate bbox-finder stage. Document bytes are shipped
@@ -27,7 +27,7 @@
from flydocs.core.services.extraction.postprocess import normalise_doc
from flydocs.core.services.extraction.schema import build_extraction_output_model
from flydocs.core.services.extraction.text_anchor import NoOpTextAnchor, TextAnchor
-from flydocs.interfaces.dtos.doc import DocSpec
+from flydocs.interfaces.dtos.document_type import DocumentTypeSpec
from flydocs.interfaces.dtos.field import ExtractedField, ExtractedFieldGroup
from flydocs.interfaces.enums.field_type import FieldType
@@ -62,13 +62,13 @@ def __init__(
# ------------------------------------------------------------------
# Array-empty retry parameters
# ------------------------------------------------------------------
- # When a docspec declares array fields and the first extraction pass
- # returns ``rows=[]`` for one or more of them on a multi-page document,
- # we re-call the LLM ONCE with a focused "you missed these arrays"
- # prompt. This corrects a well-known structured-output failure mode
- # where Anthropic models default arrays to empty under verbose
- # schemas or generic intentions. The threshold + counter live here
- # so consumers can tune them via subclassing if needed.
+ # When a document type declares array fields and the first extraction
+ # pass returns ``rows=[]`` for one or more of them on a multi-page
+ # document, we re-call the LLM ONCE with a focused "you missed these
+ # arrays" prompt. This corrects a well-known structured-output
+ # failure mode where Anthropic models default arrays to empty under
+ # verbose schemas or generic intentions. The threshold + counter
+ # live here so consumers can tune them via subclassing if needed.
_ARRAY_RETRY_MIN_PAGES = 3
_ARRAY_RETRY_MAX_ATTEMPTS = 1
@@ -78,7 +78,7 @@ async def extract(
document_bytes: bytes,
media_type: str,
page_count: int,
- doc: DocSpec,
+ doc: DocumentTypeSpec,
intention: str,
language_hint: str | None = None,
model: str | None = None,
@@ -107,7 +107,7 @@ async def extract(
if empty_arrays:
logger.info(
"extract.empty_array_retry triggered: doc=%s pages=%d empty=%s",
- doc.docType.documentType,
+ doc.id,
page_count,
empty_arrays,
)
@@ -133,7 +133,7 @@ async def _extract_retry_arrays(
document_bytes: bytes,
media_type: str,
page_count: int,
- doc: DocSpec,
+ doc: DocumentTypeSpec,
base_intention: str,
empty_arrays: list[str],
language_hint: str | None,
@@ -174,7 +174,7 @@ async def _extract_once(
document_bytes: bytes,
media_type: str,
page_count: int,
- doc: DocSpec,
+ doc: DocumentTypeSpec,
intention: str,
language_hint: str | None,
model: str | None,
@@ -261,7 +261,7 @@ def _render_anchor(self, document_bytes: bytes, media_type: str) -> str | None:
# ------------------------------------------------------------------
@staticmethod
- def _suspicious_empty_arrays(doc: DocSpec, groups: list[ExtractedFieldGroup]) -> list[str]:
+ def _suspicious_empty_arrays(doc: DocumentTypeSpec, groups: list[ExtractedFieldGroup]) -> list[str]:
"""Names of array fields that look like a structured-output
empty-default rather than a legitimately empty result.
@@ -274,23 +274,23 @@ def _suspicious_empty_arrays(doc: DocSpec, groups: list[ExtractedFieldGroup]) ->
not a genuine zero-evidence document).
"""
array_field_names: set[str] = {
- f.fieldName for g in doc.fieldGroups for f in g.fieldGroupFields if f.fieldType == FieldType.ARRAY
+ f.name for g in doc.field_groups for f in g.fields if f.type == FieldType.ARRAY
}
if not array_field_names:
return []
empty: list[str] = []
any_filled = False
for group in groups:
- for field in group.fieldGroupFields:
- if field.fieldName not in array_field_names:
+ for field in group.fields:
+ if field.name not in array_field_names:
continue
- value = field.fieldValueFound
+ value = field.value
if not isinstance(value, list):
continue
if value:
any_filled = True
else:
- empty.append(field.fieldName)
+ empty.append(field.name)
# A filled sibling array means the model engaged with the
# schema; the empty ones are genuine zero-evidence results.
if any_filled:
@@ -306,26 +306,24 @@ def _merge_after_retry(
) -> list[ExtractedFieldGroup]:
"""Keep retry's results for the targeted array fields; preserve
originals for everything else."""
- retry_by_group: dict[str, ExtractedFieldGroup] = {g.fieldGroupName: g for g in retry}
+ retry_by_group: dict[str, ExtractedFieldGroup] = {g.name: g for g in retry}
merged: list[ExtractedFieldGroup] = []
for orig_group in original:
- retry_group = retry_by_group.get(orig_group.fieldGroupName)
+ retry_group = retry_by_group.get(orig_group.name)
if retry_group is None:
merged.append(orig_group)
continue
- retry_fields_by_name: dict[str, ExtractedField] = {
- f.fieldName: f for f in retry_group.fieldGroupFields
- }
+ retry_fields_by_name: dict[str, ExtractedField] = {f.name: f for f in retry_group.fields}
new_fields: list[ExtractedField] = []
- for orig_field in orig_group.fieldGroupFields:
- if orig_field.fieldName in empty_arrays and orig_field.fieldName in retry_fields_by_name:
- new_fields.append(retry_fields_by_name[orig_field.fieldName])
+ for orig_field in orig_group.fields:
+ if orig_field.name in empty_arrays and orig_field.name in retry_fields_by_name:
+ new_fields.append(retry_fields_by_name[orig_field.name])
else:
new_fields.append(orig_field)
merged.append(
ExtractedFieldGroup(
- fieldGroupName=orig_group.fieldGroupName,
- fieldGroupFields=new_fields,
+ name=orig_group.name,
+ fields=new_fields,
)
)
return merged
@@ -355,12 +353,12 @@ def _build_agent(self, model_id: str, output_model: type, *, instructions: str)
# Empirically, Sonnet 4.6 (and other Anthropic models under
# structured-output) start to *default array fields to ``[]``* when
- # the schema JSON in the prompt grows verbose. A docspec with long
- # paragraph-style ``fieldGroupDesc`` and multi-sentence
- # ``fieldDescription`` values triggers this safety fallback even on
- # documents that plainly contain matching rows. We compress the
- # schema we send to the LLM (without mutating the caller's docspec)
- # so descriptions stay informative but compact.
+ # the schema JSON in the prompt grows verbose. A long ``description``
+ # on the document type and multi-sentence field descriptions trigger
+ # this safety fallback even on documents that plainly contain
+ # matching rows. We compress the schema we send to the LLM (without
+ # mutating the caller's spec) so descriptions stay informative but
+ # compact.
_SCHEMA_GROUP_DESC_MAX = 180
_SCHEMA_FIELD_DESC_MAX = 160
_SCHEMA_ITEM_DESC_MAX = 140
@@ -376,24 +374,23 @@ def _compress(cls, text: str | None, limit: int) -> str | None:
cut = cleaned[:limit].rsplit(". ", 1)[0]
return cut.rstrip(".") + "."
- def _schema_payload(self, doc: DocSpec) -> str:
+ def _schema_payload(self, doc: DocumentTypeSpec) -> str:
schema = {
- "documentType": doc.docType.documentType,
- "description": self._compress(doc.docType.description, self._SCHEMA_FIELD_DESC_MAX),
- "country": doc.docType.country,
- "fieldGroups": [self._compress_group(g) for g in doc.fieldGroups],
+ "id": doc.id,
+ "description": self._compress(doc.description, self._SCHEMA_FIELD_DESC_MAX),
+ "country": doc.country,
+ "field_groups": [self._compress_group(g) for g in doc.field_groups],
}
return json.dumps(schema, indent=2, ensure_ascii=False)
def _compress_group(self, group: Any) -> dict[str, Any]:
raw = group.model_dump(mode="json", exclude_none=True)
- raw["fieldGroupDesc"] = self._compress(raw.get("fieldGroupDesc"), self._SCHEMA_GROUP_DESC_MAX)
- for field in raw.get("fieldGroupFields", []):
- field["fieldDescription"] = self._compress(
- field.get("fieldDescription"), self._SCHEMA_FIELD_DESC_MAX
- )
- for item in field.get("items") or []:
- item["fieldDescription"] = self._compress(
- item.get("fieldDescription"), self._SCHEMA_ITEM_DESC_MAX
- )
+ raw["description"] = self._compress(raw.get("description"), self._SCHEMA_GROUP_DESC_MAX)
+ for field in raw.get("fields", []) or []:
+ field["description"] = self._compress(field.get("description"), self._SCHEMA_FIELD_DESC_MAX)
+ items = field.get("items")
+ if isinstance(items, dict):
+ items["description"] = self._compress(items.get("description"), self._SCHEMA_ITEM_DESC_MAX)
+ for sub in items.get("fields", []) or []:
+ sub["description"] = self._compress(sub.get("description"), self._SCHEMA_ITEM_DESC_MAX)
return raw
diff --git a/src/flydocs/core/services/extraction/pdf_slicer.py b/src/flydocs/core/services/extraction/pdf_slicer.py
index 06d9063..70525b7 100644
--- a/src/flydocs/core/services/extraction/pdf_slicer.py
+++ b/src/flydocs/core/services/extraction/pdf_slicer.py
@@ -2,9 +2,9 @@
"""Slice a PDF document by 1-indexed page range, returning a new PDF blob.
Used by the orchestrator after the :class:`DocumentSplitter` has mapped
-each requested ``DocSpec`` to its corresponding page range, so the
-downstream extractor / authenticity / judge nodes receive only the
-relevant pages.
+each requested :class:`DocumentTypeSpec` to its corresponding page
+range, so the downstream extractor / authenticity / judge nodes receive
+only the relevant pages.
For non-PDF documents this helper is a no-op (the bytes are returned
unchanged), because the splitter never reports a page range for them.
diff --git a/src/flydocs/core/services/extraction/postprocess.py b/src/flydocs/core/services/extraction/postprocess.py
index 21f7a66..b30ae09 100644
--- a/src/flydocs/core/services/extraction/postprocess.py
+++ b/src/flydocs/core/services/extraction/postprocess.py
@@ -1,7 +1,7 @@
# Copyright 2026 Firefly Software Solutions Inc
-"""Convert the raw LLM output for one :class:`DocSpec` into a stable
-list of :class:`ExtractedFieldGroup` -- value-coerced, bbox-clamped,
-field-order preserved.
+"""Convert the raw LLM output for one :class:`DocumentTypeSpec` into a
+stable list of :class:`ExtractedFieldGroup` -- value-coerced,
+bbox-clamped, field-order preserved.
"""
from __future__ import annotations
@@ -11,18 +11,34 @@
from pydantic import BaseModel
from flydocs.core.services.extraction.schema import clamp_bbox, coerce_scalar
-from flydocs.interfaces.dtos.bbox import BoundingBox
-from flydocs.interfaces.dtos.doc import DocSpec
+from flydocs.interfaces.dtos.document_type import DocumentTypeSpec
from flydocs.interfaces.dtos.field import (
ExtractedField,
ExtractedFieldGroup,
- FieldSpec,
+)
+from flydocs.interfaces.dtos.field import (
+ Field as FieldSpec,
)
from flydocs.interfaces.enums.field_type import FieldType
+def _row_items(spec: FieldSpec) -> list[FieldSpec]:
+ """Return the per-row sub-field specs for an array field.
+
+ Mirrors ``schema._items_specs`` -- v1 :class:`Field` is recursive
+ so arrays carry their row shape under ``items`` (typically a Field
+ of type ``object`` whose ``fields`` list contains the columns).
+ """
+ if spec.items is None:
+ return []
+ items_field = spec.items
+ if items_field.type == FieldType.OBJECT and items_field.fields:
+ return list(items_field.fields)
+ return [items_field]
+
+
def _scalar_from_payload(spec: FieldSpec, payload: dict[str, Any]) -> ExtractedField:
- value = coerce_scalar(spec.fieldType, payload.get("value"))
+ value = coerce_scalar(spec.type, payload.get("value"))
confidence = _clamp01(payload.get("confidence", 0.0))
page = _int_or_none(payload.get("page"))
bbox = clamp_bbox(payload.get("bbox"))
@@ -30,13 +46,13 @@ def _scalar_from_payload(spec: FieldSpec, payload: dict[str, Any]) -> ExtractedF
if value is None:
page = None
- bbox = BoundingBox.empty()
+ bbox = None
pages: list[int] = [page] if page is not None else []
return ExtractedField(
- fieldName=spec.fieldName,
- fieldValueFound=value,
- pagesFound=pages,
+ name=spec.name,
+ value=value,
+ pages=pages,
confidence=confidence,
bbox=bbox,
notes=notes,
@@ -48,58 +64,56 @@ def _array_from_payload(spec: FieldSpec, payload: dict[str, Any]) -> ExtractedFi
coerced_rows: list[ExtractedField] = []
page_set: set[int] = set()
+ items = _row_items(spec)
for row_idx, row in enumerate(rows):
if not isinstance(row, dict):
continue
sub_fields: list[ExtractedField] = []
- for item in spec.items or []:
- item_payload = row.get(item.fieldName)
+ for item in items:
+ item_payload = row.get(item.name)
if not isinstance(item_payload, dict):
- sub_fields.append(ExtractedField(fieldName=item.fieldName, fieldValueFound=None))
+ sub_fields.append(ExtractedField(name=item.name, value=None))
continue
- item_spec = FieldSpec.model_validate(
- item.model_dump() | {"name": item.fieldName, "type": item.fieldType}
- )
- sub_field = _scalar_from_payload(item_spec, item_payload)
+ sub_field = _scalar_from_payload(item, item_payload)
sub_fields.append(sub_field)
- page_set.update(sub_field.pagesFound)
+ page_set.update(sub_field.pages)
coerced_rows.append(
ExtractedField(
- fieldName=f"row_{row_idx + 1}",
- fieldValueFound=sub_fields,
- pagesFound=sorted(page_set),
+ name=f"row_{row_idx + 1}",
+ value=sub_fields,
+ pages=sorted(page_set),
)
)
- pages_from_payload = payload.get("pagesFound") or []
+ pages_from_payload = payload.get("pages") or []
pages = sorted({int(p) for p in pages_from_payload if isinstance(p, int) and p >= 1} | page_set)
return ExtractedField(
- fieldName=spec.fieldName,
- fieldValueFound=coerced_rows,
- pagesFound=pages,
+ name=spec.name,
+ value=coerced_rows,
+ pages=pages,
confidence=_clamp01(payload.get("confidence", 0.0)),
notes=_maybe_str(payload.get("notes")),
)
-def normalise_doc(raw_output: BaseModel, doc: DocSpec) -> list[ExtractedFieldGroup]:
- """Build the public list of :class:`ExtractedFieldGroup` for one doc."""
+def normalise_doc(raw_output: BaseModel, doc: DocumentTypeSpec) -> list[ExtractedFieldGroup]:
+ """Build the public list of :class:`ExtractedFieldGroup` for one document type."""
output_dict = raw_output.model_dump(by_alias=True)
groups: list[ExtractedFieldGroup] = []
- for group in doc.fieldGroups:
- group_payload = output_dict.get(group.fieldGroupName) or {}
+ for group in doc.field_groups:
+ group_payload = output_dict.get(group.name) or {}
if not isinstance(group_payload, dict):
group_payload = {}
fields: list[ExtractedField] = []
- for spec in group.fieldGroupFields:
- field_payload = group_payload.get(spec.fieldName)
+ for spec in group.fields:
+ field_payload = group_payload.get(spec.name)
if not isinstance(field_payload, dict):
field_payload = {}
- if spec.fieldType == FieldType.ARRAY:
+ if spec.type == FieldType.ARRAY:
fields.append(_array_from_payload(spec, field_payload))
else:
fields.append(_scalar_from_payload(spec, field_payload))
- groups.append(ExtractedFieldGroup(fieldGroupName=group.fieldGroupName, fieldGroupFields=fields))
+ groups.append(ExtractedFieldGroup(name=group.name, fields=fields))
return groups
diff --git a/src/flydocs/core/services/extraction/schema.py b/src/flydocs/core/services/extraction/schema.py
index a7a9286..da8b940 100644
--- a/src/flydocs/core/services/extraction/schema.py
+++ b/src/flydocs/core/services/extraction/schema.py
@@ -1,5 +1,5 @@
# Copyright 2026 Firefly Software Solutions Inc
-"""Dynamic Pydantic models built from a :class:`DocSpec`.
+"""Dynamic Pydantic models built from a :class:`DocumentTypeSpec`.
We build a fresh model per request because every caller's schema
differs. Each field becomes a sub-model carrying ``value, confidence,
@@ -13,11 +13,12 @@
import re
from typing import Any
-from pydantic import BaseModel, ConfigDict, Field, create_model
+from pydantic import BaseModel, ConfigDict, create_model
+from pydantic import Field as _PydField
from flydocs.interfaces.dtos.bbox import BoundingBox
-from flydocs.interfaces.dtos.doc import DocSpec
-from flydocs.interfaces.dtos.field import FieldItem, FieldSpec
+from flydocs.interfaces.dtos.document_type import DocumentTypeSpec
+from flydocs.interfaces.dtos.field import Field as FieldSpec
from flydocs.interfaces.enums.field_type import FieldType
@@ -54,40 +55,62 @@ def _scalar_field_model(name: str, field_type: FieldType) -> type[BaseModel]:
"""Build the schema for a primitive field."""
return create_model(
f"Scalar_{_safe_attr(name)}",
- value=(_python_type(field_type), Field(default=None)),
- confidence=(float, Field(default=0.0, ge=0.0, le=1.0)),
- page=(int | None, Field(default=None, ge=1)),
- bbox=(_RawBBox, Field(default_factory=_RawBBox)),
- notes=(str | None, Field(default=None)),
+ value=(_python_type(field_type), _PydField(default=None)),
+ confidence=(float, _PydField(default=0.0, ge=0.0, le=1.0)),
+ page=(int | None, _PydField(default=None, ge=1)),
+ bbox=(_RawBBox, _PydField(default_factory=_RawBBox)),
+ notes=(str | None, _PydField(default=None)),
__config__=ConfigDict(extra="ignore"), # type: ignore[arg-type]
)
-def _row_model(items: list[FieldItem]) -> type[BaseModel]:
+def _items_specs(spec: FieldSpec) -> list[FieldSpec]:
+ """Return the per-row sub-field specs for an array field.
+
+ In v1 :class:`Field` is recursive: an array's row shape lives in
+ ``items`` (a single Field, typically of type ``object``). The
+ object's sub-fields live in ``items.fields``. This helper unwraps
+ both layers and returns the row's sub-field specs in declaration
+ order. Returns ``[]`` when the array doesn't declare a row schema.
+ """
+ if spec.items is None:
+ return []
+ items_field = spec.items
+ if items_field.type == FieldType.OBJECT and items_field.fields:
+ return list(items_field.fields)
+ # Allow a single recursive primitive items definition for simple
+ # arrays -- its own ``name`` becomes the lone column name.
+ return [items_field]
+
+
+def _row_model(items: list[FieldSpec]) -> type[BaseModel]:
"""Build the schema for one row of an array field."""
fields: dict[str, Any] = {}
used: set[str] = set()
for item in items:
- attr = _safe_attr(item.fieldName)
+ attr = _safe_attr(item.name)
while attr in used:
attr = f"{attr}_"
used.add(attr)
fields[attr] = (
- _scalar_field_model(item.fieldName, item.fieldType),
- Field(default_factory=_scalar_field_model(item.fieldName, item.fieldType), alias=item.fieldName),
+ _scalar_field_model(item.name, item.type),
+ _PydField(
+ default_factory=_scalar_field_model(item.name, item.type),
+ alias=item.name,
+ ),
)
config = ConfigDict(populate_by_name=True, extra="ignore")
return create_model("ArrayRow", __config__=config, **fields) # type: ignore[call-overload]
def _array_field_model(spec: FieldSpec) -> type[BaseModel]:
- row_cls = _row_model(spec.items or [])
+ row_cls = _row_model(_items_specs(spec))
return create_model(
- f"Array_{_safe_attr(spec.fieldName)}",
- rows=(list[row_cls], Field(default_factory=list)), # type: ignore[valid-type]
- pagesFound=(list[int], Field(default_factory=list)),
- confidence=(float, Field(default=0.0, ge=0.0, le=1.0)),
- notes=(str | None, Field(default=None)),
+ f"Array_{_safe_attr(spec.name)}",
+ rows=(list[row_cls], _PydField(default_factory=list)), # type: ignore[valid-type]
+ pages=(list[int], _PydField(default_factory=list)),
+ confidence=(float, _PydField(default=0.0, ge=0.0, le=1.0)),
+ notes=(str | None, _PydField(default=None)),
__config__=ConfigDict(extra="ignore"), # type: ignore[arg-type]
)
@@ -96,33 +119,33 @@ def build_field_group_model(group_name: str, specs: list[FieldSpec]) -> type[Bas
fields: dict[str, Any] = {}
used: set[str] = set()
for spec in specs:
- attr = _safe_attr(spec.fieldName)
+ attr = _safe_attr(spec.name)
while attr in used:
attr = f"{attr}_"
used.add(attr)
- if spec.fieldType == FieldType.ARRAY:
+ if spec.type == FieldType.ARRAY:
sub_cls = _array_field_model(spec)
else:
- sub_cls = _scalar_field_model(spec.fieldName, spec.fieldType)
- fields[attr] = (sub_cls, Field(default_factory=sub_cls, alias=spec.fieldName))
+ sub_cls = _scalar_field_model(spec.name, spec.type)
+ fields[attr] = (sub_cls, _PydField(default_factory=sub_cls, alias=spec.name))
config = ConfigDict(populate_by_name=True, extra="ignore")
return create_model(f"Group_{_safe_attr(group_name)}", __config__=config, **fields) # type: ignore[call-overload]
-def build_extraction_output_model(doc: DocSpec) -> type[BaseModel]:
- """Produce the dynamic output model the LLM must return for a single doc."""
+def build_extraction_output_model(doc: DocumentTypeSpec) -> type[BaseModel]:
+ """Produce the dynamic output model the LLM must return for a single document type."""
groups: dict[str, Any] = {}
used: set[str] = set()
- for group in doc.fieldGroups:
- attr = _safe_attr(group.fieldGroupName, prefix="g")
+ for group in doc.field_groups:
+ attr = _safe_attr(group.name, prefix="g")
while attr in used:
attr = f"{attr}_"
used.add(attr)
- sub_cls = build_field_group_model(group.fieldGroupName, group.fieldGroupFields)
- groups[attr] = (sub_cls, Field(default_factory=sub_cls, alias=group.fieldGroupName))
+ sub_cls = build_field_group_model(group.name, group.fields)
+ groups[attr] = (sub_cls, _PydField(default_factory=sub_cls, alias=group.name))
config = ConfigDict(populate_by_name=True, extra="ignore")
return create_model(
- f"ExtractionOutput_{_safe_attr(doc.docType.documentType)}",
+ f"ExtractionOutput_{_safe_attr(doc.id)}",
__config__=config,
**groups,
)
@@ -152,18 +175,24 @@ def coerce_scalar(field_type: FieldType, raw: Any) -> Any:
return raw
-def clamp_bbox(box: _RawBBox | dict[str, Any] | None) -> BoundingBox:
+def clamp_bbox(box: _RawBBox | dict[str, Any] | None) -> BoundingBox | None:
+ """Clamp raw LLM bbox coords into a real :class:`BoundingBox`.
+
+ Returns ``None`` for degenerate / missing / zero-area inputs --
+ consumers attach ``None`` to ``ExtractedField.bbox`` in v1 instead
+ of carrying a synthetic ``empty`` placeholder.
+ """
if box is None:
- return BoundingBox.empty()
+ return None
if isinstance(box, dict):
try:
box = _RawBBox.model_validate(box)
except Exception: # noqa: BLE001
- return BoundingBox.empty()
+ return None
xmin = max(0.0, min(1.0, float(getattr(box, "xmin", 0.0))))
ymin = max(0.0, min(1.0, float(getattr(box, "ymin", 0.0))))
xmax = max(0.0, min(1.0, float(getattr(box, "xmax", 0.0))))
ymax = max(0.0, min(1.0, float(getattr(box, "ymax", 0.0))))
if xmin >= xmax or ymin >= ymax:
- return BoundingBox.empty()
+ return None
return BoundingBox(xmin=xmin, ymin=ymin, xmax=xmax, ymax=ymax)
diff --git a/src/flydocs/core/services/extractions/__init__.py b/src/flydocs/core/services/extractions/__init__.py
new file mode 100644
index 0000000..17f3061
--- /dev/null
+++ b/src/flydocs/core/services/extractions/__init__.py
@@ -0,0 +1,36 @@
+# Copyright 2026 Firefly Software Solutions Inc
+"""Async extraction CQRS handlers."""
+
+from flydocs.core.services.extractions.cancel_extraction_handler import (
+ CancelExtractionCommand,
+ CancelExtractionHandler,
+)
+from flydocs.core.services.extractions.get_extraction_handler import (
+ GetExtractionHandler,
+ GetExtractionQuery,
+)
+from flydocs.core.services.extractions.get_extraction_result_handler import (
+ GetExtractionResultHandler,
+ GetExtractionResultQuery,
+)
+from flydocs.core.services.extractions.list_extractions_handler import (
+ ListExtractionsHandler,
+ ListExtractionsQuery,
+)
+from flydocs.core.services.extractions.submit_extraction_handler import (
+ SubmitExtractionCommand,
+ SubmitExtractionHandler,
+)
+
+__all__ = [
+ "CancelExtractionCommand",
+ "CancelExtractionHandler",
+ "GetExtractionHandler",
+ "GetExtractionQuery",
+ "GetExtractionResultHandler",
+ "GetExtractionResultQuery",
+ "ListExtractionsHandler",
+ "ListExtractionsQuery",
+ "SubmitExtractionCommand",
+ "SubmitExtractionHandler",
+]
diff --git a/src/flydocs/core/services/extractions/_projector.py b/src/flydocs/core/services/extractions/_projector.py
new file mode 100644
index 0000000..7a64f47
--- /dev/null
+++ b/src/flydocs/core/services/extractions/_projector.py
@@ -0,0 +1,58 @@
+# Copyright 2026 Firefly Software Solutions Inc
+"""Shared helpers for projecting :class:`Extraction` rows onto the public DTO."""
+
+from __future__ import annotations
+
+from flydocs.interfaces.dtos.extraction import (
+ BboxRefinementInfo,
+ Extraction,
+ ExtractionError,
+ PostProcessing,
+)
+from flydocs.interfaces.enums.extraction_status import (
+ ExtractionStatus,
+ PostProcessingStatus,
+)
+from flydocs.models.entities.extraction import Extraction as ExtractionEntity
+
+
+def row_to_extraction(row: ExtractionEntity) -> Extraction:
+ """Project an :class:`ExtractionEntity` row onto an :class:`Extraction` DTO."""
+ error: ExtractionError | None = None
+ if row.error_code or row.error_message:
+ error = ExtractionError(
+ code=row.error_code or "unknown",
+ message=row.error_message or "",
+ )
+
+ post_processing: PostProcessing | None = None
+ if row.post_processing_bbox_status is not None:
+ bbox_error: ExtractionError | None = None
+ if row.post_processing_bbox_error_code or row.post_processing_bbox_error_message:
+ bbox_error = ExtractionError(
+ code=row.post_processing_bbox_error_code or "unknown",
+ message=row.post_processing_bbox_error_message or "",
+ )
+ post_processing = PostProcessing(
+ bbox_refinement=BboxRefinementInfo(
+ status=PostProcessingStatus(row.post_processing_bbox_status),
+ started_at=row.post_processing_bbox_started_at,
+ finished_at=row.post_processing_bbox_finished_at,
+ attempts=row.post_processing_bbox_attempts or 0,
+ error=bbox_error,
+ )
+ )
+
+ return Extraction(
+ id=row.id,
+ status=ExtractionStatus(row.status),
+ submitted_at=row.submitted_at,
+ started_at=row.started_at,
+ finished_at=row.finished_at,
+ attempts=row.attempts or 0,
+ error=error,
+ post_processing=post_processing,
+ )
+
+
+__all__ = ["row_to_extraction"]
diff --git a/src/flydocs/core/services/extractions/cancel_extraction_handler.py b/src/flydocs/core/services/extractions/cancel_extraction_handler.py
new file mode 100644
index 0000000..be6532c
--- /dev/null
+++ b/src/flydocs/core/services/extractions/cancel_extraction_handler.py
@@ -0,0 +1,63 @@
+# Copyright 2026 Firefly Software Solutions Inc
+"""``CancelExtractionHandler`` -- cancels an extraction while still QUEUED.
+
+The cancel is a single atomic ``UPDATE ... WHERE status='queued'``
+against Postgres. If the row is no longer QUEUED (worker just claimed
+it, the extraction is already terminal, or it never existed under that
+id), the UPDATE matches zero rows and we surface the appropriate error.
+
+This eliminates the previous TOCTOU window where a SELECT-then-UPDATE
+pair could clobber a worker that claimed the extraction in between.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from pyfly.container import service
+from pyfly.cqrs import Command, CommandHandler, command_handler
+
+from flydocs.core.services.extractions._projector import row_to_extraction
+from flydocs.interfaces.dtos.extraction import Extraction
+from flydocs.models.repositories import ExtractionRepository
+
+
+@dataclass(frozen=True)
+class CancelExtractionCommand(Command[Extraction | None]):
+ extraction_id: str
+
+
+class ExtractionNotCancellable(RuntimeError):
+ """Raised when the extraction is past the QUEUED state."""
+
+
+@command_handler
+@service
+class CancelExtractionHandler(CommandHandler[CancelExtractionCommand, Extraction | None]):
+ def __init__(self, repository: ExtractionRepository) -> None:
+ super().__init__()
+ self._repository = repository
+
+ async def do_handle(self, command: CancelExtractionCommand) -> Extraction | None:
+ # Atomic compare-and-swap: ``mark_cancelled`` only succeeds when
+ # the row's current status is QUEUED. A worker claiming the row
+ # in the same instant moves the status to RUNNING with a single
+ # UPDATE -- Postgres serialises the two UPDATEs by row-level lock
+ # and exactly one of them matches its precondition.
+ cancelled = await self._repository.mark_cancelled(command.extraction_id)
+ if cancelled is not None:
+ return row_to_extraction(cancelled)
+ # mark_cancelled returned None -- either the extraction doesn't
+ # exist or it's past QUEUED. Distinguish the two so the REST
+ # layer can emit 404 vs 409 correctly.
+ row = await self._repository.get(command.extraction_id)
+ if row is None:
+ return None
+ raise ExtractionNotCancellable(f"Extraction {row.id!r} cannot be cancelled in status {row.status}")
+
+
+__all__ = [
+ "CancelExtractionCommand",
+ "CancelExtractionHandler",
+ "ExtractionNotCancellable",
+]
diff --git a/src/flydocs/core/services/extractions/get_extraction_handler.py b/src/flydocs/core/services/extractions/get_extraction_handler.py
new file mode 100644
index 0000000..64d7730
--- /dev/null
+++ b/src/flydocs/core/services/extractions/get_extraction_handler.py
@@ -0,0 +1,35 @@
+# Copyright 2026 Firefly Software Solutions Inc
+"""``GetExtractionHandler`` -- status lookup for an async extraction."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from pyfly.container import service
+from pyfly.cqrs import Query, QueryHandler, query_handler
+
+from flydocs.core.services.extractions._projector import row_to_extraction
+from flydocs.interfaces.dtos.extraction import Extraction
+from flydocs.models.repositories import ExtractionRepository
+
+
+@dataclass(frozen=True)
+class GetExtractionQuery(Query[Extraction | None]):
+ extraction_id: str
+
+
+@query_handler
+@service
+class GetExtractionHandler(QueryHandler[GetExtractionQuery, Extraction | None]):
+ def __init__(self, repository: ExtractionRepository) -> None:
+ super().__init__()
+ self._repository = repository
+
+ async def do_handle(self, query: GetExtractionQuery) -> Extraction | None:
+ row = await self._repository.get(query.extraction_id)
+ if row is None:
+ return None
+ return row_to_extraction(row)
+
+
+__all__ = ["GetExtractionHandler", "GetExtractionQuery"]
diff --git a/src/flydocs/core/services/extractions/get_extraction_result_handler.py b/src/flydocs/core/services/extractions/get_extraction_result_handler.py
new file mode 100644
index 0000000..bf5fb97
--- /dev/null
+++ b/src/flydocs/core/services/extractions/get_extraction_result_handler.py
@@ -0,0 +1,129 @@
+# Copyright 2026 Firefly Software Solutions Inc
+"""``GetExtractionResultHandler`` -- result reader for extractions with a result available.
+
+Only :class:`ExtractionStatus.SUCCEEDED` carries a readable result.
+``QUEUED`` / ``RUNNING`` raises :class:`ExtractionNotReady` (mapped to
+RFC 7807 409). ``FAILED`` / ``CANCELLED`` likewise return 409 -- they
+will never produce a result no matter how long the caller waits.
+
+The optional ``wait_for_post_processing`` long-poll waits for the
+additive bbox-refinement leg to finish before returning the result so
+callers that need grounded coordinates can block instead of polling.
+The poll watches only the post-processing leg; when none is recorded it
+returns at once, even if the extraction is still ``QUEUED`` / ``RUNNING``.
+"""
+
+from __future__ import annotations
+
+import asyncio
+from dataclasses import dataclass
+
+from pyfly.container import service
+from pyfly.cqrs import Query, QueryHandler, query_handler
+
+from flydocs.interfaces.dtos.extract import ExtractionResult
+from flydocs.interfaces.dtos.extraction import ExtractionResultEnvelope
+from flydocs.interfaces.enums.extraction_status import (
+ ExtractionStatus,
+ PostProcessingStatus,
+)
+from flydocs.models.repositories import ExtractionRepository
+
+# Statuses we never block on -- they will never produce a result no
+# matter how long we wait.
+_TERMINAL_NO_RESULT = (ExtractionStatus.FAILED, ExtractionStatus.CANCELLED)
+
+
+@dataclass(frozen=True)
+class GetExtractionResultQuery(Query[ExtractionResultEnvelope | None]):
+ extraction_id: str
+ # Long-poll knobs. ``wait_for_post_processing`` blocks the request
+ # until the post-processing leg finishes or ``timeout_s`` elapses;
+ # at timeout the handler returns whatever's currently persisted.
+ wait_for_post_processing: bool = False
+ timeout_s: float = 60.0
+ poll_interval_s: float = 1.0
+
+
+class ExtractionNotReady(RuntimeError):
+ def __init__(self, extraction_id: str, status: ExtractionStatus) -> None:
+ super().__init__(f"Extraction {extraction_id!r} is in status {status.value}")
+ self.extraction_id = extraction_id
+ self.status = status
+
+
+@query_handler
+@service
+class GetExtractionResultHandler(QueryHandler[GetExtractionResultQuery, ExtractionResultEnvelope | None]):
+ def __init__(self, repository: ExtractionRepository) -> None:
+ super().__init__()
+ self._repository = repository
+
+ async def do_handle(self, query: GetExtractionResultQuery) -> ExtractionResultEnvelope | None:
+ row = await self._repository.get(query.extraction_id)
+ if row is None:
+ return None
+
+ # Optional long-poll: block until the additive post-processing
+ # leg settles or the timeout elapses. The main status has not
+ # been validated yet at this point; that check runs below on
+ # whichever row the poll hands back.
+ if query.wait_for_post_processing:
+ polled = await self._poll_for_terminal(query)
+ if polled is None:
+ # Row was deleted under us mid-poll; treat as not-found.
+ return None
+ row = polled
+
+ status = ExtractionStatus(row.status)
+ if not status.has_result:
+ if status in _TERMINAL_NO_RESULT:
+ raise ExtractionNotReady(row.id, status)
+ raise ExtractionNotReady(row.id, status)
+ if not row.result_json:
+ raise RuntimeError(f"Extraction {row.id} has status {status.value} but no result_json")
+ return ExtractionResultEnvelope(
+ id=row.id,
+ result=ExtractionResult.model_validate(row.result_json),
+ )
+
+ async def _poll_for_terminal(self, query: GetExtractionResultQuery):
+ """Block until the post-processing leg reaches a stable state.
+
+ Stable states (any one of these stops the loop):
+ * Main pipeline is in a non-success terminal state: ``FAILED`` /
+ ``CANCELLED`` -- there's nothing to wait for.
+ * Post-processing bbox status is ``succeeded`` or ``failed``.
+ * No post-processing leg exists (``post_processing_bbox_status``
+ is NULL) -- result is already final.
+
+ Returns the most recently read row, or ``None`` if the row
+ disappears mid-poll; a timeout returns that row, never raises.
+ """
+ deadline = asyncio.get_running_loop().time() + max(0.0, query.timeout_s)
+ interval = max(0.1, query.poll_interval_s)
+ last = await self._repository.get(query.extraction_id)
+ while last is not None:
+ status = ExtractionStatus(last.status)
+ bbox_status = last.post_processing_bbox_status
+ if status in _TERMINAL_NO_RESULT:
+ return last
+ if bbox_status is None:
+ return last
+ if bbox_status in (
+ PostProcessingStatus.SUCCEEDED.value,
+ PostProcessingStatus.FAILED.value,
+ ):
+ return last
+ if asyncio.get_running_loop().time() >= deadline:
+ return last
+ await asyncio.sleep(interval)
+ last = await self._repository.get(query.extraction_id)
+ return last # type: ignore[return-value]
+
+
+__all__ = [
+ "ExtractionNotReady",
+ "GetExtractionResultHandler",
+ "GetExtractionResultQuery",
+]
diff --git a/src/flydocs/core/services/extractions/list_extractions_handler.py b/src/flydocs/core/services/extractions/list_extractions_handler.py
new file mode 100644
index 0000000..290584d
--- /dev/null
+++ b/src/flydocs/core/services/extractions/list_extractions_handler.py
@@ -0,0 +1,62 @@
+# Copyright 2026 Firefly Software Solutions Inc
+"""``ListExtractionsHandler`` -- paginated, filterable listing of extractions.
+
+Exposed at ``GET /api/v1/extractions`` by :class:`ExtractionsController`.
+Filters are optional and combine with ``AND``; the response is paginated
+and the total reflects the FILTERED set (not the table size). Rows come
+back ordered ``submitted_at DESC`` so the most recent activity surfaces
+first in dashboards / operator tooling.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from datetime import datetime
+
+from pyfly.container import service
+from pyfly.cqrs import Query, QueryHandler, query_handler
+
+from flydocs.core.services.extractions._projector import row_to_extraction
+from flydocs.interfaces.dtos.extraction import ExtractionListResponse
+from flydocs.interfaces.enums.extraction_status import (
+ ExtractionStatus,
+ PostProcessingStatus,
+)
+from flydocs.models.repositories import ExtractionRepository
+
+
+@dataclass(frozen=True)
+class ListExtractionsQuery(Query[ExtractionListResponse]):
+ """Filters + pagination for ``GET /api/v1/extractions``."""
+
+ statuses: tuple[ExtractionStatus, ...] = ()
+ post_processing_statuses: tuple[PostProcessingStatus, ...] = ()
+ created_after: datetime | None = None
+ created_before: datetime | None = None
+ idempotency_key: str | None = None
+ limit: int = 50
+ offset: int = 0
+
+
+@query_handler
+@service
+class ListExtractionsHandler(QueryHandler[ListExtractionsQuery, ExtractionListResponse]):
+ def __init__(self, repository: ExtractionRepository) -> None:
+ super().__init__()
+ self._repository = repository
+
+ async def do_handle(self, query: ListExtractionsQuery) -> ExtractionListResponse:
+ rows, total = await self._repository.list_extractions(
+ statuses=[s.value for s in query.statuses] or None,
+ post_processing_bbox_statuses=[s.value for s in query.post_processing_statuses] or None,
+ created_after=query.created_after,
+ created_before=query.created_before,
+ idempotency_key=query.idempotency_key,
+ limit=query.limit,
+ offset=query.offset,
+ )
+ items = [row_to_extraction(r) for r in rows]
+ return ExtractionListResponse(items=items, total=total, limit=query.limit, offset=query.offset)
+
+
+__all__ = ["ListExtractionsHandler", "ListExtractionsQuery"]
diff --git a/src/flydocs/core/services/jobs/submit_job_handler.py b/src/flydocs/core/services/extractions/submit_extraction_handler.py
similarity index 73%
rename from src/flydocs/core/services/jobs/submit_job_handler.py
rename to src/flydocs/core/services/extractions/submit_extraction_handler.py
index f230c8e..a737771 100644
--- a/src/flydocs/core/services/jobs/submit_job_handler.py
+++ b/src/flydocs/core/services/extractions/submit_extraction_handler.py
@@ -1,12 +1,13 @@
# Copyright 2026 Firefly Software Solutions Inc
-"""``SubmitJobHandler`` -- persist the job + publish it on the EDA bus.
+"""``SubmitExtractionHandler`` -- persist the extraction + publish it on the EDA bus.
Before anything is written to Postgres or the EDA outbox, the handler
runs the same :class:`RequestValidator` the sync controller uses. A
-semantic mismatch (rule pointing at a non-existent docType, cycles in
-the rule DAG, duplicate rule ids, ...) raises :class:`InvalidRequestError`
-so the REST layer can return a ``422 invalid_request`` problem-detail
-with every issue surfaced -- without persisting an unrunnable job.
+semantic mismatch (rule pointing at a non-existent document type,
+cycles in the rule DAG, duplicate rule ids, ...) raises
+:class:`InvalidRequestError` so the REST layer can return a ``422
+validation_failed`` problem-detail with every issue surfaced -- without
+persisting an unrunnable extraction.
"""
from __future__ import annotations
@@ -23,13 +24,18 @@
from pyfly.observability.correlation import current_correlation_context
from flydocs.config import IDPSettings
+from flydocs.core.services.extractions._projector import row_to_extraction
from flydocs.core.services.validation import RequestValidator, ValidationReport
-from flydocs.interfaces.dtos.event import IDPJobSubmittedEvent, envelope_for_publish
+from flydocs.interfaces.dtos.event import (
+ EVENT_TYPE_EXTRACTION_SUBMITTED,
+ EventEnvelope,
+ envelope_for_publish,
+)
from flydocs.interfaces.dtos.extract import ExtractionRequest
-from flydocs.interfaces.dtos.job import SubmitJobRequest, SubmitJobResponse
-from flydocs.interfaces.enums.job_status import JobStatus
-from flydocs.models.entities.extraction_job import ExtractionJob
-from flydocs.models.repositories import ExtractionJobRepository
+from flydocs.interfaces.dtos.extraction import Extraction, SubmitExtractionRequest
+from flydocs.interfaces.enums.extraction_status import ExtractionStatus
+from flydocs.models.entities.extraction import Extraction as ExtractionEntity
+from flydocs.models.repositories import ExtractionRepository
logger = logging.getLogger(__name__)
@@ -47,17 +53,20 @@ def __init__(self, report: ValidationReport) -> None:
@dataclass(frozen=True)
-class SubmitJobCommand(Command[SubmitJobResponse]):
- request: SubmitJobRequest
+class SubmitExtractionCommand(Command[Extraction]):
+ request: SubmitExtractionRequest
idempotency_key: str | None = None
+_row_to_dto = row_to_extraction
+
+
@command_handler
@service
-class SubmitJobHandler(CommandHandler[SubmitJobCommand, SubmitJobResponse]):
+class SubmitExtractionHandler(CommandHandler[SubmitExtractionCommand, Extraction]):
def __init__(
self,
- repository: ExtractionJobRepository,
+ repository: ExtractionRepository,
event_publisher: EventPublisher,
validator: RequestValidator,
settings: IDPSettings,
@@ -68,15 +77,11 @@ def __init__(
self._validator = validator
self._settings = settings
- async def do_handle(self, command: SubmitJobCommand) -> SubmitJobResponse:
+ async def do_handle(self, command: SubmitExtractionCommand) -> Extraction:
if command.idempotency_key:
existing = await self._repository.get_by_idempotency_key(command.idempotency_key)
if existing is not None:
- return SubmitJobResponse(
- job_id=existing.id,
- status=JobStatus(existing.status),
- submitted_at=existing.created_at,
- )
+ return _row_to_dto(existing)
# NOTE: the SELECT-then-INSERT above has a TOCTOU window when
# two requests submit the same idempotency_key concurrently:
@@ -88,11 +93,11 @@ async def do_handle(self, command: SubmitJobCommand) -> SubmitJobResponse:
payload = command.request
# Reuse the sync semantic validator over an ExtractionRequest
# built from the submit payload -- same checks, same error shape.
- files = payload.documents
+ files = payload.files
as_extraction = ExtractionRequest(
intention=payload.intention,
- documents=files,
- docs=payload.docs,
+ files=files,
+ document_types=payload.document_types,
rules=payload.rules,
options=payload.options,
)
@@ -108,7 +113,7 @@ async def do_handle(self, command: SubmitJobCommand) -> SubmitJobResponse:
)
# The DB row carries a single ``filename`` / ``content_sha256``
- # pair; the per-file bytes live in ``schema_json.documents``. For
+ # pair; the per-file bytes live in ``schema_json.files``. For
# multi-file submits the primary filename summarises the bundle
# ("first (+N more)") and the content hash rolls every file's
# bytes so idempotency / dedupe checks still discriminate
@@ -127,14 +132,14 @@ async def do_handle(self, command: SubmitJobCommand) -> SubmitJobResponse:
content_sha256 = roll.hexdigest()
schema_json: dict[str, Any] = {
"intention": payload.intention,
- "docs": [d.model_dump(mode="json") for d in payload.docs],
+ "document_types": [d.model_dump(mode="json") for d in payload.document_types],
"rules": [r.model_dump(mode="json") for r in payload.rules],
- "documents": [
+ "files": [
{
"filename": f.filename,
"content_base64": f.content_base64,
"content_type": f.content_type,
- "document_type": f.document_type,
+ "expected_type": f.expected_type,
}
for f in files
],
@@ -149,9 +154,9 @@ async def do_handle(self, command: SubmitJobCommand) -> SubmitJobResponse:
if ctx:
metadata.setdefault("_correlation", ctx)
- job = ExtractionJob(
+ extraction = ExtractionEntity(
idempotency_key=command.idempotency_key,
- status=JobStatus.QUEUED.value,
+ status=ExtractionStatus.QUEUED.value,
filename=primary_filename,
content_sha256=content_sha256,
content_bytes=total_bytes,
@@ -161,7 +166,7 @@ async def do_handle(self, command: SubmitJobCommand) -> SubmitJobResponse:
metadata_json=metadata,
)
try:
- job = await self._repository.add(job)
+ extraction = await self._repository.add(extraction)
except self._repository.IntegrityError:
# Concurrent submit with the same idempotency_key collided
# on the partial unique index. Re-resolve the winning row
@@ -178,28 +183,24 @@ async def do_handle(self, command: SubmitJobCommand) -> SubmitJobResponse:
# violation was rolled back between INSERT and our
# follow-up SELECT. Re-raise so the caller retries.
raise
- return SubmitJobResponse(
- job_id=winner.id,
- status=JobStatus(winner.status),
- submitted_at=winner.created_at,
- )
- submitted_at = job.created_at or datetime.now(UTC)
- event = IDPJobSubmittedEvent(
- job_id=job.id,
- submitted_at=submitted_at,
- attempt=1,
+ return _row_to_dto(winner)
+ submitted_at = extraction.submitted_at or datetime.now(UTC)
+ extraction_dto = _row_to_dto(extraction)
+ envelope = EventEnvelope(
+ event_type=EVENT_TYPE_EXTRACTION_SUBMITTED,
+ occurred_at=submitted_at,
correlation_id=(ctx or {}).get("X-Correlation-Id"),
tenant_id=(ctx or {}).get("X-Tenant-Id"),
+ extraction=extraction_dto,
)
await self._publisher.publish(
destination=self._settings.jobs_topic,
- event_type=self._settings.jobs_event_type,
- payload=envelope_for_publish(event),
+ event_type=EVENT_TYPE_EXTRACTION_SUBMITTED,
+ payload=envelope_for_publish(envelope),
headers=ctx,
)
- return SubmitJobResponse(
- job_id=job.id,
- status=JobStatus(job.status),
- submitted_at=submitted_at,
- )
+ return extraction_dto
+
+
+__all__ = ["InvalidRequestError", "SubmitExtractionCommand", "SubmitExtractionHandler"]
diff --git a/src/flydocs/core/services/jobs/__init__.py b/src/flydocs/core/services/jobs/__init__.py
deleted file mode 100644
index 6fd84d3..0000000
--- a/src/flydocs/core/services/jobs/__init__.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright 2026 Firefly Software Solutions Inc
-"""Async job CQRS handlers."""
-
-from flydocs.core.services.jobs.cancel_job_handler import CancelJobCommand, CancelJobHandler
-from flydocs.core.services.jobs.get_job_handler import GetJobHandler, GetJobQuery
-from flydocs.core.services.jobs.get_job_result_handler import (
- GetJobResultHandler,
- GetJobResultQuery,
-)
-from flydocs.core.services.jobs.list_jobs_handler import ListJobsHandler, ListJobsQuery
-from flydocs.core.services.jobs.submit_job_handler import SubmitJobCommand, SubmitJobHandler
-
-__all__ = [
- "CancelJobCommand",
- "CancelJobHandler",
- "GetJobHandler",
- "GetJobQuery",
- "GetJobResultHandler",
- "GetJobResultQuery",
- "ListJobsHandler",
- "ListJobsQuery",
- "SubmitJobCommand",
- "SubmitJobHandler",
-]
diff --git a/src/flydocs/core/services/jobs/cancel_job_handler.py b/src/flydocs/core/services/jobs/cancel_job_handler.py
deleted file mode 100644
index 4917565..0000000
--- a/src/flydocs/core/services/jobs/cancel_job_handler.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# Copyright 2026 Firefly Software Solutions Inc
-"""``CancelJobHandler`` -- cancels a job while still QUEUED.
-
-The cancel is a single atomic ``UPDATE ... WHERE status='QUEUED'``
-against Postgres. If the row is no longer QUEUED (worker just claimed
-it, the job is already terminal, or it never existed under that id),
-the UPDATE matches zero rows and we surface the appropriate error.
-
-This eliminates the previous TOCTOU window where a SELECT-then-UPDATE
-pair could clobber a worker that claimed the job in between.
-"""
-
-from __future__ import annotations
-
-from dataclasses import dataclass
-
-from pyfly.container import service
-from pyfly.cqrs import Command, CommandHandler, command_handler
-
-from flydocs.interfaces.dtos.job import JobStatusResponse
-from flydocs.interfaces.enums.job_status import JobStatus
-from flydocs.models.repositories import ExtractionJobRepository
-
-
-@dataclass(frozen=True)
-class CancelJobCommand(Command[JobStatusResponse | None]):
- job_id: str
-
-
-class JobNotCancellable(RuntimeError):
- """Raised when the job is past the QUEUED state."""
-
-
-@command_handler
-@service
-class CancelJobHandler(CommandHandler[CancelJobCommand, JobStatusResponse | None]):
- def __init__(self, repository: ExtractionJobRepository) -> None:
- super().__init__()
- self._repository = repository
-
- async def do_handle(self, command: CancelJobCommand) -> JobStatusResponse | None:
- # Atomic compare-and-swap: ``mark_cancelled`` only succeeds when
- # the row's current status is QUEUED. A worker claiming the row
- # in the same instant moves the status to RUNNING with a single
- # UPDATE -- Postgres serialises the two UPDATEs by row-level lock
- # and exactly one of them matches its precondition.
- cancelled = await self._repository.mark_cancelled(command.job_id)
- if cancelled is not None:
- return JobStatusResponse(
- job_id=cancelled.id,
- status=JobStatus.CANCELLED,
- submitted_at=cancelled.created_at,
- started_at=cancelled.started_at,
- finished_at=cancelled.finished_at,
- attempts=cancelled.attempts,
- )
- # mark_cancelled returned None -- either the job doesn't exist
- # or it's past QUEUED. Distinguish the two so the REST layer
- # can emit 404 vs 409 correctly.
- job = await self._repository.get(command.job_id)
- if job is None:
- return None
- raise JobNotCancellable(f"Job {job.id!r} cannot be cancelled in status {job.status}")
diff --git a/src/flydocs/core/services/jobs/get_job_handler.py b/src/flydocs/core/services/jobs/get_job_handler.py
deleted file mode 100644
index f7bd199..0000000
--- a/src/flydocs/core/services/jobs/get_job_handler.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# Copyright 2026 Firefly Software Solutions Inc
-"""``GetJobHandler`` -- status lookup for an async job."""
-
-from __future__ import annotations
-
-from dataclasses import dataclass
-
-from pyfly.container import service
-from pyfly.cqrs import Query, QueryHandler, query_handler
-
-from flydocs.interfaces.dtos.job import JobStatusResponse
-from flydocs.interfaces.enums.job_status import JobStatus
-from flydocs.models.repositories import ExtractionJobRepository
-
-
-@dataclass(frozen=True)
-class GetJobQuery(Query[JobStatusResponse | None]):
- job_id: str
-
-
-@query_handler
-@service
-class GetJobHandler(QueryHandler[GetJobQuery, JobStatusResponse | None]):
- def __init__(self, repository: ExtractionJobRepository) -> None:
- super().__init__()
- self._repository = repository
-
- async def do_handle(self, query: GetJobQuery) -> JobStatusResponse | None:
- job = await self._repository.get(query.job_id)
- if job is None:
- return None
- return JobStatusResponse(
- job_id=job.id,
- status=JobStatus(job.status),
- submitted_at=job.created_at,
- started_at=job.started_at,
- finished_at=job.finished_at,
- attempts=job.attempts,
- error_code=job.error_code,
- error_message=job.error_message,
- bbox_refine_status=job.bbox_refine_status,
- bbox_refine_attempts=job.bbox_refine_attempts or 0,
- bbox_refine_started_at=job.bbox_refine_started_at,
- bbox_refine_finished_at=job.bbox_refine_finished_at,
- bbox_refine_error_code=job.bbox_refine_error_code,
- bbox_refine_error_message=job.bbox_refine_error_message,
- )
diff --git a/src/flydocs/core/services/jobs/get_job_result_handler.py b/src/flydocs/core/services/jobs/get_job_result_handler.py
deleted file mode 100644
index 1696f40..0000000
--- a/src/flydocs/core/services/jobs/get_job_result_handler.py
+++ /dev/null
@@ -1,124 +0,0 @@
-# Copyright 2026 Firefly Software Solutions Inc
-"""``GetJobResultHandler`` -- result reader for jobs with a result available.
-
-Supports both:
-
-* **Fully complete** -- ``status == SUCCEEDED``: returns the grounded
- result (or the LLM-bbox result if bbox_refine was disabled).
-* **Partial** -- ``status in {PARTIAL_SUCCEEDED, REFINING_BBOXES}``:
- returns the LLM-bbox result so callers don't have to wait for the
- out-of-band refiner before consuming field values. Callers that need
- the grounded version pass ``wait_for_bboxes=true`` so the handler
- polls until the refiner finishes (or a timeout fires).
-
-Anything earlier than ``PARTIAL_SUCCEEDED`` (``QUEUED`` / ``RUNNING``)
-or terminal-without-result (``FAILED`` / ``CANCELLED``) raises
-:class:`JobNotReady` so the REST controller can surface an RFC 7807 409.
-"""
-
-from __future__ import annotations
-
-import asyncio
-from dataclasses import dataclass
-
-from pyfly.container import service
-from pyfly.cqrs import Query, QueryHandler, query_handler
-
-from flydocs.interfaces.dtos.extract import ExtractionResult
-from flydocs.interfaces.dtos.job import JobResult
-from flydocs.interfaces.enums.job_status import JobStatus
-from flydocs.models.repositories import ExtractionJobRepository
-
-# Statuses we never block on -- they will never produce a result no
-# matter how long we wait.
-_TERMINAL_NO_RESULT = (JobStatus.FAILED, JobStatus.CANCELLED)
-
-
-@dataclass(frozen=True)
-class GetJobResultQuery(Query[JobResult | None]):
- job_id: str
- # Long-poll knobs. ``wait_for_bboxes`` blocks the request until the
- # refiner finishes (status -> SUCCEEDED) or ``timeout_s`` elapses;
- # at timeout the handler returns whatever's currently persisted.
- wait_for_bboxes: bool = False
- timeout_s: float = 60.0
- poll_interval_s: float = 1.0
-
-
-class JobNotReady(RuntimeError):
- def __init__(self, job_id: str, status: JobStatus) -> None:
- super().__init__(f"Job {job_id!r} is in status {status.value}")
- self.job_id = job_id
- self.status = status
-
-
-@query_handler
-@service
-class GetJobResultHandler(QueryHandler[GetJobResultQuery, JobResult | None]):
- def __init__(self, repository: ExtractionJobRepository) -> None:
- super().__init__()
- self._repository = repository
-
- async def do_handle(self, query: GetJobResultQuery) -> JobResult | None:
- job = await self._repository.get(query.job_id)
- if job is None:
- return None
-
- # Optional long-poll for callers that want grounded bboxes only.
- # We block while the refiner is in flight, returning whatever's
- # in the row at timeout (which is always the partial result --
- # never None, since PARTIAL_SUCCEEDED requires result_json).
- if query.wait_for_bboxes:
- polled = await self._poll_for_terminal(query)
- if polled is None:
- # Job was deleted under us mid-poll; treat as not-found.
- return None
- job = polled
-
- status = JobStatus(job.status)
- if not status.has_result:
- if status in _TERMINAL_NO_RESULT:
- raise JobNotReady(job.id, status)
- raise JobNotReady(job.id, status)
- if not job.result_json:
- raise RuntimeError(f"Job {job.id} has status {status.value} but no result_json")
- return JobResult(
- job_id=job.id,
- result=ExtractionResult.model_validate(job.result_json),
- )
-
- async def _poll_for_terminal(self, query: GetJobResultQuery):
- """Block until the job reaches a stable, no-more-progress state.
-
- Stable states (any one of these stops the loop):
- * Main pipeline is terminal: ``SUCCEEDED`` / ``FAILED`` /
- ``CANCELLED``.
- * Bbox-refine leg has finished one way or the other:
- ``bbox_refine_status in {'succeeded', 'failed'}``. Once the
- refiner has succeeded the job is also ``SUCCEEDED``; once
- it has permanently failed the job stays
- ``PARTIAL_SUCCEEDED`` -- in either case there is no further
- asynchronous progress to wait for, so callers that asked
- for ``wait_for_bboxes`` should be unblocked immediately
- instead of polling until ``timeout_s`` elapses.
-
- Returns whatever's currently persisted; never raises on
- timeout (the partial result is still a valid response shape).
- """
- deadline = asyncio.get_running_loop().time() + max(0.0, query.timeout_s)
- interval = max(0.1, query.poll_interval_s)
- last = await self._repository.get(query.job_id)
- while last is not None:
- status = JobStatus(last.status)
- bbox_status = getattr(last, "bbox_refine_status", None)
- if status.is_terminal or bbox_status in ("succeeded", "failed"):
- return last
- if asyncio.get_running_loop().time() >= deadline:
- return last
- await asyncio.sleep(interval)
- last = await self._repository.get(query.job_id)
- # Job was deleted under us; fall through to outer handler which
- # will raise JobNotReady on the stale status. ``last`` is None
- # here only when the job disappeared, which today never happens
- # but is correctly defensive.
- return last # type: ignore[return-value]
diff --git a/src/flydocs/core/services/jobs/list_jobs_handler.py b/src/flydocs/core/services/jobs/list_jobs_handler.py
deleted file mode 100644
index a0a2f7a..0000000
--- a/src/flydocs/core/services/jobs/list_jobs_handler.py
+++ /dev/null
@@ -1,76 +0,0 @@
-# Copyright 2026 Firefly Software Solutions Inc
-"""``ListJobsHandler`` -- paginated, filterable listing of extraction jobs.
-
-Exposed at ``GET /api/v1/jobs`` by :class:`JobsController`. Filters are
-optional and combine with ``AND``; the response is paginated and the
-total reflects the FILTERED set (not the table size). Rows come back
-ordered ``created_at DESC`` so the most recent activity surfaces first
-in dashboards / operator tooling.
-"""
-
-from __future__ import annotations
-
-from dataclasses import dataclass
-from datetime import datetime
-
-from pyfly.container import service
-from pyfly.cqrs import Query, QueryHandler, query_handler
-
-from flydocs.interfaces.dtos.job import JobListResponse, JobStatusResponse
-from flydocs.interfaces.enums.job_status import JobStatus
-from flydocs.models.repositories import ExtractionJobRepository
-
-
-@dataclass(frozen=True)
-class ListJobsQuery(Query[JobListResponse]):
- """Filters + pagination for ``GET /api/v1/jobs``."""
-
- statuses: tuple[JobStatus, ...] = ()
- bbox_refine_statuses: tuple[str, ...] = ()
- created_after: datetime | None = None
- created_before: datetime | None = None
- idempotency_key: str | None = None
- limit: int = 50
- offset: int = 0
-
-
-@query_handler
-@service
-class ListJobsHandler(QueryHandler[ListJobsQuery, JobListResponse]):
- def __init__(self, repository: ExtractionJobRepository) -> None:
- super().__init__()
- self._repository = repository
-
- async def do_handle(self, query: ListJobsQuery) -> JobListResponse:
- rows, total = await self._repository.list_jobs(
- statuses=[s.value for s in query.statuses] or None,
- bbox_refine_statuses=list(query.bbox_refine_statuses) or None,
- created_after=query.created_after,
- created_before=query.created_before,
- idempotency_key=query.idempotency_key,
- limit=query.limit,
- offset=query.offset,
- )
- items = [
- JobStatusResponse(
- job_id=r.id,
- status=JobStatus(r.status),
- submitted_at=r.created_at,
- started_at=r.started_at,
- finished_at=r.finished_at,
- attempts=r.attempts or 0,
- error_code=r.error_code,
- error_message=r.error_message,
- bbox_refine_status=r.bbox_refine_status,
- bbox_refine_attempts=r.bbox_refine_attempts or 0,
- bbox_refine_started_at=r.bbox_refine_started_at,
- bbox_refine_finished_at=r.bbox_refine_finished_at,
- bbox_refine_error_code=r.bbox_refine_error_code,
- bbox_refine_error_message=r.bbox_refine_error_message,
- )
- for r in rows
- ]
- return JobListResponse(items=items, total=total, limit=query.limit, offset=query.offset)
-
-
-__all__ = ["ListJobsQuery", "ListJobsHandler"]
diff --git a/src/flydocs/core/services/judge/judge.py b/src/flydocs/core/services/judge/judge.py
index ed04f25..a57f8b2 100644
--- a/src/flydocs/core/services/judge/judge.py
+++ b/src/flydocs/core/services/judge/judge.py
@@ -21,7 +21,7 @@
from pydantic import BaseModel, Field
from flydocs.core.observability import DEFAULT_MIDDLEWARE, timed_agent_run
-from flydocs.interfaces.dtos.doc import DocSpec
+from flydocs.interfaces.dtos.document_type import DocumentTypeSpec
from flydocs.interfaces.dtos.field import (
ExtractedField,
ExtractedFieldGroup,
@@ -33,7 +33,7 @@
class _RawJudgeField(BaseModel):
- fieldName: str
+ name: str
status: JudgeStatus = JudgeStatus.UNCERTAIN
confidence: float = Field(default=0.0, ge=0.0, le=1.0)
evidence: str = ""
@@ -43,8 +43,8 @@ class _RawJudgeField(BaseModel):
class _RawJudgeGroup(BaseModel):
- fieldGroupName: str
- fieldGroupFields: list[_RawJudgeField] = Field(default_factory=list)
+ name: str
+ fields: list[_RawJudgeField] = Field(default_factory=list)
class _JudgeOutput(BaseModel):
@@ -71,7 +71,7 @@ async def judge(
*,
document_bytes: bytes,
media_type: str,
- doc: DocSpec,
+ doc: DocumentTypeSpec,
extracted_groups: list[ExtractedFieldGroup],
intention: str,
model: str | None = None,
@@ -86,7 +86,7 @@ async def judge(
)
prompt = self._template.render(
intention=intention,
- documentType=doc.docType.documentType,
+ documentType=doc.id,
extracted_fields_json=extracted_fields_json,
)
agent: FireflyAgent[Any, _JudgeOutput] = FireflyAgent(
@@ -105,13 +105,13 @@ async def judge(
]
run_result = await timed_agent_run(agent, content, op="judge", model=model or self._model)
judge_by_group: dict[str, dict[str, _RawJudgeField]] = {
- g.fieldGroupName: {f.fieldName: f for f in g.fieldGroupFields} for g in run_result.output.fields
+ g.name: {f.name: f for f in g.fields} for g in run_result.output.fields
}
for group in extracted_groups:
- field_map = judge_by_group.get(group.fieldGroupName, {})
- for field in group.fieldGroupFields:
- self._apply(field, field_map.get(field.fieldName))
+ field_map = judge_by_group.get(group.name, {})
+ for field in group.fields:
+ self._apply(field, field_map.get(field.name))
return extracted_groups
def _apply(self, field: ExtractedField, raw: _RawJudgeField | None) -> None:
@@ -124,11 +124,11 @@ def _apply(self, field: ExtractedField, raw: _RawJudgeField | None) -> None:
notes=raw.notes,
flag_for_review=raw.flag_for_review,
)
- if raw.items and isinstance(field.fieldValueFound, list):
- for row in field.fieldValueFound:
- if not isinstance(row, ExtractedField) or not isinstance(row.fieldValueFound, list):
+ if raw.items and isinstance(field.value, list):
+ for row in field.value:
+ if not isinstance(row, ExtractedField) or not isinstance(row.value, list):
continue
- raw_items = {r.fieldName: r for r in raw.items}
- for sub_field in row.fieldValueFound:
+ raw_items = {r.name: r for r in raw.items}
+ for sub_field in row.value:
if isinstance(sub_field, ExtractedField):
- self._apply(sub_field, raw_items.get(sub_field.fieldName))
+ self._apply(sub_field, raw_items.get(sub_field.name))
diff --git a/src/flydocs/core/services/pipeline/orchestrator.py b/src/flydocs/core/services/pipeline/orchestrator.py
index bd9e2ca..40bc7b3 100644
--- a/src/flydocs/core/services/pipeline/orchestrator.py
+++ b/src/flydocs/core/services/pipeline/orchestrator.py
@@ -2,8 +2,8 @@
"""``PipelineOrchestrator`` -- runs the IDP pipeline as a
:class:`fireflyframework_agentic.pipeline.PipelineEngine` DAG.
-Every input file (whether the caller submitted ``document`` or
-``documents``) flows through the same stages:
+Every input file (whether the caller submitted ``files=[…]`` with one
+or many entries) flows through the same stages:
load -> discover? -> classify? -> plan_tasks -> extract ->
bbox_validation -> field_validation? -> visual? -> content? ->
@@ -13,16 +13,17 @@
sub-document inside a file, so a single uploaded PDF that happens to
contain a deed + a DNI + a utility bill comes out as three segments
rather than one. The classifier then runs **per segment** and assigns
-each one to a declared ``DocSpec`` (or ``unmatched``). One extraction
-task is produced per matched (segment, DocSpec) pair.
+each one to a declared :class:`DocumentTypeSpec` (or ``unmatched``).
+One extraction task is produced per matched (segment, DocumentTypeSpec)
+pair.
Skip rules:
-* Files pinned with ``document_type`` skip the splitter and the
+* Files pinned with ``expected_type`` skip the splitter and the
classifier -- the caller already told us what that file is.
* Single-page files skip the splitter (one segment is enough).
-* Segments that already have a resolved doctype (pinned, or only one
- declared DocSpec is on offer) skip the classifier.
+* Segments that already have a resolved type (pinned, or only one
+ declared :class:`DocumentTypeSpec` is on offer) skip the classifier.
The method is called ``execute`` rather than ``run`` so it does **not**
accidentally satisfy pyfly's ``CommandLineRunner`` structural protocol
@@ -34,6 +35,7 @@
import asyncio
import logging
import time
+import uuid
from dataclasses import dataclass, field
from typing import Any
@@ -70,16 +72,18 @@
from flydocs.interfaces.dtos.authenticity import (
ContentAuthenticity,
DocumentAuthenticity,
- VisualValidationOutcome,
+ VisualCheckResult,
)
-from flydocs.interfaces.dtos.doc import DocSpec
+from flydocs.interfaces.dtos.document_type import DocumentTypeSpec
from flydocs.interfaces.dtos.extract import (
ClassificationInfo,
- DocumentInfo,
+ Document,
EscalationInfo,
- ExtractedDocument,
ExtractionRequest,
ExtractionResult,
+ FileSummary,
+ PipelineError,
+ PipelineMeta,
TraceEntry,
UsageBreakdown,
)
@@ -127,17 +131,17 @@ class _Segment:
@dataclass(slots=True)
class _ExtractionTask:
- """One (segment, DocSpec) pair the downstream stages iterate over."""
+ """One (segment, DocumentTypeSpec) pair the downstream stages iterate over."""
task_id: str # unique: ``f"file{i}/seg{j}/{doc_type}"``
segment: _Segment
- doc_spec: DocSpec
+ doc_spec: DocumentTypeSpec
slice_bytes: bytes
slice_pages: int
extracted_groups: list[ExtractedFieldGroup] = field(default_factory=list)
model_used: str | None = None
- visual: list[VisualValidationOutcome] = field(default_factory=list)
- content: ContentAuthenticity = field(default_factory=ContentAuthenticity)
+ visual: list[VisualCheckResult] = field(default_factory=list)
+ content: ContentAuthenticity | None = None
# ===========================================================================
@@ -233,22 +237,37 @@ def __init__(
self._settings = settings
self._default_model = default_model
- async def execute(self, request: ExtractionRequest) -> ExtractionResult:
+ async def execute(
+ self,
+ request: ExtractionRequest,
+ *,
+ extraction_id: str | None = None,
+ ) -> ExtractionResult:
+ """Run the pipeline for ``request``.
+
+ ``extraction_id`` is the public id (``ext_…``) the result will
+ carry. The sync ``ExtractHandler`` mints a fresh one; the async
+ worker passes the persisted extraction's id so the
+ ``ExtractionResult.id`` matches the row id.
+ """
started = time.monotonic()
- # Bind the request id to the active asyncio context so every
+ result_id = extraction_id or f"ext_{uuid.uuid4().hex[:26].upper()}"
+ # Bind the result id to the active asyncio context so every
# downstream ``timed_agent_run`` tags its UsageRecord with this
# correlation id. The reset happens in ``finally`` further down
# so the var is always cleared even when the pipeline raises.
- correlation_token = set_correlation_id(str(request.request_id))
+ correlation_token = set_correlation_id(result_id)
try:
- return await self._execute_inner(request, started)
+ return await self._execute_inner(request, started, result_id=result_id)
finally:
reset_correlation_id(correlation_token)
- async def _execute_inner(self, request: ExtractionRequest, started: float) -> ExtractionResult:
+ async def _execute_inner(
+ self, request: ExtractionRequest, started: float, *, result_id: str
+ ) -> ExtractionResult:
stages = request.options.stages
model_id = request.options.model or self._default_model
- files = request.documents
+ files = request.files
builder = PipelineBuilder(self.PIPELINE_NAME)
chain: list[str] = []
@@ -259,7 +278,7 @@ async def _execute_inner(self, request: ExtractionRequest, started: float) -> Ex
# Discover runs when the splitter is on AND at least one file is
# unpinned (pinned files skip discovery — the caller already said
# what they are) AND that file has more than one page.
- needs_discover = stages.splitter and any((not f.document_type) for f in files)
+ needs_discover = stages.splitter and any((not f.expected_type) for f in files)
if needs_discover:
builder.add_node(
"discover",
@@ -360,23 +379,31 @@ async def _execute_inner(self, request: ExtractionRequest, started: float) -> Ex
builder.chain(*chain)
engine = builder.build()
- engine._event_handler = _LoggingEventHandler(str(request.request_id)) # noqa: SLF001
+ engine._event_handler = _LoggingEventHandler(result_id) # noqa: SLF001
ctx = PipelineContext(
inputs=request,
metadata={
"request": request,
+ "result_id": result_id,
"model_id": model_id,
"pipeline_errors": [],
"unmatched_segments": [], # segments the classifier left without a docType
},
- correlation_id=str(request.request_id),
+ correlation_id=result_id,
)
pipeline_result = await engine.run(context=ctx)
latency_ms = int((time.monotonic() - started) * 1000)
- return self._build_result(request, ctx, model_id, latency_ms, pipeline_result=pipeline_result)
+ return self._build_result(
+ request,
+ ctx,
+ model_id,
+ latency_ms,
+ pipeline_result=pipeline_result,
+ result_id=result_id,
+ )
# ------------------------------------------------------------------
# Pipeline steps
@@ -385,7 +412,7 @@ async def _execute_inner(self, request: ExtractionRequest, started: float) -> Ex
async def _step_load(self, ctx: PipelineContext, _inputs: dict[str, Any]) -> Any:
"""Normalise every input binary and emit one ``_FileSlot`` per row.
- Each inbound :class:`DocumentInput` flows through
+ Each inbound :class:`FileInput` flows through
:class:`BinaryNormalizer`. A born-digital PDF or a clean PNG
passes through 1:1; a DOCX is converted to PDF; a HEIC photo
becomes PNG; a multi-frame TIFF becomes a multi-page PDF; a ZIP
@@ -393,16 +420,16 @@ async def _step_load(self, ctx: PipelineContext, _inputs: dict[str, Any]) -> Any
chains let the response trace each output back to the original
upload.
- The doctype pin survives only when normalisation produced
- exactly one row -- a multi-row expansion (ZIP, email) makes the
- original pin ambiguous, so the classifier is asked to decide
- per-row.
+ The ``expected_type`` pin survives only when normalisation
+ produced exactly one row -- a multi-row expansion (ZIP, email)
+ makes the original pin ambiguous, so the classifier is asked to
+ decide per-row.
"""
request: ExtractionRequest = ctx.metadata["request"]
files: list[_FileSlot] = []
# Slot index is monotonic across the expansion of all inputs.
slot_index = 0
- for file in request.documents:
+ for file in request.files:
document_bytes = file.decoded_bytes()
normalised = await self._binary_normalizer.normalise(
document_bytes,
@@ -411,7 +438,7 @@ async def _step_load(self, ctx: PipelineContext, _inputs: dict[str, Any]) -> Any
)
multi_row = len(normalised) > 1
for row in normalised:
- effective_doctype = file.document_type if not multi_row else None
+ effective_doctype = file.expected_type if not multi_row else None
slot_filename = (
"/".join((*row.derived_from, row.filename)) if row.derived_from else row.filename
)
@@ -458,7 +485,7 @@ async def _discover_one(slot: _FileSlot) -> None:
document_bytes=slot.document_bytes,
media_type=slot.media_type,
page_count=slot.page_count,
- targets=request.docs,
+ targets=request.document_types,
intention=request.intention,
model=ctx.metadata["model_id"],
)
@@ -485,15 +512,16 @@ async def _discover_one(slot: _FileSlot) -> None:
return {"segments": total}
async def _step_classifier(self, ctx: PipelineContext, _inputs: dict[str, Any]) -> Any:
- """Per-segment classifier: pick a declared DocSpec for each segment."""
+ """Per-segment classifier: pick a declared DocumentTypeSpec for each segment."""
request: ExtractionRequest = ctx.metadata["request"]
files: list[_FileSlot] = ctx.metadata["files_data"]
- docs_by_type: dict[str, DocSpec] = {d.docType.documentType: d for d in request.docs}
+ docs_by_type: dict[str, DocumentTypeSpec] = {d.id: d for d in request.document_types}
- # If there's exactly one declared DocSpec, every unpinned, unmatched
- # segment is implicitly that one -- no LLM call needed.
- if len(request.docs) == 1:
- only = request.docs[0].docType.documentType
+ # If there's exactly one declared DocumentTypeSpec, every
+ # unpinned, unmatched segment is implicitly that one -- no LLM
+ # call needed.
+ if len(request.document_types) == 1:
+ only = request.document_types[0].id
for slot in files:
for seg in slot.segments:
if seg.resolved_doctype is None:
@@ -519,7 +547,7 @@ async def _classify_one(slot: _FileSlot, seg: _Segment) -> None:
document_bytes=bytes_for_seg,
media_type=media_type,
filename=slot.filename,
- candidates=request.docs,
+ candidates=request.document_types,
intention=request.intention,
model=ctx.metadata["model_id"],
)
@@ -546,24 +574,25 @@ async def _classify_one(slot: _FileSlot, seg: _Segment) -> None:
return {"classified": matched, "unmatched": len(targets) - matched}
async def _step_plan_tasks(self, ctx: PipelineContext, _inputs: dict[str, Any]) -> Any:
- """Build the flat list of (segment, DocSpec) extraction tasks."""
+ """Build the flat list of (segment, DocumentTypeSpec) extraction tasks."""
request: ExtractionRequest = ctx.metadata["request"]
files: list[_FileSlot] = ctx.metadata["files_data"]
- docs_by_type: dict[str, DocSpec] = {d.docType.documentType: d for d in request.docs}
+ docs_by_type: dict[str, DocumentTypeSpec] = {d.id: d for d in request.document_types}
tasks: list[_ExtractionTask] = []
unmatched_segments: list[_Segment] = list(ctx.metadata.get("unmatched_segments", []))
for slot in files:
for seg_index, seg in enumerate(slot.segments):
- # Unresolved segment -> nothing to extract; route to additional.
+ # Unresolved segment -> nothing to extract; route to discovered.
if seg.resolved_doctype is None:
unmatched_segments.append(seg)
continue
doc_spec = docs_by_type.get(seg.resolved_doctype)
if doc_spec is None:
# Caller pinned (or classifier returned) a doctype that
- # is not declared in ``docs[]``. The request validator
- # rejects unknown pins up-front; this is a safety net.
+ # is not declared in ``document_types[]``. The request
+ # validator rejects unknown pins up-front; this is a
+ # safety net.
unmatched_segments.append(seg)
continue
slice_bytes, slice_pages = self._slice_segment_bytes(slot, seg, ctx)
@@ -616,11 +645,11 @@ async def _step_bbox_validation(self, ctx: PipelineContext, _inputs: dict[str, A
async def _step_bbox_refine(self, ctx: PipelineContext, _inputs: dict[str, Any]) -> Any:
"""Replace LLM-estimated bboxes with grounded ones (PDF text / OCR).
- Runs per task (one per resolved (segment, DocSpec) pair) so a
- single request that fans out across multiple files / DocSpecs
- gets refined per source. Tasks with no extracted groups or no
- bytes are skipped. Failures degrade gracefully -- the LLM bbox
- stays in place and the pipeline continues.
+ Runs per task (one per resolved (segment, DocumentTypeSpec)
+ pair) so a single request that fans out across multiple files /
+ types gets refined per source. Tasks with no extracted groups or
+ no bytes are skipped. Failures degrade gracefully -- the LLM
+ bbox stays in place and the pipeline continues.
"""
request: ExtractionRequest = ctx.metadata["request"]
tasks: list[_ExtractionTask] = ctx.metadata["tasks"]
@@ -650,7 +679,7 @@ async def _refine_one(task: _ExtractionTask) -> None:
async def _step_field_validation(self, ctx: PipelineContext, _inputs: dict[str, Any]) -> Any:
tasks: list[_ExtractionTask] = ctx.metadata["tasks"]
for task in tasks:
- self._field_validator.validate(task.doc_spec.fieldGroups, task.extracted_groups)
+ self._field_validator.validate(task.doc_spec.field_groups, task.extracted_groups)
return {"validated": True}
async def _step_visual_authenticity(self, ctx: PipelineContext, _inputs: dict[str, Any]) -> Any:
@@ -658,7 +687,7 @@ async def _step_visual_authenticity(self, ctx: PipelineContext, _inputs: dict[st
tasks: list[_ExtractionTask] = ctx.metadata["tasks"]
async def _check_one(task: _ExtractionTask) -> None:
- if not task.doc_spec.validators.visual or not task.slice_bytes:
+ if not task.doc_spec.visual_checks or not task.slice_bytes:
return
try:
outcomes = await self._visual_checker.check(
@@ -726,7 +755,8 @@ async def _step_judge_escalation(self, ctx: PipelineContext, _inputs: dict[str,
request: ExtractionRequest = ctx.metadata["request"]
tasks: list[_ExtractionTask] = ctx.metadata["tasks"]
# The escalator was built around per-doc maps keyed by a stable id.
- # Use ``task_id`` so each (segment, DocSpec) re-run is independent.
+ # Use ``task_id`` so each (segment, DocumentTypeSpec) re-run is
+ # independent.
per_doc_extracted = {t.task_id: t.extracted_groups for t in tasks}
per_doc_inputs = {
t.task_id: (
@@ -763,19 +793,19 @@ async def _step_rules(self, ctx: PipelineContext, _inputs: dict[str, Any]) -> An
request: ExtractionRequest = ctx.metadata["request"]
tasks: list[_ExtractionTask] = ctx.metadata["tasks"]
- # The rule engine takes per-doctype maps. Group by doctype across
+ # The rule engine takes per-document-type maps. Group across
# files and segments -- multiple segments of the same doctype
# contribute their field groups to the same bucket.
extracted_by_doc: dict[str, list[ExtractedFieldGroup]] = {}
- visual_by_doc: dict[str, list[VisualValidationOutcome]] = {}
+ visual_by_doc: dict[str, list[VisualCheckResult]] = {}
for t in tasks:
- doc_type = t.doc_spec.docType.documentType
+ doc_type = t.doc_spec.id
extracted_by_doc.setdefault(doc_type, []).extend(t.extracted_groups)
visual_by_doc.setdefault(doc_type, []).extend(t.visual)
try:
rule_results = await self._rule_engine.evaluate(
request.rules,
- docs=request.docs,
+ docs=request.document_types,
extracted_by_doc=extracted_by_doc,
visual_by_doc=visual_by_doc,
intention=request.intention,
@@ -891,6 +921,7 @@ def _build_result(
latency_ms: int,
*,
pipeline_result: Any = None,
+ result_id: str,
) -> ExtractionResult:
files_data: list[_FileSlot] = ctx.metadata.get("files_data", [])
tasks: list[_ExtractionTask] = ctx.metadata.get("tasks", [])
@@ -905,24 +936,27 @@ def _build_result(
if used_models:
model_id = ",".join(sorted(used_models)) if len(used_models) > 1 else next(iter(used_models))
- documents: list[ExtractedDocument] = [
- ExtractedDocument(
- document_type=task.doc_spec.docType.documentType,
+ documents: list[Document] = [
+ Document(
+ type=task.doc_spec.id,
missing=False,
pages=_pages_range(task.segment.page_start, task.segment.page_end),
description=_segment_description(task.segment, task.doc_spec),
confidence=_segment_confidence(task.segment),
- fields=task.extracted_groups,
- authenticity=DocumentAuthenticity(visual=task.visual, content=task.content),
+ field_groups=task.extracted_groups,
+ authenticity=DocumentAuthenticity(
+ visual=task.visual,
+ content=task.content,
+ ),
source_file=task.segment.filename,
)
for task in tasks
]
# Unmatched / unroutable segments
- additional_documents = [
- ExtractedDocument(
- document_type=UNMATCHED,
+ discovered_documents = [
+ Document(
+ type=UNMATCHED,
missing=False,
pages=_pages_range(seg.page_start, seg.page_end),
description=(seg.classification.description if seg.classification else seg.description),
@@ -937,23 +971,42 @@ def _build_result(
escalation: EscalationInfo | None = ctx.metadata.get("escalation")
usage_breakdown = _usage_breakdown(
- request_id=str(request.request_id),
+ request_id=result_id,
pipeline_result=pipeline_result,
)
trace = _trace_entries(pipeline_result)
+ pipeline_errors = [
+ PipelineError(node=e["node"], code=e["code"], message=e["message"])
+ for e in ctx.metadata.get("pipeline_errors", [])
+ ]
+ pipeline_meta = PipelineMeta(
+ model=model_id,
+ latency_ms=latency_ms,
+ trace=trace,
+ errors=pipeline_errors,
+ escalation=escalation,
+ usage=usage_breakdown,
+ )
+ # Determine overall status: ``partial`` when at least one task
+ # produced no extracted groups (typically because the extractor
+ # node failed for that task) or any pipeline error was recorded;
+ # ``success`` otherwise. The async lifecycle is orthogonal to
+ # this -- the ``ExtractionStatus`` enum already collapses both
+ # into the main ``succeeded`` state on the row -- but the result
+ # envelope reflects which sub-task outcome the caller is consuming.
+ any_failed_task = any(not t.extracted_groups for t in tasks)
+ any_pipeline_error = bool(pipeline_errors)
+ status_literal = "partial" if (any_failed_task or any_pipeline_error) else "success"
+
return ExtractionResult(
- request_id=request.request_id,
+ id=result_id,
+ status=status_literal,
files=files_info,
documents=documents,
- additional_documents=additional_documents,
+ discovered_documents=discovered_documents,
rule_results=rule_results,
request_transformations=ctx.metadata.get("request_transformations", []),
- model=model_id,
- latency_ms=latency_ms,
- pipeline_errors=ctx.metadata.get("pipeline_errors", []),
- escalation=escalation,
- usage=usage_breakdown,
- trace=trace,
+ pipeline=pipeline_meta,
)
@@ -968,12 +1021,12 @@ def _pages_range(start: int | None, end: int | None) -> list[int]:
return list(range(start, end + 1))
-def _segment_description(seg: _Segment, doc_spec: DocSpec) -> str:
+def _segment_description(seg: _Segment, doc_spec: DocumentTypeSpec) -> str:
if seg.classification and seg.classification.description:
return seg.classification.description
if seg.description:
return seg.description
- return doc_spec.docType.description or ""
+ return doc_spec.description or ""
def _segment_confidence(seg: _Segment) -> float:
@@ -1069,24 +1122,24 @@ def _classification_info(result: ClassificationResult | None) -> ClassificationI
)
-def _file_info(slot: _FileSlot) -> DocumentInfo:
- """Top-level ``DocumentInfo`` summary for the per-file response field.
+def _file_info(slot: _FileSlot) -> FileSummary:
+ """Top-level :class:`FileSummary` for the per-file response field.
For files with a single segment we surface the segment's classifier
verdict directly. For files split into multiple segments,
- ``document_type`` is left ``null`` and ``classification`` is null --
+ ``matched_type`` is left ``null`` and ``classification`` is null --
the per-segment outcomes live on ``documents[]`` and
- ``additional_documents[]``.
+ ``discovered_documents[]``.
"""
one_segment = len(slot.segments) == 1
- doc_type = slot.segments[0].resolved_doctype if one_segment else None
+ matched_type = slot.segments[0].resolved_doctype if one_segment else None
classification = _classification_info(slot.segments[0].classification) if one_segment else None
- return DocumentInfo(
+ return FileSummary(
filename=slot.filename,
media_type=slot.media_type,
page_count=slot.page_count,
bytes=len(slot.document_bytes),
- document_type=doc_type,
+ matched_type=matched_type,
classification=classification,
)
diff --git a/src/flydocs/core/services/rules/rule_engine.py b/src/flydocs/core/services/rules/rule_engine.py
index 377e159..0a0b177 100644
--- a/src/flydocs/core/services/rules/rule_engine.py
+++ b/src/flydocs/core/services/rules/rule_engine.py
@@ -22,8 +22,8 @@
from pydantic import BaseModel, Field
from flydocs.core.observability import DEFAULT_MIDDLEWARE, timed_agent_run
-from flydocs.interfaces.dtos.authenticity import VisualValidationOutcome
-from flydocs.interfaces.dtos.doc import DocSpec
+from flydocs.interfaces.dtos.authenticity import VisualCheckResult
+from flydocs.interfaces.dtos.document_type import DocumentTypeSpec
from flydocs.interfaces.dtos.field import ExtractedField, ExtractedFieldGroup
from flydocs.interfaces.dtos.rule import (
RuleFieldParent,
@@ -65,9 +65,9 @@ async def evaluate(
self,
rules: list[RuleSpec],
*,
- docs: list[DocSpec],
+ docs: list[DocumentTypeSpec],
extracted_by_doc: dict[str, list[ExtractedFieldGroup]],
- visual_by_doc: dict[str, list[VisualValidationOutcome]],
+ visual_by_doc: dict[str, list[VisualCheckResult]],
intention: str,
model: str | None = None,
) -> list[RuleResult]:
@@ -148,9 +148,9 @@ async def evaluate(
predicate=raw.predicate
or (rule_by_id[raw.rule_id].predicate if raw.rule_id in rule_by_id else ""),
output=raw.output,
- summary=raw.summary,
+ summary=raw.summary or None,
notes=list(raw.notes),
- human_revision=raw.human_revision,
+ human_revision=raw.human_revision or None,
)
)
sorter.done(*ready_ids)
@@ -164,10 +164,10 @@ def _build_dag(self, rules: list[RuleSpec], rule_by_id: dict[str, RuleSpec]) ->
parents: list[str] = []
for parent in rule.parents:
if isinstance(parent, RuleRuleParent):
- if parent.ruleId in rule_by_id:
- parents.append(parent.ruleId)
+ if parent.rule in rule_by_id:
+ parents.append(parent.rule)
else:
- logger.warning("Rule %r references unknown parent rule %r", rule.id, parent.ruleId)
+ logger.warning("Rule %r references unknown parent rule %r", rule.id, parent.rule)
sorter.add(rule.id, *parents)
return sorter
@@ -177,40 +177,40 @@ def _build_rules_context(self, rules: Iterable[RuleSpec]) -> list[dict[str, Any]
def _build_documents_context(
self,
active_rules: list[RuleSpec],
- docs: list[DocSpec],
+ docs: list[DocumentTypeSpec],
extracted_by_doc: dict[str, list[ExtractedFieldGroup]],
- visual_by_doc: dict[str, list[VisualValidationOutcome]],
+ visual_by_doc: dict[str, list[VisualCheckResult]],
) -> list[dict[str, Any]]:
- # Collect which fields/validators each rule touches per doc type.
+ # Collect which fields/validators each rule touches per document type.
deps: dict[str, dict[str, set[str]]] = {}
for rule in active_rules:
for parent in rule.parents:
if isinstance(parent, RuleFieldParent):
- deps.setdefault(parent.documentType, {"fields": set(), "validators": set()})[
+ deps.setdefault(parent.document_type, {"fields": set(), "validators": set()})[
"fields"
- ].update(parent.fieldNames)
+ ].update(parent.fields)
elif isinstance(parent, RuleValidatorParent):
- deps.setdefault(parent.documentType, {"fields": set(), "validators": set()})[
+ deps.setdefault(parent.document_type, {"fields": set(), "validators": set()})[
"validators"
- ].add(parent.validatorName)
+ ].add(parent.validator)
# Walk every required dep and emit a row per match.
rows: list[dict[str, Any]] = []
- spec_by_type = {d.docType.documentType: d for d in docs}
+ spec_by_type = {d.id: d for d in docs}
for doc_type, want in deps.items():
spec = spec_by_type.get(doc_type)
if spec is None:
logger.warning("Rule references unknown documentType %r", doc_type)
continue
for group in extracted_by_doc.get(doc_type, []):
- for field in group.fieldGroupFields:
- if field.fieldName in want["fields"]:
+ for field in group.fields:
+ if field.name in want["fields"]:
rows.append(
{
- "documentType": doc_type,
- "fieldGroupName": group.fieldGroupName,
- "fieldName": field.fieldName,
- "fieldValueFound": _serialise_field_value(field),
- "field_validation": field.field_validation.model_dump(mode="json"),
+ "document_type": doc_type,
+ "field_group": group.name,
+ "field_name": field.name,
+ "value": _serialise_field_value(field),
+ "validation": field.validation.model_dump(mode="json"),
"judge": field.judge.model_dump(mode="json"),
}
)
@@ -218,8 +218,8 @@ def _build_documents_context(
if validator.name in want["validators"]:
rows.append(
{
- "documentType": doc_type,
- "validatorName": validator.name,
+ "document_type": doc_type,
+ "validator_name": validator.name,
"validator_passed": validator.passed,
"validator_confidence": validator.confidence,
"validator_notes": validator.notes,
@@ -230,28 +230,28 @@ def _build_documents_context(
def _used_by_any(self, parent_rule_id: str, active_rules: Iterable[RuleSpec]) -> bool:
for rule in active_rules:
for parent in rule.parents:
- if isinstance(parent, RuleRuleParent) and parent.ruleId == parent_rule_id:
+ if isinstance(parent, RuleRuleParent) and parent.rule == parent_rule_id:
return True
return False
def _serialise_field_value(field: ExtractedField) -> Any:
- if isinstance(field.fieldValueFound, list):
+ if isinstance(field.value, list):
return [
{
- "rowName": row.fieldName,
- # Nested ``fieldValueFound`` only carries a row of
- # sub-fields for array-typed parents; for scalar leaves
- # it is the value itself, which we skip here. The
- # isinstance guard also satisfies pyright -- without it
- # the iterator type is the full union (str / int / …).
+ "rowName": row.name,
+ # Nested ``value`` only carries a row of sub-fields for
+ # array-typed parents; for scalar leaves it is the value
+ # itself, which we skip here. The isinstance guard also
+ # satisfies pyright -- without it the iterator type is
+ # the full union (str / int / …).
"row": [
- {"fieldName": sub.fieldName, "value": sub.fieldValueFound}
- for sub in (row.fieldValueFound if isinstance(row.fieldValueFound, list) else [])
+ {"fieldName": sub.name, "value": sub.value}
+ for sub in (row.value if isinstance(row.value, list) else [])
if isinstance(sub, ExtractedField)
],
}
- for row in field.fieldValueFound
+ for row in field.value
if isinstance(row, ExtractedField)
]
- return field.fieldValueFound
+ return field.value
diff --git a/src/flydocs/core/services/splitting/splitter.py b/src/flydocs/core/services/splitting/splitter.py
index 7b9cec8..2a1a592 100644
--- a/src/flydocs/core/services/splitting/splitter.py
+++ b/src/flydocs/core/services/splitting/splitter.py
@@ -3,7 +3,7 @@
a file and pin each one to a contiguous, non-overlapping page range.
Pure segmentation: the splitter does **not** decide which caller-declared
-``DocSpec`` each segment matches -- that is the
+:class:`DocumentTypeSpec` each segment matches -- that is the
:class:`flydocs.core.services.classification.DocumentClassifier`'s
job. Keeping the two services separate means a single uploaded file
that happens to contain several documents (a deed + a DNI + a utility
@@ -28,7 +28,7 @@
from pydantic import BaseModel, Field
from flydocs.core.observability import DEFAULT_MIDDLEWARE, timed_agent_run
-from flydocs.interfaces.dtos.doc import DocSpec
+from flydocs.interfaces.dtos.document_type import DocumentTypeSpec
logger = logging.getLogger(__name__)
@@ -96,7 +96,7 @@ async def discover(
document_bytes: bytes,
media_type: str,
page_count: int,
- targets: list[DocSpec],
+ targets: list[DocumentTypeSpec],
intention: str,
model: str | None = None,
) -> SplitResult:
@@ -125,9 +125,9 @@ async def discover(
targets_json = json.dumps(
[
{
- "documentType": d.docType.documentType,
- "description": d.docType.description,
- "country": d.docType.country,
+ "id": d.id,
+ "description": d.description,
+ "country": d.country,
}
for d in targets
],
diff --git a/src/flydocs/core/services/transformations/entity_resolution.py b/src/flydocs/core/services/transformations/entity_resolution.py
index 9477aee..9fb5656 100644
--- a/src/flydocs/core/services/transformations/entity_resolution.py
+++ b/src/flydocs/core/services/transformations/entity_resolution.py
@@ -66,38 +66,36 @@ def apply(
)
return None
- raw = array_field.fieldValueFound if isinstance(array_field.fieldValueFound, list) else []
+ raw = array_field.value if isinstance(array_field.value, list) else []
rows = [r for r in raw if isinstance(r, ExtractedField)]
if not rows:
return None
merged_rows = _dedupe_rows(rows, transformation)
- # Build the replacement array field. We keep the same
- # ``fieldName`` so downstream consumers don't need to special-case
- # the post-transformation shape. ``name=`` / ``value=`` are
- # Pydantic aliases declared on :class:`ExtractedField`; we use
- # them here so static type checkers don't flag the call.
+ # Build the replacement array field. We keep the same field name
+ # so downstream consumers don't need to special-case the
+ # post-transformation shape.
new_array = ExtractedField(
- name=array_field.fieldName,
+ name=array_field.name,
value=merged_rows,
- pagesFound=array_field.pagesFound,
+ pages=array_field.pages,
confidence=array_field.confidence,
bbox=array_field.bbox,
)
if transformation.output_group:
new_group = ExtractedFieldGroup(
- fieldGroupName=transformation.output_group,
- fieldGroupFields=[new_array],
+ name=transformation.output_group,
+ fields=[new_array],
)
groups.append(new_group)
return new_group
# Mutate in place — replace the array field on the existing group.
- for idx, fld in enumerate(target.fieldGroupFields):
+ for idx, fld in enumerate(target.fields):
if fld is array_field:
- target.fieldGroupFields[idx] = new_array
+ target.fields[idx] = new_array
break
return target
@@ -109,15 +107,15 @@ def apply(
def _find_group(groups: list[ExtractedFieldGroup], name: str) -> ExtractedFieldGroup | None:
for g in groups:
- if g.fieldGroupName == name:
+ if g.name == name:
return g
return None
def _find_array_field(group: ExtractedFieldGroup) -> ExtractedField | None:
"""Return the first field whose value is a list (the array row container)."""
- for f in group.fieldGroupFields:
- if isinstance(f.fieldValueFound, list):
+ for f in group.fields:
+ if isinstance(f.value, list):
return f
return None
@@ -138,12 +136,12 @@ def _name_tokens(value: str) -> frozenset[str]:
def _row_value(row: ExtractedField, field_name: str) -> str:
"""Extract a scalar value from a row's sub-fields, by name."""
- inner = row.fieldValueFound if isinstance(row.fieldValueFound, list) else []
+ inner = row.value if isinstance(row.value, list) else []
for sub in inner:
if not isinstance(sub, ExtractedField):
continue
- if sub.fieldName == field_name:
- v = sub.fieldValueFound
+ if sub.name == field_name:
+ v = sub.value
if isinstance(v, (str, int, float)):
return str(v).strip()
return ""
@@ -239,30 +237,30 @@ def _canonicalise(cluster: list[ExtractedField], _t: EntityResolutionTransformat
For each sub-field name found across the cluster, we keep the
"most complete" value -- the longest string, or the first
non-empty value when comparing scalars / multi-type fields. The
- canonical row inherits its ``fieldName`` and bbox-ish metadata
- from the first row in the cluster (the row we encountered first
- in source order).
+ canonical row inherits its ``name`` and bbox-ish metadata from the
+ first row in the cluster (the row we encountered first in source
+ order).
"""
if not cluster:
# Defensive — callers ensure non-empty clusters but keep this safe.
return ExtractedField(name="row", value=None)
base = cluster[0]
- if not isinstance(base.fieldValueFound, list):
+ if not isinstance(base.value, list):
return base
# Collect every sub-field name we've seen, preserving insertion order.
seen_names: list[str] = []
by_name: dict[str, list[ExtractedField]] = {}
for row in cluster:
- inner = row.fieldValueFound if isinstance(row.fieldValueFound, list) else []
+ inner = row.value if isinstance(row.value, list) else []
for sub in inner:
if not isinstance(sub, ExtractedField):
continue
- if sub.fieldName not in by_name:
- seen_names.append(sub.fieldName)
- by_name[sub.fieldName] = []
- by_name[sub.fieldName].append(sub)
+ if sub.name not in by_name:
+ seen_names.append(sub.name)
+ by_name[sub.name] = []
+ by_name[sub.name].append(sub)
merged_subs: list[ExtractedField] = []
for fname in seen_names:
@@ -270,9 +268,9 @@ def _canonicalise(cluster: list[ExtractedField], _t: EntityResolutionTransformat
merged_subs.append(_pick_canonical(candidates))
return ExtractedField(
- name=base.fieldName,
+ name=base.name,
value=merged_subs,
- pagesFound=_merge_pages(cluster),
+ pages=_merge_pages(cluster),
confidence=max((r.confidence for r in cluster), default=0.0),
bbox=base.bbox,
)
@@ -282,7 +280,7 @@ def _pick_canonical(candidates: list[ExtractedField]) -> ExtractedField:
"""Of N candidate sub-fields for the same name, return the 'best' value."""
def score(sub: ExtractedField) -> tuple[int, int]:
- v = sub.fieldValueFound
+ v = sub.value
if isinstance(v, str):
return (1, len(v.strip()))
if isinstance(v, (int, float)):
@@ -297,11 +295,11 @@ def score(sub: ExtractedField) -> tuple[int, int]:
def _merge_pages(cluster: list[ExtractedField]) -> list[int]:
- """Union of all pagesFound across a cluster, preserving order."""
+ """Union of all pages across a cluster, preserving order."""
seen: set[int] = set()
out: list[int] = []
for row in cluster:
- for p in row.pagesFound or []:
+ for p in row.pages or []:
if p not in seen:
seen.add(p)
out.append(p)
diff --git a/src/flydocs/core/services/transformations/llm_transformer.py b/src/flydocs/core/services/transformations/llm_transformer.py
index baef8ae..b071834 100644
--- a/src/flydocs/core/services/transformations/llm_transformer.py
+++ b/src/flydocs/core/services/transformations/llm_transformer.py
@@ -85,7 +85,7 @@ async def apply(
)
return None
- raw = array_field.fieldValueFound if isinstance(array_field.fieldValueFound, list) else []
+ raw = array_field.value if isinstance(array_field.value, list) else []
rows = [r for r in raw if isinstance(r, ExtractedField)]
if not rows:
return None
@@ -126,24 +126,24 @@ async def apply(
produced_rows = _rebuild_rows(run_result.output.rows, rows[0])
new_array = ExtractedField(
- name=array_field.fieldName,
+ name=array_field.name,
value=produced_rows,
- pagesFound=array_field.pagesFound,
+ pages=array_field.pages,
confidence=array_field.confidence,
bbox=array_field.bbox,
)
if transformation.output_group:
new_group = ExtractedFieldGroup(
- fieldGroupName=transformation.output_group,
- fieldGroupFields=[new_array],
+ name=transformation.output_group,
+ fields=[new_array],
)
groups.append(new_group)
return new_group
- for idx, fld in enumerate(target.fieldGroupFields):
+ for idx, fld in enumerate(target.fields):
if fld is array_field:
- target.fieldGroupFields[idx] = new_array
+ target.fields[idx] = new_array
break
return target
@@ -155,26 +155,26 @@ async def apply(
def _find_group(groups: list[ExtractedFieldGroup], name: str) -> ExtractedFieldGroup | None:
for g in groups:
- if g.fieldGroupName == name:
+ if g.name == name:
return g
return None
def _find_array_field(group: ExtractedFieldGroup) -> ExtractedField | None:
- for f in group.fieldGroupFields:
- if isinstance(f.fieldValueFound, list):
+ for f in group.fields:
+ if isinstance(f.value, list):
return f
return None
def _serialise_row(row: ExtractedField) -> dict[str, Any]:
"""Flatten a row to a JSON dict the LLM can read."""
- inner = row.fieldValueFound if isinstance(row.fieldValueFound, list) else []
+ inner = row.value if isinstance(row.value, list) else []
out: dict[str, Any] = {}
for sub in inner:
if not isinstance(sub, ExtractedField):
continue
- out[sub.fieldName] = sub.fieldValueFound
+ out[sub.name] = sub.value
return out
@@ -187,8 +187,8 @@ def _rebuild_rows(llm_rows: list[_TransformRow], template_row: ExtractedField) -
names produced by the LLM are added as fresh ExtractedField
children with default metadata.
"""
- template_subs = template_row.fieldValueFound if isinstance(template_row.fieldValueFound, list) else []
- template_by_name = {s.fieldName: s for s in template_subs if isinstance(s, ExtractedField)}
+ template_subs = template_row.value if isinstance(template_row.value, list) else []
+ template_by_name = {s.name: s for s in template_subs if isinstance(s, ExtractedField)}
materialised: list[ExtractedField] = []
for i, lr in enumerate(llm_rows):
@@ -199,7 +199,7 @@ def _rebuild_rows(llm_rows: list[_TransformRow], template_row: ExtractedField) -
ExtractedField(
name=name,
value=value,
- pagesFound=tmpl.pagesFound if tmpl else [],
+ pages=tmpl.pages if tmpl else [],
confidence=tmpl.confidence if tmpl else 0.0,
bbox=tmpl.bbox if tmpl else template_row.bbox,
)
@@ -208,7 +208,7 @@ def _rebuild_rows(llm_rows: list[_TransformRow], template_row: ExtractedField) -
ExtractedField(
name=f"row_{i + 1}",
value=sub_fields,
- pagesFound=template_row.pagesFound,
+ pages=template_row.pages,
confidence=template_row.confidence,
bbox=template_row.bbox,
)
diff --git a/src/flydocs/core/services/transformations/transformation_engine.py b/src/flydocs/core/services/transformations/transformation_engine.py
index d97c6fd..db550fa 100644
--- a/src/flydocs/core/services/transformations/transformation_engine.py
+++ b/src/flydocs/core/services/transformations/transformation_engine.py
@@ -149,14 +149,14 @@ def _consolidate_groups(
found_any = False
for task_groups in per_task_groups:
for g in task_groups:
- if g.fieldGroupName != target_name:
+ if g.name != target_name:
continue
found_any = True
- for f in g.fieldGroupFields:
- if isinstance(f.fieldValueFound, list):
+ for f in g.fields:
+ if isinstance(f.value, list):
if not array_field_name:
- array_field_name = f.fieldName
- all_rows.extend(r for r in f.fieldValueFound if isinstance(r, ExtractedField))
+ array_field_name = f.name
+ all_rows.extend(r for r in f.value if isinstance(r, ExtractedField))
if not found_any or not all_rows:
return None
array_field = ExtractedField(
@@ -164,8 +164,8 @@ def _consolidate_groups(
value=all_rows,
)
return ExtractedFieldGroup(
- fieldGroupName=target_name,
- fieldGroupFields=[array_field],
+ name=target_name,
+ fields=[array_field],
)
diff --git a/src/flydocs/core/services/validation/__init__.py b/src/flydocs/core/services/validation/__init__.py
index 33de9c9..140f1b9 100644
--- a/src/flydocs/core/services/validation/__init__.py
+++ b/src/flydocs/core/services/validation/__init__.py
@@ -4,10 +4,10 @@
Two layers:
* :class:`FieldValidator` runs *after* extraction: regex, range, enum
- and ``StandardValidator`` checks on each ExtractedField.
+ and ``ValidatorSpec`` checks on each ExtractedField.
* :class:`RequestValidator` runs *before* the pipeline: semantic
cross-field checks that pydantic can't express (rule parents that
- reference unknown docTypes / fields, cycles in the rule DAG,
+ reference unknown document types / fields, cycles in the rule DAG,
duplicate ids, etc.).
"""
@@ -17,10 +17,16 @@
ValidationIssue,
ValidationReport,
)
+from flydocs.core.services.validation.validator_registry import (
+ ValidatorRegistry,
+ run_validator,
+)
__all__ = [
"FieldValidator",
"RequestValidator",
"ValidationIssue",
"ValidationReport",
+ "ValidatorRegistry",
+ "run_validator",
]
diff --git a/src/flydocs/core/services/validation/field_validator.py b/src/flydocs/core/services/validation/field_validator.py
index f446a19..a4d55c0 100644
--- a/src/flydocs/core/services/validation/field_validator.py
+++ b/src/flydocs/core/services/validation/field_validator.py
@@ -1,11 +1,10 @@
# Copyright 2026 Firefly Software Solutions Inc
"""Pure-Python field validator -- evaluates pattern, format, enum,
-minimum, maximum, and every declared :class:`StandardValidatorSpec`
-against every extracted field. No LLM involvement.
+minimum, maximum, and every declared :class:`ValidatorSpec` against
+every extracted field. No LLM involvement.
The verdict is attached directly to each :class:`ExtractedField` via
-its ``field_validation`` attribute, so consumers never need a parallel
-tree.
+its ``validation`` attribute, so consumers never need a parallel tree.
"""
from __future__ import annotations
@@ -16,17 +15,16 @@
from urllib.parse import urlparse
from uuid import UUID as PyUUID
-from flydocs.core.services.validation.standard_validator_registry import run_standard_validator
+from flydocs.core.services.validation.validator_registry import run_validator
from flydocs.interfaces.dtos.field import (
ExtractedField,
ExtractedFieldGroup,
+ Field,
FieldGroup,
- FieldItem,
- FieldSpec,
FieldValidation,
FieldValidationError,
)
-from flydocs.interfaces.dtos.standard_validator import StandardValidatorSpec
+from flydocs.interfaces.dtos.validator import ValidatorSpec
from flydocs.interfaces.enums.field_type import FieldType, StandardFormat
from flydocs.interfaces.enums.status import ValidationRule
@@ -39,17 +37,17 @@ def validate(
spec_groups: list[FieldGroup],
extracted_groups: list[ExtractedFieldGroup],
) -> list[ExtractedFieldGroup]:
- """Return *extracted_groups* with each field's ``field_validation`` populated."""
+ """Return *extracted_groups* with each field's ``validation`` populated."""
if not spec_groups or not extracted_groups:
return extracted_groups
- by_group: dict[str, FieldGroup] = {g.fieldGroupName: g for g in spec_groups}
+ by_group: dict[str, FieldGroup] = {g.name: g for g in spec_groups}
for group in extracted_groups:
- spec_group = by_group.get(group.fieldGroupName)
+ spec_group = by_group.get(group.name)
if spec_group is None:
continue
- spec_by_name: dict[str, FieldSpec] = {s.fieldName: s for s in spec_group.fieldGroupFields}
- for field in group.fieldGroupFields:
- spec = spec_by_name.get(field.fieldName)
+ spec_by_name: dict[str, Field] = {s.name: s for s in spec_group.fields}
+ for field in group.fields:
+ spec = spec_by_name.get(field.name)
if spec is None:
continue
self._validate_field(spec, field)
@@ -57,53 +55,58 @@ def validate(
# ----------------------------------------------------------- private
- def _validate_field(self, spec: FieldSpec, field: ExtractedField) -> None:
- if spec.fieldType == FieldType.ARRAY:
+ def _validate_field(self, spec: Field, field: ExtractedField) -> None:
+ if spec.type == FieldType.ARRAY:
self._validate_array(spec, field)
return
errors = self._run_constraints(
- field_type=spec.fieldType,
+ field_type=spec.type,
pattern=spec.pattern,
fmt=spec.format,
enum=spec.enum,
minimum=spec.minimum,
maximum=spec.maximum,
- standard_validators=spec.standard_validators,
- value=field.fieldValueFound,
+ validators=spec.validators,
+ value=field.value,
)
# ``severity=warning`` validators record errors but don't flip ``valid``.
hard_errors = [e for e in errors if not e.message.endswith("[warning]")]
- field.field_validation = FieldValidation(valid=not hard_errors, errors=errors)
+ field.validation = FieldValidation(valid=not hard_errors, errors=errors)
- def _validate_array(self, spec: FieldSpec, field: ExtractedField) -> None:
+ def _validate_array(self, spec: Field, field: ExtractedField) -> None:
row_errors: list[FieldValidationError] = []
- rows = field.fieldValueFound if isinstance(field.fieldValueFound, list) else []
- item_specs: dict[str, FieldItem] = {item.fieldName: item for item in (spec.items or [])}
+ rows = field.value if isinstance(field.value, list) else []
+ items_spec = spec.items
+ # The recursive Field for an array element is typically
+ # ``type = object`` -- iterate its declared sub-fields.
+ sub_specs: dict[str, Field] = {}
+ if items_spec is not None and items_spec.fields:
+ sub_specs = {s.name: s for s in items_spec.fields}
all_valid = True
for row in rows:
- if not isinstance(row, ExtractedField) or not isinstance(row.fieldValueFound, list):
+ if not isinstance(row, ExtractedField) or not isinstance(row.value, list):
continue
- for sub_field in row.fieldValueFound:
+ for sub_field in row.value:
if not isinstance(sub_field, ExtractedField):
continue
- item = item_specs.get(sub_field.fieldName)
- if item is None:
+ sub_spec = sub_specs.get(sub_field.name)
+ if sub_spec is None:
continue
errors = self._run_constraints(
- field_type=item.fieldType,
- pattern=item.pattern,
- fmt=item.format,
- enum=item.enum,
- minimum=item.minimum,
- maximum=item.maximum,
- standard_validators=item.standard_validators,
- value=sub_field.fieldValueFound,
+ field_type=sub_spec.type,
+ pattern=sub_spec.pattern,
+ fmt=sub_spec.format,
+ enum=sub_spec.enum,
+ minimum=sub_spec.minimum,
+ maximum=sub_spec.maximum,
+ validators=sub_spec.validators,
+ value=sub_field.value,
)
hard_errors = [e for e in errors if not e.message.endswith("[warning]")]
- sub_field.field_validation = FieldValidation(valid=not hard_errors, errors=errors)
+ sub_field.validation = FieldValidation(valid=not hard_errors, errors=errors)
if hard_errors:
all_valid = False
- field.field_validation = FieldValidation(valid=all_valid, errors=row_errors)
+ field.validation = FieldValidation(valid=all_valid, errors=row_errors)
def _run_constraints(
self,
@@ -114,7 +117,7 @@ def _run_constraints(
enum: list[Any] | None,
minimum: float | None,
maximum: float | None,
- standard_validators: list[StandardValidatorSpec],
+ validators: list[ValidatorSpec],
value: Any,
) -> list[FieldValidationError]:
if value is None:
@@ -146,14 +149,14 @@ def _run_constraints(
err = self._validate_maximum(maximum, value)
if err is not None:
errors.append(err)
- for sv in standard_validators or []:
- message = run_standard_validator(sv.type, value, sv.params)
+ for sv in validators or []:
+ message = run_validator(sv.name, value, sv.params)
if message is not None:
suffix = " [warning]" if sv.severity == "warning" else ""
errors.append(
FieldValidationError(
- rule=ValidationRule.STANDARD,
- message=f"{sv.type.value}: {message}{suffix}",
+ rule=ValidationRule.VALIDATOR,
+ message=f"{sv.name.value}: {message}{suffix}",
)
)
return errors
diff --git a/src/flydocs/core/services/validation/request_validator.py b/src/flydocs/core/services/validation/request_validator.py
index fcfdf49..e44ada0 100644
--- a/src/flydocs/core/services/validation/request_validator.py
+++ b/src/flydocs/core/services/validation/request_validator.py
@@ -5,17 +5,17 @@
``min_length``). This validator catches the *semantic* mistakes a caller
can make even with a well-formed JSON body:
- * a ``RuleSpec.parents`` entry that points to a ``documentType`` not
- declared in ``docs``,
- * field names that don't exist in the referenced DocSpec,
- * a rule that depends on a validator name not declared on that doc,
+ * a ``RuleSpec.parents`` entry that points to a ``document_type`` not
+ declared in ``document_types``,
+ * field names that don't exist in the referenced :class:`DocumentTypeSpec`,
+ * a rule that depends on a validator name not declared on that document type,
* a rule that depends on another rule's ``id`` not present in the request,
* duplicate rule ids,
* cycles in the rule DAG (detected before the rule engine is invoked),
* ``stages.rule_engine`` toggled without any rule.
Each issue carries a ``severity`` (``error`` or ``warning``); the
-controller layer raises ``422 invalid_request`` with the RFC 7807
+controller layer raises ``422 validation_failed`` with the RFC 7807
problem-detail body when at least one error is present. Warnings are
returned but don't block the request.
"""
@@ -83,14 +83,14 @@ class RequestValidator:
"""Pre-flight checks before the pipeline runs.
Stateless and side-effect-free. Registered as a pyfly @bean so the
- controllers, the job submit handler, and the test suite all share
- one instance.
+ controllers, the extraction submit handler, and the test suite all
+ share one instance.
"""
def validate(self, request: ExtractionRequest) -> ValidationReport:
report = ValidationReport()
self._check_files(request, report)
- self._check_docs(request, report)
+ self._check_document_types(request, report)
self._check_rule_references(request, report)
self._check_rule_dag(request, report)
self._check_stage_consistency(request, report)
@@ -99,58 +99,61 @@ def validate(self, request: ExtractionRequest) -> ValidationReport:
# -- file-level checks (multi-file shape) ----------------------------
def _check_files(self, request: ExtractionRequest, report: ValidationReport) -> None:
- known_types = {d.docType.documentType for d in request.docs}
- for f_index, file in enumerate(request.documents):
- if not file.document_type:
+ known_types = {d.id for d in request.document_types}
+ for f_index, file in enumerate(request.files):
+ if not file.expected_type:
continue
- if file.document_type not in known_types:
+ if file.expected_type not in known_types:
report.issues.append(
ValidationIssue(
severity="error",
code="document_type_unknown",
message=(
- f"File {file.filename!r} pins document_type "
- f"{file.document_type!r} which is not declared in "
- "docs[]."
+ f"File {file.filename!r} pins expected_type "
+ f"{file.expected_type!r} which is not declared in "
+ "document_types[]."
),
- path=f"documents[{f_index}].document_type",
+ path=f"files[{f_index}].expected_type",
)
)
- # -- doc-level checks ------------------------------------------------
+ # -- document-type-level checks --------------------------------------
- def _check_docs(self, request: ExtractionRequest, report: ValidationReport) -> None:
- # Pydantic already rejects empty docs[] (min_length=1) but each
- # DocSpec may still have an empty fieldGroups or empty fieldFields.
+ def _check_document_types(self, request: ExtractionRequest, report: ValidationReport) -> None:
+ # Pydantic already rejects empty document_types[] (min_length=1)
+ # but each DocumentTypeSpec may still have an empty field_groups
+ # or empty group.fields.
seen_doctypes: dict[str, int] = {}
- for d_index, doc in enumerate(request.docs):
- path = f"docs[{d_index}].docType.documentType"
- doc_type = doc.docType.documentType
+ for d_index, doc in enumerate(request.document_types):
+ path = f"document_types[{d_index}].id"
+ doc_type = doc.id
seen_doctypes[doc_type] = seen_doctypes.get(doc_type, 0) + 1
- if not doc.fieldGroups:
+ if not doc.field_groups:
report.issues.append(
ValidationIssue(
severity="error",
code="empty_field_groups",
- message=f"DocSpec {doc_type!r} declares no fieldGroups -- nothing to extract.",
- path=f"docs[{d_index}].fieldGroups",
+ message=(
+ f"DocumentTypeSpec {doc_type!r} declares no field_groups -- nothing to extract."
+ ),
+ path=f"document_types[{d_index}].field_groups",
)
)
continue
- for g_index, group in enumerate(doc.fieldGroups):
- if not group.fieldGroupFields:
+ for g_index, group in enumerate(doc.field_groups):
+ if not group.fields:
report.issues.append(
ValidationIssue(
severity="error",
code="empty_field_group",
message=(
- f"DocSpec {doc_type!r} fieldGroup {group.fieldGroupName!r} has no fields."
+ f"DocumentTypeSpec {doc_type!r} field group {group.name!r} has no fields."
),
- path=f"docs[{d_index}].fieldGroups[{g_index}].fieldGroupFields",
+ path=f"document_types[{d_index}].field_groups[{g_index}].fields",
)
)
- # Duplicate field names within the same doc.
- all_names: list[str] = [f.fieldName for g in doc.fieldGroups for f in g.fieldGroupFields]
+ # Duplicate field names within the same document type.
+ all_names: list[str] = [f.name for g in doc.field_groups for f in g.fields]
seen: set[str] = set()
for name in all_names:
if name in seen:
@@ -158,13 +161,15 @@ def _check_docs(self, request: ExtractionRequest, report: ValidationReport) -> N
ValidationIssue(
severity="error",
code="duplicate_field_name",
- message=(f"DocSpec {doc_type!r} declares fieldName {name!r} more than once."),
+ message=(
+ f"DocumentTypeSpec {doc_type!r} declares field name {name!r} more than once."
+ ),
path=path,
)
)
seen.add(name)
- # Duplicate documentType across docs.
+ # Duplicate document type id across declarations.
for doc_type, count in seen_doctypes.items():
if count > 1:
report.issues.append(
@@ -172,10 +177,10 @@ def _check_docs(self, request: ExtractionRequest, report: ValidationReport) -> N
severity="error",
code="duplicate_document_type",
message=(
- f"documentType {doc_type!r} declared {count} times in "
- "docs[]; document types must be unique."
+ f"document type {doc_type!r} declared {count} times in "
+ "document_types[]; ids must be unique."
),
- path="docs[].docType.documentType",
+ path="document_types[].id",
)
)
@@ -183,13 +188,12 @@ def _check_docs(self, request: ExtractionRequest, report: ValidationReport) -> N
def _check_rule_references(self, request: ExtractionRequest, report: ValidationReport) -> None:
# Catalog what's declared so rule parents can be resolved.
- doc_index = {doc.docType.documentType: doc for doc in request.docs}
+ doc_index = {doc.id: doc for doc in request.document_types}
fields_per_doc: dict[str, set[str]] = {
- dt: {f.fieldName for g in d.fieldGroups for f in g.fieldGroupFields}
- for dt, d in doc_index.items()
+ dt: {f.name for g in d.field_groups for f in g.fields} for dt, d in doc_index.items()
}
validators_per_doc: dict[str, set[str]] = {
- dt: {v.name for v in d.validators.visual} for dt, d in doc_index.items()
+ dt: {v.name for v in d.visual_checks} for dt, d in doc_index.items()
}
rule_ids = {r.id for r in request.rules}
@@ -210,21 +214,22 @@ def _check_rule_references(self, request: ExtractionRequest, report: ValidationR
parent_path = f"rules[{r_index}].parents[{p_index}]"
if isinstance(parent, RuleFieldParent):
- if parent.documentType not in doc_index:
+ if parent.document_type not in doc_index:
report.issues.append(
ValidationIssue(
severity="error",
code="rule_unknown_doctype",
message=(
- f"Rule {rule.id!r} references documentType "
- f"{parent.documentType!r} which is not declared in docs[]."
+ f"Rule {rule.id!r} references document_type "
+ f"{parent.document_type!r} which is not declared in "
+ "document_types[]."
),
path=parent_path,
)
)
continue
- known = fields_per_doc.get(parent.documentType, set())
- for fn in parent.fieldNames:
+ known = fields_per_doc.get(parent.document_type, set())
+ for fn in parent.fields:
if fn not in known:
report.issues.append(
ValidationIssue(
@@ -232,59 +237,60 @@ def _check_rule_references(self, request: ExtractionRequest, report: ValidationR
code="rule_unknown_field",
message=(
f"Rule {rule.id!r} references field "
- f"{fn!r} on documentType "
- f"{parent.documentType!r}, but that doc "
- "doesn't declare such a field."
+ f"{fn!r} on document_type "
+ f"{parent.document_type!r}, but that document "
+ "type doesn't declare such a field."
),
path=parent_path,
)
)
elif isinstance(parent, RuleValidatorParent):
- if parent.documentType not in doc_index:
+ if parent.document_type not in doc_index:
report.issues.append(
ValidationIssue(
severity="error",
code="rule_unknown_doctype",
message=(
- f"Rule {rule.id!r} references documentType "
- f"{parent.documentType!r} which is not declared in docs[]."
+ f"Rule {rule.id!r} references document_type "
+ f"{parent.document_type!r} which is not declared in "
+ "document_types[]."
),
path=parent_path,
)
)
continue
- known = validators_per_doc.get(parent.documentType, set())
- if parent.validatorName not in known:
+ known = validators_per_doc.get(parent.document_type, set())
+ if parent.validator not in known:
report.issues.append(
ValidationIssue(
severity="error",
code="rule_unknown_validator",
message=(
f"Rule {rule.id!r} references validator "
- f"{parent.validatorName!r} on documentType "
- f"{parent.documentType!r}, but that doc "
- "doesn't declare such a visual validator."
+ f"{parent.validator!r} on document_type "
+ f"{parent.document_type!r}, but that document "
+ "type doesn't declare such a visual check."
),
path=parent_path,
)
)
elif isinstance(parent, RuleRuleParent):
- if parent.ruleId not in rule_ids:
+ if parent.rule not in rule_ids:
report.issues.append(
ValidationIssue(
severity="error",
code="rule_unknown_parent",
message=(
f"Rule {rule.id!r} declares parent rule "
- f"{parent.ruleId!r} which is not present in "
+ f"{parent.rule!r} which is not present in "
"the request."
),
path=parent_path,
)
)
- elif parent.ruleId == rule.id:
+ elif parent.rule == rule.id:
report.issues.append(
ValidationIssue(
severity="error",
@@ -300,9 +306,7 @@ def _check_rule_dag(self, request: ExtractionRequest, report: ValidationReport)
rule_ids = {r.id for r in request.rules}
sorter: TopologicalSorter[str] = TopologicalSorter()
for rule in request.rules:
- parents = [
- p.ruleId for p in rule.parents if isinstance(p, RuleRuleParent) and p.ruleId in rule_ids
- ]
+ parents = [p.rule for p in rule.parents if isinstance(p, RuleRuleParent) and p.rule in rule_ids]
sorter.add(rule.id, *parents)
try:
sorter.prepare()
@@ -338,30 +342,31 @@ def _check_stage_consistency(self, request: ExtractionRequest, report: Validatio
)
)
- # visual_authenticity on but no visual validators anywhere => warn.
+ # visual_authenticity on but no visual_checks anywhere => warn.
if stages.visual_authenticity:
- any_visual = any(bool(d.validators.visual) for d in request.docs)
+ any_visual = any(bool(d.visual_checks) for d in request.document_types)
if not any_visual:
report.issues.append(
ValidationIssue(
severity="warning",
code="visual_authenticity_no_validators",
message=(
- "stages.visual_authenticity is enabled but no DocSpec declares visual validators."
+ "stages.visual_authenticity is enabled but no DocumentTypeSpec "
+ "declares visual_checks."
),
path="options.stages.visual_authenticity",
)
)
- # splitter on but only one doc => the stage will short-circuit. Warn.
- if stages.splitter and len(request.docs) <= 1:
+ # splitter on but only one document type => the stage will short-circuit. Warn.
+ if stages.splitter and len(request.document_types) <= 1:
report.issues.append(
ValidationIssue(
severity="warning",
code="splitter_single_doc",
message=(
"stages.splitter is enabled but the request declares "
- "only one DocSpec -- the splitter will short-circuit."
+ "only one DocumentTypeSpec -- the splitter will short-circuit."
),
path="options.stages.splitter",
)
diff --git a/src/flydocs/core/services/validation/standard_validator_registry.py b/src/flydocs/core/services/validation/validator_registry.py
similarity index 86%
rename from src/flydocs/core/services/validation/standard_validator_registry.py
rename to src/flydocs/core/services/validation/validator_registry.py
index 382c75c..cc988c0 100644
--- a/src/flydocs/core/services/validation/standard_validator_registry.py
+++ b/src/flydocs/core/services/validation/validator_registry.py
@@ -1,10 +1,10 @@
# Copyright 2026 Firefly Software Solutions Inc
-"""Pure-Python implementations for every :class:`StandardValidatorType`.
+"""Pure-Python implementations for every :class:`ValidatorType`.
Each checker is a function ``(value: Any, params: dict) -> str | None``
that returns ``None`` on success or a human-readable error message on
failure. The :class:`FieldValidator` looks the function up by
-``StandardValidatorType`` and runs it after the simple constraint set.
+``ValidatorType`` and runs it after the simple constraint set.
"""
from __future__ import annotations
@@ -17,7 +17,7 @@
from urllib.parse import urlparse
from uuid import UUID
-from flydocs.interfaces.enums.standard_validator import StandardValidatorType
+from flydocs.interfaces.enums.validator import ValidatorType
# ---------------------------------------------------------------------------
# Low-level helpers
@@ -473,43 +473,58 @@ def _check_passport_number(value: Any, _: dict) -> str | None:
CHECKERS = {
- StandardValidatorType.EMAIL: _check_email,
- StandardValidatorType.URI: _check_uri,
- StandardValidatorType.URL: _check_url,
- StandardValidatorType.IPV4: _check_ipv4,
- StandardValidatorType.IPV6: _check_ipv6,
- StandardValidatorType.DOMAIN: _check_domain,
- StandardValidatorType.SLUG: _check_slug,
- StandardValidatorType.DATE: _check_date,
- StandardValidatorType.DATETIME: _check_datetime,
- StandardValidatorType.TIME: _check_time,
- StandardValidatorType.ISO_8601: _check_iso_8601,
- StandardValidatorType.UUID: _check_uuid,
- StandardValidatorType.JSON: _check_json,
- StandardValidatorType.HEX_COLOR: _check_hex_color,
- StandardValidatorType.IBAN: _check_iban,
- StandardValidatorType.BIC: _check_bic,
- StandardValidatorType.CREDIT_CARD: _check_credit_card,
- StandardValidatorType.CURRENCY_CODE: _check_currency_code,
- StandardValidatorType.AMOUNT: _check_amount,
- StandardValidatorType.PHONE_E164: _check_phone_e164,
- StandardValidatorType.COUNTRY_CODE: _check_country_code,
- StandardValidatorType.LANGUAGE_CODE: _check_language_code,
- StandardValidatorType.POSTAL_CODE: _check_postal_code,
- StandardValidatorType.LATITUDE: _check_latitude,
- StandardValidatorType.LONGITUDE: _check_longitude,
- StandardValidatorType.NIF: _check_nif,
- StandardValidatorType.NIE: _check_nie,
- StandardValidatorType.CIF: _check_cif,
- StandardValidatorType.VAT_ID: _check_vat_id,
- StandardValidatorType.SSN: _check_ssn,
- StandardValidatorType.PASSPORT_NUMBER: _check_passport_number,
+ ValidatorType.EMAIL: _check_email,
+ ValidatorType.URI: _check_uri,
+ ValidatorType.URL: _check_url,
+ ValidatorType.IPV4: _check_ipv4,
+ ValidatorType.IPV6: _check_ipv6,
+ ValidatorType.DOMAIN: _check_domain,
+ ValidatorType.SLUG: _check_slug,
+ ValidatorType.DATE: _check_date,
+ ValidatorType.DATETIME: _check_datetime,
+ ValidatorType.TIME: _check_time,
+ ValidatorType.ISO_8601: _check_iso_8601,
+ ValidatorType.UUID: _check_uuid,
+ ValidatorType.JSON: _check_json,
+ ValidatorType.HEX_COLOR: _check_hex_color,
+ ValidatorType.IBAN: _check_iban,
+ ValidatorType.BIC: _check_bic,
+ ValidatorType.CREDIT_CARD: _check_credit_card,
+ ValidatorType.CURRENCY_CODE: _check_currency_code,
+ ValidatorType.AMOUNT: _check_amount,
+ ValidatorType.PHONE_E164: _check_phone_e164,
+ ValidatorType.COUNTRY_CODE: _check_country_code,
+ ValidatorType.LANGUAGE_CODE: _check_language_code,
+ ValidatorType.POSTAL_CODE: _check_postal_code,
+ ValidatorType.LATITUDE: _check_latitude,
+ ValidatorType.LONGITUDE: _check_longitude,
+ ValidatorType.NIF: _check_nif,
+ ValidatorType.NIE: _check_nie,
+ ValidatorType.CIF: _check_cif,
+ ValidatorType.VAT_ID: _check_vat_id,
+ ValidatorType.SSN: _check_ssn,
+ ValidatorType.PASSPORT_NUMBER: _check_passport_number,
}
-def run_standard_validator(validator_type: StandardValidatorType, value: Any, params: dict) -> str | None:
+def run_validator(validator_type: ValidatorType, value: Any, params: dict) -> str | None:
"""Look the checker up and run it. Returns ``None`` on success."""
checker = CHECKERS.get(validator_type)
if checker is None:
- return f"Unknown standard validator: {validator_type.value!r}"
+ return f"Unknown validator: {validator_type.value!r}"
return checker(value, params)
+
+
+class ValidatorRegistry:
+ """Thin object wrapper exposing the validator catalogue as a service.
+
+ Existing callers can use ``run_validator()`` directly; this class is
+ provided for callers that prefer DI-style injection.
+ """
+
+ @staticmethod
+ def run(validator_type: ValidatorType, value: Any, params: dict) -> str | None:
+ return run_validator(validator_type, value, params)
+
+
+__all__ = ["CHECKERS", "ValidatorRegistry", "run_validator"]
diff --git a/src/flydocs/core/services/webhook/webhook_publisher.py b/src/flydocs/core/services/webhook/webhook_publisher.py
index 0dfcffc..1bc7ac9 100644
--- a/src/flydocs/core/services/webhook/webhook_publisher.py
+++ b/src/flydocs/core/services/webhook/webhook_publisher.py
@@ -7,6 +7,10 @@
final outcome. The publisher signs the body with HMAC-SHA256 when a
secret is configured, and propagates any ``extra_headers`` supplied by
the caller (the worker uses this to forward correlation IDs).
+
+The payload shape is the unified :class:`EventEnvelope` -- the same
+model the EDA bus carries. Webhook consumers and EDA consumers see one
+mental model end to end.
"""
from __future__ import annotations
@@ -26,7 +30,7 @@
)
from flydocs.core.observability import log_outbound
-from flydocs.interfaces.dtos.webhook import JobWebhookPayload
+from flydocs.interfaces.dtos.event import EventEnvelope
logger = logging.getLogger(__name__)
@@ -50,7 +54,7 @@ def __init__(
async def deliver(
self,
url: str,
- payload: JobWebhookPayload,
+ payload: EventEnvelope,
*,
extra_headers: dict[str, str] | None = None,
) -> bool:
@@ -80,6 +84,7 @@ async def deliver(
headers[self._signature_header] = f"{self._signature_scheme}={digest}"
attempt_counter = {"n": 0}
+ extraction_id = payload.extraction.id
@retry(
reraise=True,
@@ -105,7 +110,7 @@ async def _do_post() -> bool:
latency_ms=latency_ms,
url=url,
attempt=attempt,
- job_id=payload.job_id,
+ extraction_id=extraction_id,
correlation_id=correlation_id,
error=type(exc).__name__,
)
@@ -121,7 +126,7 @@ async def _do_post() -> bool:
url=url,
attempt=attempt,
http_status=http_status,
- job_id=payload.job_id,
+ extraction_id=extraction_id,
correlation_id=correlation_id,
)
raise _RetryableWebhook(f"webhook {url} returned retryable status {http_status}")
@@ -134,7 +139,7 @@ async def _do_post() -> bool:
url=url,
attempt=attempt,
http_status=http_status,
- job_id=payload.job_id,
+ extraction_id=extraction_id,
correlation_id=correlation_id,
)
logger.error(
@@ -152,7 +157,7 @@ async def _do_post() -> bool:
url=url,
attempt=attempt,
http_status=http_status,
- job_id=payload.job_id,
+ extraction_id=extraction_id,
correlation_id=correlation_id,
)
return True
@@ -167,7 +172,7 @@ async def _do_post() -> bool:
latency_ms=0.0,
url=url,
attempts=attempt_counter["n"],
- job_id=payload.job_id,
+ extraction_id=extraction_id,
error=type(exc).__name__,
)
logger.error("Webhook %s exhausted retries: %s", url, exc)
@@ -180,7 +185,7 @@ async def _do_post() -> bool:
latency_ms=0.0,
url=url,
attempts=attempt_counter["n"],
- job_id=payload.job_id,
+ extraction_id=extraction_id,
error=type(exc).__name__,
)
logger.error("Webhook %s transport error: %s", url, exc)
diff --git a/src/flydocs/core/services/workers/__init__.py b/src/flydocs/core/services/workers/__init__.py
index ff88bd2..1006793 100644
--- a/src/flydocs/core/services/workers/__init__.py
+++ b/src/flydocs/core/services/workers/__init__.py
@@ -1,6 +1,16 @@
# Copyright 2026 Firefly Software Solutions Inc
-"""Background worker -- consumes the job queue and runs the pipeline."""
+"""Background workers -- consume the EDA bus and run pipeline + post-processing."""
-from flydocs.core.services.workers.job_worker import JobWorker
+from flydocs.core.services.workers.bbox_reaper import BboxReaper
+from flydocs.core.services.workers.bbox_refine_worker import BboxRefineWorker
+from flydocs.core.services.workers.job_reaper import ExtractionReaper, JobReaper
+from flydocs.core.services.workers.job_worker import ExtractionWorker, JobWorker
-__all__ = ["JobWorker"]
+__all__ = [
+ "BboxReaper",
+ "BboxRefineWorker",
+ "ExtractionReaper",
+ "ExtractionWorker",
+ "JobReaper",
+ "JobWorker",
+]
diff --git a/src/flydocs/core/services/workers/bbox_reaper.py b/src/flydocs/core/services/workers/bbox_reaper.py
index 35b09cb..8d970cb 100644
--- a/src/flydocs/core/services/workers/bbox_reaper.py
+++ b/src/flydocs/core/services/workers/bbox_reaper.py
@@ -1,18 +1,20 @@
# Copyright 2026 Firefly Software Solutions Inc
"""``BboxReaper`` -- periodic sweep for orphaned bbox-refine legs.
-Bbox-leg analogue of :class:`JobReaper`. Two orphan classes:
-
-* ``REFINING_BBOXES`` with stale ``bbox_refine_started_at`` -- the
- bbox worker that claimed the leg crashed past its lease.
-* ``PARTIAL_SUCCEEDED`` with ``bbox_refine_status='pending'`` -- the
- initial bbox event was never published (main worker crashed
- between ``mark_partial_succeeded`` and ``publisher.publish``), or
- a prior bbox-leg retry's ``_delayed_publish`` task was lost.
-
-Both are revived by republishing a fresh ``IDPBboxRefineRequested``
-event; the bbox worker's atomic ``mark_bbox_refining`` claim dedupes
-duplicate publishes from concurrent replicas.
+Bbox-leg analogue of :class:`ExtractionReaper`. Two orphan classes:
+
+* ``post_processing_bbox_status=running`` with stale
+ ``post_processing_bbox_started_at`` -- the bbox worker that claimed
+ the leg crashed past its lease.
+* ``post_processing_bbox_status=pending`` -- the initial bbox event
+ was never published (main worker crashed between ``mark_succeeded``
+ and ``publisher.publish``), or a prior bbox-leg retry's
+ ``_delayed_publish`` task was lost.
+
+Both are revived by republishing a fresh
+``extraction.post_processing.requested`` event; the bbox worker's
+atomic ``claim_bbox_refinement`` dedupes duplicate publishes from
+concurrent replicas.
"""
from __future__ import annotations
@@ -26,22 +28,24 @@
from flydocs.config import IDPSettings
from flydocs.core.observability import log_outbound
+from flydocs.core.services.extractions._projector import row_to_extraction
from flydocs.interfaces.dtos.event import (
- IDPBboxRefineRequestedEvent,
+ EVENT_TYPE_EXTRACTION_POST_PROCESSING_REQUESTED,
+ EventEnvelope,
envelope_for_publish,
)
-from flydocs.models.repositories import ExtractionJobRepository
+from flydocs.models.repositories import ExtractionRepository
logger = logging.getLogger(__name__)
class BboxReaper:
- """Periodic sweep for orphaned PARTIAL_SUCCEEDED / REFINING_BBOXES jobs."""
+ """Periodic sweep for orphaned bbox-refine legs."""
def __init__(
self,
*,
- repository: ExtractionJobRepository,
+ repository: ExtractionRepository,
event_publisher: EventPublisher,
settings: IDPSettings,
consumer_id: str | None = None,
@@ -54,7 +58,7 @@ def __init__(
async def run_forever(self) -> None:
logger.info(
- "BboxReaper %s started (interval=%ds, refine_lease=%ds, partial_threshold=%ds)",
+ "BboxReaper %s started (interval=%ds, refine_lease=%ds, pending_threshold=%ds)",
self._consumer_id,
self._settings.reaper_sweep_interval_s,
self._settings.bbox_refine_lease_s,
@@ -75,31 +79,40 @@ def stop(self) -> None:
self._stop.set()
async def _sweep(self) -> None:
- stale_refining = await self._repository.find_stale_refining_bboxes(
+ stale_refining = await self._repository.find_stale_bbox_refining(
lease_seconds=self._settings.bbox_refine_lease_s
)
- for job_id in stale_refining:
- await self._republish(job_id, reason="stale_refining_bboxes")
+ for extraction_id in stale_refining:
+ await self._republish(extraction_id, reason="stale_running_bbox")
pending_orphans = await self._repository.find_pending_bbox_revive(
- partial_threshold_seconds=self._settings.partial_succeeded_orphan_threshold_s,
+ pending_threshold_seconds=self._settings.partial_succeeded_orphan_threshold_s,
bbox_lease_seconds=self._settings.bbox_refine_lease_s,
)
- for job_id in pending_orphans:
- await self._republish(job_id, reason="orphan_partial_succeeded")
+ for extraction_id in pending_orphans:
+ await self._republish(extraction_id, reason="orphan_pending_bbox")
- async def _republish(self, job_id: str, *, reason: str) -> None:
- event = IDPBboxRefineRequestedEvent(job_id=job_id, attempt=1)
+ async def _republish(self, extraction_id: str, *, reason: str) -> None:
+ row = await self._repository.get(extraction_id)
+ if row is None:
+ return
+ envelope = EventEnvelope(
+ event_type=EVENT_TYPE_EXTRACTION_POST_PROCESSING_REQUESTED,
+ extraction=row_to_extraction(row),
+ )
await self._publisher.publish(
destination=self._settings.bbox_refine_topic,
- event_type=self._settings.bbox_refine_event_type,
- payload=envelope_for_publish(event),
+ event_type=EVENT_TYPE_EXTRACTION_POST_PROCESSING_REQUESTED,
+ payload=envelope_for_publish(envelope),
)
log_outbound(
"bbox-reaper",
- op="republish.bbox_refine",
+ op="republish.post_processing",
status="ok",
latency_ms=0.0,
- job_id=job_id,
+ extraction_id=extraction_id,
reason=reason,
)
- logger.info("BboxReaper republished job %s (%s)", job_id, reason)
+ logger.info("BboxReaper republished extraction %s (%s)", extraction_id, reason)
+
+
+__all__ = ["BboxReaper"]
diff --git a/src/flydocs/core/services/workers/bbox_refine_worker.py b/src/flydocs/core/services/workers/bbox_refine_worker.py
index ea8f42f..a0151a2 100644
--- a/src/flydocs/core/services/workers/bbox_refine_worker.py
+++ b/src/flydocs/core/services/workers/bbox_refine_worker.py
@@ -1,33 +1,36 @@
# Copyright 2026 Firefly Software Solutions Inc
-"""``BboxRefineWorker`` -- second-stage EDA worker for grounded bbox refinement.
+"""``BboxRefineWorker`` -- post-processing EDA worker for grounded bbox refinement.
-Subscribes to ``IDPSettings.bbox_refine_event_type`` on
-``IDPSettings.bbox_refine_topic``. Each event carries one ``job_id``
-whose main extraction has already finished with
-``JobStatus.PARTIAL_SUCCEEDED`` and whose ``options.stages.bbox_refine``
-was ``true``.
+Subscribes to ``extraction.post_processing.requested`` events.
+Each event carries one extraction whose main pipeline has already
+finished ``succeeded`` AND whose ``options.stages.bbox_refine`` was
+``true``. On first delivery the bbox-leg sub-status on the row is
+``pending`` -- :meth:`ExtractionRepository.mark_succeeded` set it
+atomically with the main success transition.
Per-event lifecycle:
-1. Load the job row.
-2. Skip if the job is already past ``REFINING_BBOXES`` (idempotent
- re-delivery from at-least-once buses is normal).
-3. Transition ``PARTIAL_SUCCEEDED -> REFINING_BBOXES`` and bump the
- refine attempts counter atomically.
+1. Load the extraction row.
+2. Skip unless the bbox leg is ``pending`` or a stale ``running``
+ (idempotent re-delivery from at-least-once buses is
+ normal).
+3. Transition ``pending`` -> ``running`` on
+ ``post_processing_bbox_status`` (atomic claim with a lease).
4. Re-run :class:`BinaryNormalizer` on the saved input bytes to recover
the per-file LLM-renderable rows. (Deterministic; cheaper than
- persisting the normalised bytes alongside the job.)
-5. For each :class:`ExtractedDocument` in the persisted result, find
- the matching normalised binary by ``source_file`` and call
+ persisting the normalised bytes alongside the row.)
+5. For each :class:`Document` in the persisted result, find the
+ matching normalised binary by ``source_file`` and call
:class:`BboxRefiner.refine` against that document's field groups.
-6. Re-serialise the mutated result, transition the job to
- :class:`JobStatus.SUCCEEDED`, and fire the final webhook.
+6. Re-serialise the mutated result and transition the bbox leg to
+ ``succeeded`` (the main extraction status was already ``succeeded``).
+ Fire the post-processing-completed webhook.
-Failures degrade gracefully: the partial result is **never** dropped.
+Failures degrade gracefully: the result is **never** dropped.
Retryable errors (timeouts, transient OCR engine failures) re-publish
the same event with exponential backoff up to
``IDPSettings.bbox_refine_max_attempts``; permanent errors mark the
-refine leg ``failed`` and the job reverts to ``PARTIAL_SUCCEEDED`` with
+bbox leg ``failed`` and the main extraction stays ``succeeded`` with
its LLM-bbox result intact.
"""
@@ -42,27 +45,30 @@
from datetime import UTC, datetime
from typing import Any
-from pyfly.eda import EventEnvelope, EventPublisher
+from pyfly.eda import EventEnvelope as EdaEnvelope
+from pyfly.eda import EventPublisher
from flydocs.config import IDPSettings
from flydocs.core.observability import log_outbound
from flydocs.core.services.bbox import BboxRefiner
from flydocs.core.services.binary import BinaryNormalizer, NormalisedBinary
+from flydocs.core.services.extractions._projector import row_to_extraction
from flydocs.core.services.webhook import WebhookPublisher
from flydocs.interfaces.dtos.event import (
- IDPBboxRefineRequestedEvent,
+ EVENT_TYPE_EXTRACTION_POST_PROCESSING_COMPLETED,
+ EVENT_TYPE_EXTRACTION_POST_PROCESSING_REQUESTED,
+ EventEnvelope,
envelope_for_publish,
)
from flydocs.interfaces.dtos.extract import ExtractionResult
-from flydocs.interfaces.dtos.webhook import JobWebhookPayload
-from flydocs.interfaces.enums.job_status import JobStatus
-from flydocs.models.repositories import ExtractionJobRepository
+from flydocs.interfaces.dtos.extraction import Extraction
+from flydocs.models.repositories import ExtractionRepository
logger = logging.getLogger(__name__)
-# Same permanent-error hints the JobWorker uses; the refiner can hit
-# the same provider-side failure classes via OCR adapters.
+# Same permanent-error hints the ExtractionWorker uses; the refiner can
+# hit the same provider-side failure classes via OCR adapters.
_PERMANENT_ERROR_HINTS: tuple[str, ...] = (
"content policy",
"content_filter",
@@ -84,12 +90,12 @@ def _is_permanent(exc: Exception) -> bool:
class BboxRefineWorker:
- """Second-stage EDA consumer: ground bboxes after main extraction."""
+ """Post-processing EDA consumer: ground bboxes after main extraction."""
def __init__(
self,
*,
- repository: ExtractionJobRepository,
+ repository: ExtractionRepository,
event_publisher: EventPublisher,
webhook: WebhookPublisher,
normalizer: BinaryNormalizer,
@@ -109,14 +115,14 @@ def __init__(
async def run_forever(self) -> None:
# Subscribe before start() -- the EDA adapters only spin up the
# consumer loop when at least one handler is registered.
- self._publisher.subscribe(self._settings.bbox_refine_event_type, self._on_event)
+ self._publisher.subscribe(EVENT_TYPE_EXTRACTION_POST_PROCESSING_REQUESTED, self._on_event)
await self._publisher.start()
logger.info(
"BboxRefineWorker %s started (adapter=%s, destination=%s, event_type=%s)",
self._consumer_id,
self._settings.eda_adapter,
self._settings.bbox_refine_topic,
- self._settings.bbox_refine_event_type,
+ EVENT_TYPE_EXTRACTION_POST_PROCESSING_REQUESTED,
)
try:
await self._stop.wait()
@@ -128,63 +134,62 @@ def stop(self) -> None:
# ------------------------------------------------------------------
- async def _on_event(self, envelope: EventEnvelope) -> None:
- job_id = envelope.payload.get("job_id") if isinstance(envelope.payload, dict) else None
- if not job_id:
+ async def _on_event(self, envelope: EdaEnvelope) -> None:
+ extraction_id = _extraction_id_from_payload(envelope.payload)
+ if not extraction_id:
logger.warning(
- "Received %s event without job_id: %r -- dropping",
+ "Received %s event without extraction id: %r -- dropping",
envelope.event_type,
envelope.payload,
)
return
- await self._process(str(job_id))
+ await self._process(extraction_id)
- async def _process(self, job_id: str) -> None:
- job = await self._repository.get(job_id)
- if job is None:
- logger.warning("EDA delivered unknown bbox-refine job %s -- dropping", job_id)
+ async def _process(self, extraction_id: str) -> None:
+ row = await self._repository.get(extraction_id)
+ if row is None:
+ logger.warning("EDA delivered unknown extraction %s -- dropping", extraction_id)
return
- # Atomic claim: precondition matches PARTIAL_SUCCEEDED (first
- # delivery) or stale REFINING_BBOXES (previous claimant crashed
- # past its lease). Anything else -- SUCCEEDED, FAILED,
- # fresh REFINING_BBOXES -- returns None so we treat the event
- # as already handled and bail.
- claimed = await self._repository.mark_bbox_refining(
- job.id, lease_seconds=self._settings.bbox_refine_lease_s
+ # Atomic claim: precondition matches pending (first delivery) or
+ # stale running (previous claimant crashed past its lease).
+ # Anything else (succeeded / failed bbox leg, fresh running)
+ # returns None so we treat the event as already handled and bail.
+ claimed = await self._repository.claim_bbox_refinement(
+ row.id, lease_seconds=self._settings.bbox_refine_lease_s
)
if claimed is None:
logger.info(
- "Bbox refine for job %s could not be claimed (status=%s) -- "
- "another worker owns it or the job advanced past PARTIAL_SUCCEEDED",
- job.id,
- job.status,
+ "Bbox refine for extraction %s could not be claimed (bbox_status=%s) -- "
+ "another worker owns it or the leg already finished",
+ row.id,
+ row.post_processing_bbox_status,
)
return
- job = claimed
- attempts = job.bbox_refine_attempts or 1
+ row = claimed
+ attempts = row.post_processing_bbox_attempts or 1
log_outbound(
"bbox-worker",
op="bbox.refine",
status="started",
latency_ms=0.0,
- job_id=job.id,
+ extraction_id=row.id,
attempt=attempts,
)
started = time.monotonic()
try:
refined = await asyncio.wait_for(
- self._refine_job_result(job),
+ self._refine_extraction_result(row),
timeout=self._settings.bbox_refine_timeout_s,
)
- finalised = await self._repository.mark_bbox_refined(
- job.id, result=refined.model_dump(mode="json", by_alias=True)
+ finalised = await self._repository.complete_bbox_refinement(
+ row.id, result=refined.model_dump(mode="json", by_alias=True)
)
if finalised is None:
logger.info(
- "Bbox refine for job %s no longer in REFINING_BBOXES -- "
+ "Bbox refine for extraction %s no longer in running -- "
"another worker finalised it, discarding our result",
- job.id,
+ row.id,
)
return
log_outbound(
@@ -192,99 +197,94 @@ async def _process(self, job_id: str) -> None:
op="bbox.refine",
status="ok",
latency_ms=(time.monotonic() - started) * 1000,
- job_id=job.id,
+ extraction_id=row.id,
attempt=attempts,
)
await self._fire_webhook(
- job_id=job.id,
- status=JobStatus.SUCCEEDED,
+ extraction=row_to_extraction(finalised),
result=refined,
- metadata=job.metadata_json or {},
- callback_url=job.callback_url,
- correlation=_extract_correlation(job.metadata_json),
- started_at=getattr(job, "bbox_refine_started_at", None),
- finished_at=datetime.now(UTC),
- attempts=attempts,
+ metadata=row.metadata_json or {},
+ callback_url=row.callback_url,
+ correlation=_extract_correlation(row.metadata_json),
)
except Exception as exc: # noqa: BLE001
permanent = _is_permanent(exc)
exhausted = attempts >= self._settings.bbox_refine_max_attempts
terminal = permanent or exhausted
- error_code = "PERMANENT_ERROR" if permanent else "BBOX_REFINE_FAILED"
+ error_code = "permanent_error" if permanent else "bbox_refine_failed"
log_outbound(
"bbox-worker",
op="bbox.refine",
status="error",
latency_ms=(time.monotonic() - started) * 1000,
- job_id=job.id,
+ extraction_id=row.id,
attempt=attempts,
permanent=permanent,
exhausted=exhausted,
error=type(exc).__name__,
)
if terminal:
- failed = await self._repository.mark_bbox_refine_failed(
- job.id, code=error_code, message=str(exc)
+ failed = await self._repository.fail_bbox_refinement(
+ row.id, code=error_code, message=str(exc)
)
if failed is None:
logger.info(
- "Bbox refine for job %s already past REFINING_BBOXES -- "
+ "Bbox refine for extraction %s already past running -- "
"another worker handled the terminal transition",
- job.id,
+ row.id,
)
# No webhook on bbox-refine permanent failure: the caller
- # already received the ``idp.job.partial`` payload with
- # the LLM-bbox result; nothing new to deliver.
+ # already received the ``extraction.completed`` payload
+ # with the LLM-bbox result; nothing new to deliver.
else:
delay = self._backoff_delay(attempts)
logger.warning(
- "Bbox refine for job %s failed attempt %d (%s); re-publishing in %.1fs",
- job.id,
+ "Bbox refine for extraction %s failed attempt %d (%s); re-publishing in %.1fs",
+ row.id,
attempts,
exc,
delay,
)
- # Atomically revert REFINING_BBOXES -> PARTIAL_SUCCEEDED
- # so the next delivery's claim precondition passes. If
- # we lost the row (another worker advanced it), skip the
- # republish: someone else owns the lifecycle now.
- requeued = await self._repository.requeue_bbox_refine(job.id)
+ # Atomically revert running -> pending so the next
+ # delivery's claim precondition passes. If we lost the
+ # row (another worker advanced it), skip the republish.
+ requeued = await self._repository.requeue_bbox_refinement(row.id)
if requeued is None:
logger.info(
- "Bbox refine for job %s not requeueable -- skipping retry",
- job.id,
+ "Bbox refine for extraction %s not requeueable -- skipping retry",
+ row.id,
)
else:
- asyncio.create_task(self._delayed_publish(job.id, delay))
+ asyncio.create_task(self._delayed_publish(row.id, delay))
# ------------------------------------------------------------------
- async def _refine_job_result(self, job: Any) -> ExtractionResult:
- """Reconstruct the per-document bytes + run the refiner per doc."""
- if not job.result_json:
- raise ValueError(f"job {job.id} has no result_json to refine")
- result = ExtractionResult.model_validate(job.result_json)
+ async def _refine_extraction_result(self, row: Any) -> ExtractionResult:
+ """Reconstruct the per-document bytes + run the refiner per document."""
+ if not row.result_json:
+ raise ValueError(f"extraction {row.id} has no result_json to refine")
+ result = ExtractionResult.model_validate(row.result_json)
- schema = job.schema_json or {}
- # ``schema_json.documents`` carries every input file the submit
+ schema = row.schema_json or {}
+ # ``schema_json.files`` carries every input file the submit
# handler stored: a list of ``{filename, content_base64,
- # content_type, document_type}``. We normalise each one
+ # content_type, expected_type}``. We normalise each one
# independently so the refiner has one :class:`NormalisedBinary`
# row per ``source_file`` to look up.
- documents_payload = schema.get("documents") or []
- if not documents_payload:
- raise ValueError(f"job {job.id} schema_json missing 'documents'")
+ files_payload = schema.get("files") or []
+ if not files_payload:
+ raise ValueError(f"extraction {row.id} schema_json missing 'files'")
sources: list[tuple[bytes, str | None, str]] = [
(
base64.b64decode(entry.get("content_base64") or ""),
entry.get("content_type"),
- entry.get("filename") or job.filename,
+ entry.get("filename") or row.filename,
)
- for entry in documents_payload
+ for entry in files_payload
if entry.get("content_base64")
]
if not sources:
- raise ValueError(f"job {job.id} has no decodable document bytes in schema_json")
+ raise ValueError(f"extraction {row.id} has no decodable file bytes in schema_json")
normalised: list[NormalisedBinary] = []
for raw_bytes, media_type, name in sources:
@@ -299,19 +299,19 @@ async def _refine_job_result(self, job: Any) -> ExtractionResult:
# carry their normalised filename in ``row.filename``.
by_filename: dict[str, NormalisedBinary] = {row.filename: row for row in normalised}
- language_hint = (job.options_json or {}).get("language_hint")
+ language_hint = (row.options_json or {}).get("language_hint")
for document in result.documents:
- if not document.fields:
+ if not document.field_groups:
continue
- row = by_filename.get(document.source_file or "")
- if row is None:
+ mapped = by_filename.get(document.source_file or "")
+ if mapped is None:
continue
await self._refiner.refine(
- document_bytes=row.bytes,
- media_type=row.media_type,
- page_count=row.page_count,
- groups=document.fields,
+ document_bytes=mapped.bytes,
+ media_type=mapped.media_type,
+ page_count=mapped.page_count,
+ groups=document.field_groups,
language_hint=language_hint,
)
return result
@@ -324,65 +324,72 @@ def _backoff_delay(self, attempts: int) -> float:
jitter = capped * 0.2 * random.random()
return capped + jitter
- async def _delayed_publish(self, job_id: str, delay_s: float) -> None:
+ async def _delayed_publish(self, extraction_id: str, delay_s: float) -> None:
try:
await asyncio.sleep(delay_s)
- republish = IDPBboxRefineRequestedEvent(job_id=job_id, attempt=2)
+ row = await self._repository.get(extraction_id)
+ if row is None:
+ logger.warning("Delayed republish: extraction %s vanished", extraction_id)
+ return
+ envelope = EventEnvelope(
+ event_type=EVENT_TYPE_EXTRACTION_POST_PROCESSING_REQUESTED,
+ extraction=row_to_extraction(row),
+ )
await self._publisher.publish(
destination=self._settings.bbox_refine_topic,
- event_type=self._settings.bbox_refine_event_type,
- payload=envelope_for_publish(republish),
+ event_type=EVENT_TYPE_EXTRACTION_POST_PROCESSING_REQUESTED,
+ payload=envelope_for_publish(envelope),
)
log_outbound(
"eda",
- op="republish.bbox_refine",
+ op="republish.post_processing",
status="ok",
latency_ms=delay_s * 1000,
- job_id=job_id,
+ extraction_id=extraction_id,
)
except Exception as exc: # noqa: BLE001
- logger.error("Failed to re-publish bbox refine job %s after backoff: %s", job_id, exc)
+ logger.error(
+ "Failed to re-publish bbox refine extraction %s after backoff: %s",
+ extraction_id,
+ exc,
+ )
async def _fire_webhook(
self,
*,
- job_id: str,
- status: JobStatus,
+ extraction: Extraction,
result: ExtractionResult | None,
metadata: dict[str, Any],
callback_url: str | None,
correlation: dict[str, str] | None = None,
- started_at: datetime | None = None,
- finished_at: datetime | None = None,
- attempts: int = 1,
- error_code: str | None = None,
- error_message: str | None = None,
) -> None:
if not callback_url:
return
clean_metadata = {k: v for k, v in (metadata or {}).items() if not k.startswith("_")}
corr = correlation or {}
- payload = JobWebhookPayload(
- # Bbox-refine's terminal webhook still carries the
- # IDPJobCompleted event_type — from the consumer's POV
- # the job has reached its FINAL terminal state at this
- # point. The IDPBboxRefineCompleted EDA event is a
- # separate internal signal not surfaced to webhook clients.
- event_type="IDPJobCompleted",
- job_id=job_id,
- status=status,
+ envelope = EventEnvelope(
+ event_type=EVENT_TYPE_EXTRACTION_POST_PROCESSING_COMPLETED,
occurred_at=datetime.now(UTC),
- started_at=started_at,
- finished_at=finished_at,
- attempts=attempts,
correlation_id=corr.get("X-Correlation-Id"),
tenant_id=corr.get("X-Tenant-Id"),
- metadata=clean_metadata,
+ extraction=extraction,
result=result,
- error_code=error_code,
- error_message=error_message,
+ metadata=clean_metadata,
)
- await self._webhook.deliver(callback_url, payload, extra_headers=corr)
+ await self._webhook.deliver(callback_url, envelope, extra_headers=corr)
+
+
+def _extraction_id_from_payload(payload: Any) -> str | None:
+ if not isinstance(payload, dict):
+ return None
+ extraction = payload.get("extraction")
+ if isinstance(extraction, dict) and extraction.get("id"):
+ return str(extraction["id"])
+ for key in ("extraction_id", "job_id"):
+ value = payload.get(key)
+ if value:
+ return str(value)
+ return None
def _extract_correlation(metadata: dict[str, Any] | None) -> dict[str, str]:
@@ -392,3 +399,6 @@ def _extract_correlation(metadata: dict[str, Any] | None) -> dict[str, str]:
if not isinstance(raw, dict):
return {}
return {str(k): str(v) for k, v in raw.items() if v}
+
+
+__all__ = ["BboxRefineWorker"]
diff --git a/src/flydocs/core/services/workers/job_reaper.py b/src/flydocs/core/services/workers/job_reaper.py
index 941db3e..cdd5f01 100644
--- a/src/flydocs/core/services/workers/job_reaper.py
+++ b/src/flydocs/core/services/workers/job_reaper.py
@@ -1,26 +1,27 @@
# Copyright 2026 Firefly Software Solutions Inc
-"""``JobReaper`` -- periodic sweep to revive orphaned async jobs.
+"""``ExtractionReaper`` -- periodic sweep to revive orphaned async extractions.
-The job pipeline is at-least-once: every triggering event flows from
-the API/worker → EDA outbox → drain → worker handler. When part of that
-chain crashes we lose the event:
+The extraction pipeline is at-least-once: every triggering event flows
+from the API/worker → EDA outbox → drain → worker handler. When part
+of that chain crashes we lose the event:
* Submit handler crashed between the row INSERT and the outbox INSERT.
-* Worker crashed mid-extraction, leaving the row in ``RUNNING``.
+* Worker crashed mid-extraction, leaving the row in ``running``.
* Worker's failure-path ``_delayed_publish`` task was killed before its
- delay completed, leaving the row in ``QUEUED`` after a ``requeue_for_retry``.
+ delay completed, leaving the row in ``queued`` after a
+ ``requeue_for_retry``.
In any of those cases the row sits stuck because the bus has nothing to
deliver. The reaper closes the gap: it periodically queries for rows
whose state has been "frozen" longer than the lease / threshold,
-republishes a fresh ``IDPJobSubmitted`` event, and lets the worker's
-atomic ``mark_running`` claim decide the winner. Duplicate publishes
-across replicas are deduped at claim time, so running this in every
-worker container is safe.
+republishes a fresh ``extraction.submitted`` event, and lets the
+worker's atomic ``mark_running`` claim decide the winner. Duplicate
+publishes across replicas are deduped at claim time, so running this
+in every worker container is safe.
Recovery time is bounded by ``settings.reaper_sweep_interval_s`` +
-``settings.job_run_lease_s`` (for RUNNING orphans) or
-``settings.queued_orphan_threshold_s`` (for QUEUED orphans).
+``settings.job_run_lease_s`` (for ``running`` orphans) or
+``settings.queued_orphan_threshold_s`` (for ``queued`` orphans).
"""
from __future__ import annotations
@@ -34,19 +35,24 @@
from flydocs.config import IDPSettings
from flydocs.core.observability import log_outbound
-from flydocs.interfaces.dtos.event import IDPJobSubmittedEvent, envelope_for_publish
-from flydocs.models.repositories import ExtractionJobRepository
+from flydocs.core.services.extractions._projector import row_to_extraction
+from flydocs.interfaces.dtos.event import (
+ EVENT_TYPE_EXTRACTION_SUBMITTED,
+ EventEnvelope,
+ envelope_for_publish,
+)
+from flydocs.models.repositories import ExtractionRepository
logger = logging.getLogger(__name__)
-class JobReaper:
- """Periodic sweep for orphaned QUEUED / RUNNING jobs."""
+class ExtractionReaper:
+ """Periodic sweep for orphaned queued / running extractions."""
def __init__(
self,
*,
- repository: ExtractionJobRepository,
+ repository: ExtractionRepository,
event_publisher: EventPublisher,
settings: IDPSettings,
consumer_id: str | None = None,
@@ -59,7 +65,7 @@ def __init__(
async def run_forever(self) -> None:
logger.info(
- "JobReaper %s started (interval=%ds, run_lease=%ds, queued_threshold=%ds)",
+ "ExtractionReaper %s started (interval=%ds, run_lease=%ds, queued_threshold=%ds)",
self._consumer_id,
self._settings.reaper_sweep_interval_s,
self._settings.job_run_lease_s,
@@ -69,7 +75,7 @@ async def run_forever(self) -> None:
try:
await self._sweep()
except Exception: # noqa: BLE001
- logger.exception("JobReaper sweep failed; will retry next interval")
+ logger.exception("ExtractionReaper sweep failed; will retry next interval")
with contextlib.suppress(TimeoutError):
await asyncio.wait_for(
self._stop.wait(),
@@ -80,35 +86,48 @@ def stop(self) -> None:
self._stop.set()
async def _sweep(self) -> None:
- """One pass: re-publish for every job stuck past its threshold."""
+ """One pass: re-publish for every extraction stuck past its threshold."""
# Order matters very little -- duplicate publishes for the same
- # job are deduped at claim time. We still run them in two
+ # row are deduped at claim time. We still run them in two
# distinct queries so a partial failure (one query OK, the other
# raising) doesn't lose the half that succeeded.
stale_running = await self._repository.find_stale_running(
lease_seconds=self._settings.job_run_lease_s
)
- for job_id in stale_running:
- await self._republish(job_id, reason="stale_running")
+ for extraction_id in stale_running:
+ await self._republish(extraction_id, reason="stale_running")
stale_queued = await self._repository.find_stale_queued(
older_than_seconds=self._settings.queued_orphan_threshold_s
)
- for job_id in stale_queued:
- await self._republish(job_id, reason="orphan_queued")
-
- async def _republish(self, job_id: str, *, reason: str) -> None:
- event = IDPJobSubmittedEvent(job_id=job_id, attempt=1)
+ for extraction_id in stale_queued:
+ await self._republish(extraction_id, reason="orphan_queued")
+
+ async def _republish(self, extraction_id: str, *, reason: str) -> None:
+ row = await self._repository.get(extraction_id)
+ if row is None:
+ return
+ envelope = EventEnvelope(
+ event_type=EVENT_TYPE_EXTRACTION_SUBMITTED,
+ extraction=row_to_extraction(row),
+ )
await self._publisher.publish(
destination=self._settings.jobs_topic,
- event_type=self._settings.jobs_event_type,
- payload=envelope_for_publish(event),
+ event_type=EVENT_TYPE_EXTRACTION_SUBMITTED,
+ payload=envelope_for_publish(envelope),
)
log_outbound(
"reaper",
- op="republish.job",
+ op="republish.extraction",
status="ok",
latency_ms=0.0,
- job_id=job_id,
+ extraction_id=extraction_id,
reason=reason,
)
- logger.info("JobReaper republished job %s (%s)", job_id, reason)
+ logger.info("ExtractionReaper republished extraction %s (%s)", extraction_id, reason)
+
+
+# Backwards-compat alias for callers (CLI) that still import the old name.
+JobReaper = ExtractionReaper
+
+
+__all__ = ["ExtractionReaper", "JobReaper"]
diff --git a/src/flydocs/core/services/workers/job_worker.py b/src/flydocs/core/services/workers/job_worker.py
index 23c89f9..a782ae9 100644
--- a/src/flydocs/core/services/workers/job_worker.py
+++ b/src/flydocs/core/services/workers/job_worker.py
@@ -1,5 +1,5 @@
# Copyright 2026 Firefly Software Solutions Inc
-"""``JobWorker`` -- subscribes to the EDA bus and dispatches into the pipeline.
+"""``ExtractionWorker`` -- subscribes to the EDA bus and dispatches into the pipeline.
The worker registers an :func:`event_listener` handler on the configured
``jobs_event_type`` against the pyfly :class:`EventPublisher` bean (the
@@ -13,14 +13,22 @@
A failed attempt is classified into one of two buckets:
* ``permanent`` -- a malformed payload, an unrecoverable provider error
- (content policy, unsupported model). The job goes straight to
- ``FAILED`` so the caller can fix the input.
+ (content policy, unsupported model). The extraction goes straight to
+ ``failed`` so the caller can fix the input.
* ``retryable`` -- a timeout, a 5xx from the LLM provider, a transient
- network glitch. The worker re-publishes the same ``IDPJobSubmitted``
- event on the same bus after a capped-exponential backoff with jitter,
- so the next worker (or this one, after re-delivery) picks it up. The
+ network glitch. The worker re-publishes the same submitted event on
+ the same bus after a capped-exponential backoff with jitter, so the
+ next worker (or this one, after re-delivery) picks it up. The
``attempts`` counter is persisted in Postgres so the budget survives
worker restarts.
+
+When the caller asked for bbox refinement, the worker calls
+``repository.mark_succeeded(..., request_bbox_refinement=True)`` to
+flip the post-processing bbox status to ``pending`` atomically with
+the main success transition, then publishes a separate
+``extraction.post_processing.requested`` event that the dedicated
+:class:`BboxRefineWorker` consumes out of band. The main extraction is
+``succeeded`` immediately -- the bbox leg is purely additive.
"""
from __future__ import annotations
@@ -33,28 +41,32 @@
from datetime import UTC, datetime
from typing import Any
-from pyfly.eda import EventEnvelope, EventPublisher
+from pyfly.eda import EventEnvelope as EdaEnvelope
+from pyfly.eda import EventPublisher
from flydocs.config import IDPSettings
from flydocs.core.observability import log_outbound
+from flydocs.core.services.extractions._projector import row_to_extraction
from flydocs.core.services.pipeline import PipelineOrchestrator
from flydocs.core.services.webhook import WebhookPublisher
-from flydocs.interfaces.dtos.doc import DocSpec
+from flydocs.interfaces.dtos.document_type import DocumentTypeSpec
from flydocs.interfaces.dtos.event import (
- IDPBboxRefineRequestedEvent,
- IDPJobSubmittedEvent,
+ EVENT_TYPE_EXTRACTION_COMPLETED,
+ EVENT_TYPE_EXTRACTION_POST_PROCESSING_REQUESTED,
+ EVENT_TYPE_EXTRACTION_SUBMITTED,
+ EventEnvelope,
envelope_for_publish,
)
from flydocs.interfaces.dtos.extract import (
- DocumentInput,
ExtractionOptions,
ExtractionRequest,
ExtractionResult,
+ FileInput,
)
+from flydocs.interfaces.dtos.extraction import Extraction
from flydocs.interfaces.dtos.rule import RuleSpec
-from flydocs.interfaces.dtos.webhook import JobWebhookPayload
-from flydocs.interfaces.enums.job_status import JobStatus
-from flydocs.models.repositories import ExtractionJobRepository
+from flydocs.interfaces.enums.extraction_status import ExtractionStatus
+from flydocs.models.repositories import ExtractionRepository
logger = logging.getLogger(__name__)
@@ -85,12 +97,12 @@ def _is_permanent(exc: Exception) -> bool:
return any(hint in message for hint in _PERMANENT_ERROR_HINTS)
-class JobWorker:
+class ExtractionWorker:
def __init__(
self,
*,
orchestrator: PipelineOrchestrator,
- repository: ExtractionJobRepository,
+ repository: ExtractionRepository,
event_publisher: EventPublisher,
webhook: WebhookPublisher,
settings: IDPSettings,
@@ -108,14 +120,14 @@ async def run_forever(self) -> None:
# Subscribe BEFORE starting the bus -- the EDA adapters spin up
# consumer loops at ``start()`` time, and they only do so when at
# least one handler is registered.
- self._publisher.subscribe(self._settings.jobs_event_type, self._on_event)
+ self._publisher.subscribe(EVENT_TYPE_EXTRACTION_SUBMITTED, self._on_event)
await self._publisher.start()
logger.info(
- "JobWorker %s started (adapter=%s, destination=%s, event_type=%s)",
+ "ExtractionWorker %s started (adapter=%s, destination=%s, event_type=%s)",
self._consumer_id,
self._settings.eda_adapter,
self._settings.jobs_topic,
- self._settings.jobs_event_type,
+ EVENT_TYPE_EXTRACTION_SUBMITTED,
)
try:
await self._stop.wait()
@@ -127,60 +139,68 @@ def stop(self) -> None:
# ------------------------------------------------------------------
- async def _on_event(self, envelope: EventEnvelope) -> None:
- job_id = envelope.payload.get("job_id") if isinstance(envelope.payload, dict) else None
- if not job_id:
+ async def _on_event(self, envelope: EdaEnvelope) -> None:
+ extraction_id = _extraction_id_from_payload(envelope.payload)
+ if not extraction_id:
logger.warning(
- "Received %s event without job_id: %r -- dropping",
+ "Received %s event without extraction id: %r -- dropping",
envelope.event_type,
envelope.payload,
)
return
- await self._process(str(job_id))
+ await self._process(extraction_id)
- async def _process(self, job_id: str) -> None:
- job = await self._repository.get(job_id)
- if job is None:
- logger.warning("EDA delivered unknown job %s -- dropping", job_id)
+ async def _process(self, extraction_id: str) -> None:
+ row = await self._repository.get(extraction_id)
+ if row is None:
+ logger.warning("EDA delivered unknown extraction %s -- dropping", extraction_id)
return
- if JobStatus(job.status) in (JobStatus.SUCCEEDED, JobStatus.CANCELLED, JobStatus.FAILED):
- logger.info("Job %s already in terminal status %s -- skipping", job.id, job.status)
+ if ExtractionStatus(row.status) in (
+ ExtractionStatus.SUCCEEDED,
+ ExtractionStatus.CANCELLED,
+ ExtractionStatus.FAILED,
+ ):
+ logger.info(
+ "Extraction %s already in terminal status %s -- skipping",
+ row.id,
+ row.status,
+ )
return
# Atomic compare-and-swap: only one worker can claim a QUEUED
- # (or stale-RUNNING) job. ``None`` means another worker beat us
- # to it or the job was cancelled between our ``get`` and this
- # claim -- both are silent no-ops.
- claimed = await self._repository.mark_running(job.id, lease_seconds=self._settings.job_run_lease_s)
+ # (or stale-RUNNING) extraction. ``None`` means another worker
+ # beat us to it or the row was cancelled between our ``get``
+ # and this claim -- both are silent no-ops.
+ claimed = await self._repository.mark_running(row.id, lease_seconds=self._settings.job_run_lease_s)
if claimed is None:
logger.info(
- "Job %s could not be claimed -- already owned by another worker or "
+ "Extraction %s could not be claimed -- already owned by another worker or "
"no longer in a claimable state. Skipping at-least-once redelivery.",
- job.id,
+ row.id,
)
return
- job = claimed
- attempts = job.attempts or 1
+ row = claimed
+ attempts = row.attempts or 1
log_outbound(
"worker",
- op="job.run",
+ op="extraction.run",
status="started",
latency_ms=0.0,
- job_id=job.id,
+ extraction_id=row.id,
attempt=attempts,
)
- request = self._build_request(job)
+ request = self._build_request(row)
# Capture the original intent BEFORE we mutate the request: we
# need to know whether the caller wanted bbox refinement so we
- # can publish the IDPBboxRefineRequested event afterwards, even
- # if we skip the inline node below.
+ # can publish the post-processing event afterwards, even if we
+ # skip the inline node below.
wants_bbox_refine = bool(getattr(request.options.stages, "bbox_refine", False))
if wants_bbox_refine:
# Architectural decision: on the async path, skip the inline
# bbox_refine node entirely. The dedicated BboxRefineWorker
- # picks up the IDPBboxRefineRequested event we publish below
- # and grounds bboxes there. Running both wastes minutes of
- # CPU + LLM tokens on duplicate work — and when the inline
+ # picks up the post-processing event we publish below and
+ # grounds bboxes there. Running both wastes minutes of CPU
+ # and LLM tokens on duplicate work — and when the inline
# step times out (which it does on multi-PDF bundles) the
# pipeline framework marks the node as failed, which is
# misleading because the out-of-band path recovers
@@ -194,81 +214,84 @@ async def _process(self, job_id: str) -> None:
started = time.monotonic()
try:
result = await asyncio.wait_for(
- self._orchestrator.execute(request), timeout=self._settings.async_timeout_s
+ self._orchestrator.execute(request, extraction_id=row.id),
+ timeout=self._settings.async_timeout_s,
)
result_payload = result.model_dump(mode="json", by_alias=True)
# Branch on bbox_refine: when the caller asked for grounded
- # coordinates, the job becomes ``PARTIAL_SUCCEEDED`` here and
- # the actual grounding is delegated to ``BboxRefineWorker`` via
- # a second EDA event. The result is already readable -- only
- # the bboxes change between PARTIAL_SUCCEEDED and SUCCEEDED.
- terminal_status = JobStatus.PARTIAL_SUCCEEDED if wants_bbox_refine else JobStatus.SUCCEEDED
- if wants_bbox_refine:
- finalised = await self._repository.mark_partial_succeeded(job.id, result=result_payload)
- else:
- finalised = await self._repository.mark_succeeded(job.id, result=result_payload)
+ # coordinates, the main pipeline is still ``succeeded`` here
+ # -- bbox refinement is additive post-processing -- but the
+ # bbox leg flips to ``pending`` atomically so the bbox
+ # worker can pick it up.
+ finalised = await self._repository.mark_succeeded(
+ row.id,
+ result=result_payload,
+ request_bbox_refinement=wants_bbox_refine,
+ )
if finalised is None:
# Another worker (or the bbox leg) already advanced the
# row past RUNNING. Our work is duplicate -- don't fire
# the webhook a second time, don't republish.
logger.info(
- "Job %s already finalised by another worker -- discarding our duplicate result",
- job.id,
+ "Extraction %s already finalised by another worker -- discarding our duplicate result",
+ row.id,
)
return
log_outbound(
"worker",
- op="job.run",
+ op="extraction.run",
status="ok",
latency_ms=(time.monotonic() - started) * 1000,
- job_id=job.id,
+ extraction_id=row.id,
attempt=attempts,
- terminal=terminal_status.value,
+ terminal=ExtractionStatus.SUCCEEDED.value,
)
- correlation_headers = _extract_correlation(job.metadata_json)
+ correlation_headers = _extract_correlation(row.metadata_json)
+ extraction_dto = row_to_extraction(finalised)
await self._fire_webhook(
- job_id=job.id,
- status=terminal_status,
+ event_type=EVENT_TYPE_EXTRACTION_COMPLETED,
+ extraction=extraction_dto,
result=result,
- metadata=job.metadata_json or {},
- callback_url=job.callback_url,
+ metadata=row.metadata_json or {},
+ callback_url=row.callback_url,
correlation=correlation_headers,
- started_at=getattr(job, "started_at", None),
- finished_at=datetime.now(UTC),
- attempts=attempts,
)
if wants_bbox_refine:
- refine_event = IDPBboxRefineRequestedEvent(
- job_id=job.id,
- attempt=1,
+ # Publish the post-processing event using the SAME
+ # EventEnvelope shape that the SDK / webhook consumers
+ # see -- the EDA bus and the webhook delivery now agree
+ # on a single model.
+ refine_envelope = EventEnvelope(
+ event_type=EVENT_TYPE_EXTRACTION_POST_PROCESSING_REQUESTED,
correlation_id=correlation_headers.get("X-Correlation-Id"),
tenant_id=correlation_headers.get("X-Tenant-Id"),
+ extraction=extraction_dto,
)
await self._publisher.publish(
destination=self._settings.bbox_refine_topic,
- event_type=self._settings.bbox_refine_event_type,
- payload=envelope_for_publish(refine_event),
+ event_type=EVENT_TYPE_EXTRACTION_POST_PROCESSING_REQUESTED,
+ payload=envelope_for_publish(refine_envelope),
headers=correlation_headers,
)
log_outbound(
"eda",
- op="publish.bbox_refine",
+ op="publish.post_processing",
status="ok",
latency_ms=0.0,
- job_id=job.id,
+ extraction_id=row.id,
destination=self._settings.bbox_refine_topic,
)
except Exception as exc: # noqa: BLE001
permanent = _is_permanent(exc)
exhausted = attempts >= self._settings.job_max_attempts
terminal = permanent or exhausted
- error_code = "PERMANENT_ERROR" if permanent else "EXTRACTION_FAILED"
+ error_code = "permanent_error" if permanent else "extraction_failed"
log_outbound(
"worker",
- op="job.run",
+ op="extraction.run",
status="error",
latency_ms=(time.monotonic() - started) * 1000,
- job_id=job.id,
+ extraction_id=row.id,
attempt=attempts,
permanent=permanent,
exhausted=exhausted,
@@ -276,32 +299,28 @@ async def _process(self, job_id: str) -> None:
)
if terminal:
- failed = await self._repository.mark_failed(job.id, code=error_code, message=str(exc))
+ failed = await self._repository.mark_failed(row.id, code=error_code, message=str(exc))
if failed is None:
logger.info(
- "Job %s no longer in RUNNING -- another worker handled the "
+ "Extraction %s no longer in RUNNING -- another worker handled the "
"terminal transition, skipping our webhook",
- job.id,
+ row.id,
)
return
+ failed_dto = row_to_extraction(failed)
await self._fire_webhook(
- job_id=job.id,
- status=JobStatus.FAILED,
+ event_type=EVENT_TYPE_EXTRACTION_COMPLETED,
+ extraction=failed_dto,
result=None,
- metadata=job.metadata_json or {},
- callback_url=job.callback_url,
- error_code=error_code,
- error_message=str(exc),
- correlation=_extract_correlation(job.metadata_json),
- started_at=getattr(job, "started_at", None),
- finished_at=datetime.now(UTC),
- attempts=attempts,
+ metadata=row.metadata_json or {},
+ callback_url=row.callback_url,
+ correlation=_extract_correlation(row.metadata_json),
)
else:
delay = self._backoff_delay(attempts)
logger.warning(
- "Job %s failed attempt %d (%s); re-publishing in %.1fs",
- job.id,
+ "Extraction %s failed attempt %d (%s); re-publishing in %.1fs",
+ row.id,
attempts,
exc,
delay,
@@ -311,14 +330,14 @@ async def _process(self, job_id: str) -> None:
# cancel won the race against our claim, or another
# worker took over after our lease expired) we skip the
# republish: someone else owns the next step.
- requeued = await self._repository.requeue_for_retry(job.id)
+ requeued = await self._repository.requeue_for_retry(row.id)
if requeued is None:
logger.info(
- "Job %s not requeueable (status changed under us) -- skipping retry publish",
- job.id,
+ "Extraction %s not requeueable (status changed under us) -- skipping retry publish",
+ row.id,
)
else:
- asyncio.create_task(self._delayed_publish(job.id, delay))
+ asyncio.create_task(self._delayed_publish(row.id, delay))
def _backoff_delay(self, attempts: int) -> float:
"""Capped exponential backoff with a 20% jitter."""
@@ -329,55 +348,57 @@ def _backoff_delay(self, attempts: int) -> float:
jitter = capped * 0.2 * random.random()
return capped + jitter
- async def _delayed_publish(self, job_id: str, delay_s: float) -> None:
+ async def _delayed_publish(self, extraction_id: str, delay_s: float) -> None:
try:
await asyncio.sleep(delay_s)
- # We don't have the original correlation context here (this
- # runs in a detached task after the handler returned), so
- # emit the envelope without correlation. The dedupe-by-event-id
- # guarantee still holds for clients tracking re-deliveries.
- republish_event = IDPJobSubmittedEvent(
- job_id=job_id,
- attempt=2, # any republish is at least attempt 2 from the worker's POV
+ # Re-resolve the row so we can publish a fresh envelope
+ # carrying the current resource snapshot.
+ row = await self._repository.get(extraction_id)
+ if row is None:
+ logger.warning("Delayed republish: extraction %s vanished", extraction_id)
+ return
+ envelope = EventEnvelope(
+ event_type=EVENT_TYPE_EXTRACTION_SUBMITTED,
+ extraction=row_to_extraction(row),
)
await self._publisher.publish(
destination=self._settings.jobs_topic,
- event_type=self._settings.jobs_event_type,
- payload=envelope_for_publish(republish_event),
+ event_type=EVENT_TYPE_EXTRACTION_SUBMITTED,
+ payload=envelope_for_publish(envelope),
)
log_outbound(
"eda",
op="republish",
status="ok",
latency_ms=delay_s * 1000,
- job_id=job_id,
+ extraction_id=extraction_id,
)
except Exception as exc: # noqa: BLE001
- logger.error("Failed to re-publish job %s after backoff: %s", job_id, exc)
+ logger.error("Failed to re-publish extraction %s after backoff: %s", extraction_id, exc)
- def _build_request(self, job: Any) -> ExtractionRequest:
- schema = job.schema_json or {}
+ def _build_request(self, row: Any) -> ExtractionRequest:
+ schema = row.schema_json or {}
intention = schema.get("intention", "Extract structured data from the document.")
- docs = [DocSpec.model_validate(d) for d in schema.get("docs", [])]
+ document_types = [DocumentTypeSpec.model_validate(d) for d in schema.get("document_types", [])]
rules = [RuleSpec.model_validate(r) for r in schema.get("rules", [])]
- options = ExtractionOptions.model_validate(job.options_json or {})
- documents_payload = schema.get("documents") or []
- if not documents_payload:
+ options = ExtractionOptions.model_validate(row.options_json or {})
+ files_payload = schema.get("files") or []
+ if not files_payload:
raise ValueError(
- f"job {job.id} schema_json missing 'documents' — cannot rebuild ExtractionRequest"
+ f"extraction {row.id} schema_json missing 'files' — cannot rebuild ExtractionRequest"
)
return ExtractionRequest(
intention=intention,
- documents=[
- DocumentInput(
- filename=d.get("filename", job.filename),
+ files=[
+ FileInput(
+ filename=d.get("filename", row.filename),
content_base64=d.get("content_base64", ""),
content_type=d.get("content_type"),
- document_type=d.get("document_type"),
+ expected_type=d.get("expected_type"),
)
- for d in documents_payload
+ for d in files_payload
],
- docs=docs,
+ document_types=document_types,
rules=rules,
options=options,
)
@@ -385,37 +406,46 @@ def _build_request(self, job: Any) -> ExtractionRequest:
async def _fire_webhook(
self,
*,
- job_id: str,
- status: JobStatus,
+ event_type: str,
+ extraction: Extraction,
result: ExtractionResult | None,
metadata: dict[str, Any],
callback_url: str | None,
- error_code: str | None = None,
- error_message: str | None = None,
correlation: dict[str, str] | None = None,
- started_at: datetime | None = None,
- finished_at: datetime | None = None,
- attempts: int = 1,
) -> None:
if not callback_url:
return
clean_metadata = {k: v for k, v in (metadata or {}).items() if not k.startswith("_")}
corr = correlation or {}
- payload = JobWebhookPayload(
- job_id=job_id,
- status=status,
+ envelope = EventEnvelope(
+ event_type=event_type,
occurred_at=datetime.now(UTC),
- started_at=started_at,
- finished_at=finished_at,
- attempts=attempts,
correlation_id=corr.get("X-Correlation-Id"),
tenant_id=corr.get("X-Tenant-Id"),
+ extraction=extraction,
+ result=result if extraction.status == ExtractionStatus.SUCCEEDED else None,
metadata=clean_metadata,
- result=result,
- error_code=error_code,
- error_message=error_message,
)
- await self._webhook.deliver(callback_url, payload, extra_headers=corr)
+ await self._webhook.deliver(callback_url, envelope, extra_headers=corr)
+
+
+def _extraction_id_from_payload(payload: Any) -> str | None:
+ """Pull the extraction id out of an inbound EDA payload.
+
+ Accepts both the v1 ``EventEnvelope`` shape (``extraction.id``) and
+ the bare ``extraction_id`` / ``job_id`` keys still produced by
+ legacy republishers on the bus during the migration window.
+ """
+ if not isinstance(payload, dict):
+ return None
+ extraction = payload.get("extraction")
+ if isinstance(extraction, dict) and extraction.get("id"):
+ return str(extraction["id"])
+ for key in ("extraction_id", "job_id"):
+ value = payload.get(key)
+ if value:
+ return str(value)
+ return None
def _extract_correlation(metadata: dict[str, Any] | None) -> dict[str, str]:
@@ -426,3 +456,11 @@ def _extract_correlation(metadata: dict[str, Any] | None) -> dict[str, str]:
if not isinstance(raw, dict):
return {}
return {str(k): str(v) for k, v in raw.items() if v}
+
+
+# Backwards-compat alias for callers (CLI, tests) that still import the
+# old name. New code should use :class:`ExtractionWorker`.
+JobWorker = ExtractionWorker
+
+
+__all__ = ["ExtractionWorker", "JobWorker"]
diff --git a/src/flydocs/interfaces/dtos/__init__.py b/src/flydocs/interfaces/dtos/__init__.py
index 200154a..e158501 100644
--- a/src/flydocs/interfaces/dtos/__init__.py
+++ b/src/flydocs/interfaces/dtos/__init__.py
@@ -5,83 +5,129 @@
ContentAuthenticity,
ContentCoherenceCheck,
DocumentAuthenticity,
- VisualValidationOutcome,
+ VisualCheckResult,
)
-from flydocs.interfaces.dtos.bbox import BoundingBox
-from flydocs.interfaces.dtos.doc import DocSpec, DocType, ValidatorsSpec, VisualValidatorSpec
+from flydocs.interfaces.dtos.bbox import BboxQuality, BboxSource, BoundingBox
+from flydocs.interfaces.dtos.document_type import DocumentTypeSpec, VisualCheck
from flydocs.interfaces.dtos.error import ProblemDetails
+from flydocs.interfaces.dtos.event import (
+ ALL_EVENT_TYPES,
+ EVENT_TYPE_EXTRACTION_COMPLETED,
+ EVENT_TYPE_EXTRACTION_POST_PROCESSING_COMPLETED,
+ EVENT_TYPE_EXTRACTION_POST_PROCESSING_REQUESTED,
+ EVENT_TYPE_EXTRACTION_SUBMITTED,
+ EventEnvelope,
+ envelope_for_publish,
+)
from flydocs.interfaces.dtos.extract import (
- DocumentInfo,
- DocumentInput,
- ExtractedDocument,
+ ClassificationInfo,
+ Document,
+ EscalationConfig,
+ EscalationInfo,
ExtractionOptions,
ExtractionRequest,
ExtractionResult,
+ FileInput,
+ FileSummary,
+ PipelineError,
+ PipelineMeta,
StageToggles,
+ TraceEntry,
+ UsageBreakdown,
+)
+from flydocs.interfaces.dtos.extraction import (
+ BboxRefinementInfo,
+ Extraction,
+ ExtractionError,
+ ExtractionListQuery,
+ ExtractionListResponse,
+ ExtractionResultEnvelope,
+ PostProcessing,
+ SubmitExtractionRequest,
)
from flydocs.interfaces.dtos.field import (
ExtractedField,
ExtractedFieldGroup,
+ Field,
FieldGroup,
- FieldItem,
- FieldSpec,
FieldValidation,
FieldValidationError,
JudgeOutcome,
)
-from flydocs.interfaces.dtos.job import (
- JobResult,
- JobStatusResponse,
- SubmitJobRequest,
- SubmitJobResponse,
-)
from flydocs.interfaces.dtos.rule import (
RuleFieldParent,
RuleOutputSpec,
+ RuleParent,
RuleResult,
RuleRuleParent,
RuleSpec,
RuleValidatorParent,
)
-from flydocs.interfaces.dtos.standard_validator import StandardValidatorSpec
-from flydocs.interfaces.dtos.webhook import JobWebhookPayload
+from flydocs.interfaces.dtos.transformation import (
+ EntityResolutionTransformation,
+ LlmTransformation,
+ Transformation,
+ TransformationScope,
+)
+from flydocs.interfaces.dtos.validator import ValidatorSpec
__all__ = [
+ "ALL_EVENT_TYPES",
+ "BboxQuality",
+ "BboxRefinementInfo",
+ "BboxSource",
"BoundingBox",
+ "ClassificationInfo",
"ContentAuthenticity",
"ContentCoherenceCheck",
- "DocSpec",
- "DocType",
+ "Document",
"DocumentAuthenticity",
- "DocumentInfo",
- "DocumentInput",
- "ExtractedDocument",
- "ExtractedField",
- "ExtractedFieldGroup",
+ "DocumentTypeSpec",
+ "EVENT_TYPE_EXTRACTION_COMPLETED",
+ "EVENT_TYPE_EXTRACTION_POST_PROCESSING_COMPLETED",
+ "EVENT_TYPE_EXTRACTION_POST_PROCESSING_REQUESTED",
+ "EVENT_TYPE_EXTRACTION_SUBMITTED",
+ "EntityResolutionTransformation",
+ "EscalationConfig",
+ "EscalationInfo",
+ "EventEnvelope",
+ "Extraction",
+ "ExtractionError",
+ "ExtractionListQuery",
+ "ExtractionListResponse",
"ExtractionOptions",
"ExtractionRequest",
"ExtractionResult",
+ "ExtractionResultEnvelope",
+ "ExtractedField",
+ "ExtractedFieldGroup",
+ "Field",
"FieldGroup",
- "FieldItem",
- "FieldSpec",
"FieldValidation",
"FieldValidationError",
- "JobResult",
- "JobStatusResponse",
- "JobWebhookPayload",
+ "FileInput",
+ "FileSummary",
"JudgeOutcome",
+ "LlmTransformation",
+ "PipelineError",
+ "PipelineMeta",
+ "PostProcessing",
"ProblemDetails",
"RuleFieldParent",
"RuleOutputSpec",
+ "RuleParent",
"RuleResult",
"RuleRuleParent",
"RuleSpec",
"RuleValidatorParent",
"StageToggles",
- "StandardValidatorSpec",
- "SubmitJobRequest",
- "SubmitJobResponse",
- "ValidatorsSpec",
- "VisualValidationOutcome",
- "VisualValidatorSpec",
+ "SubmitExtractionRequest",
+ "TraceEntry",
+ "Transformation",
+ "TransformationScope",
+ "UsageBreakdown",
+ "ValidatorSpec",
+ "VisualCheck",
+ "VisualCheckResult",
+ "envelope_for_publish",
]
diff --git a/src/flydocs/interfaces/dtos/authenticity.py b/src/flydocs/interfaces/dtos/authenticity.py
index fda3e18..8ae79b7 100644
--- a/src/flydocs/interfaces/dtos/authenticity.py
+++ b/src/flydocs/interfaces/dtos/authenticity.py
@@ -3,27 +3,35 @@
from __future__ import annotations
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, ConfigDict, Field
from flydocs.interfaces.enums.status import CheckStatus, ContentIntegrityStatus
-class VisualValidationOutcome(BaseModel):
+class VisualCheckResult(BaseModel):
+ """One visual check's outcome on a document."""
+
+ model_config = ConfigDict(extra="forbid")
+
name: str
passed: bool
confidence: float = Field(default=0.0, ge=0.0, le=1.0)
- notes: str = ""
+ notes: str | None = None
class ContentCoherenceCheck(BaseModel):
+ model_config = ConfigDict(extra="forbid")
+
name: str
description: str
status: CheckStatus
- evidence: str = ""
- reasoning: str = ""
+ evidence: str | None = None
+ reasoning: str | None = None
class ContentAuthenticity(BaseModel):
+ model_config = ConfigDict(extra="forbid")
+
overall_integrity_status: ContentIntegrityStatus = ContentIntegrityStatus.UNCERTAIN
checks: list[ContentCoherenceCheck] = Field(default_factory=list)
@@ -31,5 +39,7 @@ class ContentAuthenticity(BaseModel):
class DocumentAuthenticity(BaseModel):
"""Aggregated authenticity result for a single document instance."""
- visual: list[VisualValidationOutcome] = Field(default_factory=list)
- content: ContentAuthenticity = Field(default_factory=ContentAuthenticity)
+ model_config = ConfigDict(extra="forbid")
+
+ visual: list[VisualCheckResult] = Field(default_factory=list)
+ content: ContentAuthenticity | None = None
diff --git a/src/flydocs/interfaces/dtos/bbox.py b/src/flydocs/interfaces/dtos/bbox.py
index 55fef3b..c6754a5 100644
--- a/src/flydocs/interfaces/dtos/bbox.py
+++ b/src/flydocs/interfaces/dtos/bbox.py
@@ -1,27 +1,26 @@
# Copyright 2026 Firefly Software Solutions Inc
"""Bounding box in normalised image-space coordinates.
-All values are floats in ``[0, 1]``. ``(0, 0)`` is the top-left of the
-rendered page; ``(1, 1)`` is the bottom-right. The contract is enforced
-both by the prompt sent to the LLM and by post-processing in
-:mod:`flydocs.core.services.extraction.postprocess`.
-
-The :class:`BboxSource` discriminator tells callers how each coordinate
-set was produced:
-
-* ``llm`` -- the multimodal model's visual estimate (default for
- every extraction). Imprecise: the box lands in the
- right region of the page but is routinely off by a
- line or more.
+All values are floats in [0, 1]. (0, 0) is the top-left of the rendered
+page; (1, 1) is the bottom-right.
+
+Absence of a bbox is represented by ``null`` at the consuming field site
+(``ExtractedField.bbox = None``) — there is no synthetic "empty" placeholder
+box in v1.
+
+The :class:`BboxSource` discriminator tells callers how each coordinate set
+was produced:
+
+* ``llm`` -- multimodal model's visual estimate (default for every
+ first-pass extraction). Imprecise: lands in the right
+ region but is routinely off by a line or more.
* ``pdf_text`` -- grounded against the PDF's text layer via PyMuPDF.
Sub-pixel accurate.
-* ``ocr`` -- grounded against an OCR word stream (PaddleOCR /
- Mistral OCR / etc.) for image-PDFs and raster
- inputs. Accuracy depends on the engine.
-* ``none`` -- no bbox was produced (placeholder field).
+* ``ocr`` -- grounded against an OCR word stream for image-PDFs and
+ raster inputs. Accuracy depends on the engine.
-The refinement runs as the optional ``bbox_refine`` pipeline stage --
-opt-in via ``ExtractionOptions.stages.bbox_refine`` -- and replaces the
+The refinement runs as the optional ``bbox_refine`` pipeline stage —
+opt-in via ``ExtractionOptions.stages.bbox_refine`` — and replaces the
LLM coordinates with the tight word-union when a fuzzy match is found
above the configured threshold; otherwise the LLM bbox is kept,
tagged ``source=llm, refinement_confidence=null``.
@@ -31,90 +30,39 @@
from enum import StrEnum
-from pydantic import BaseModel, Field, model_validator
+from pydantic import BaseModel, ConfigDict, Field, model_validator
class BboxQuality(StrEnum):
- """Coarse-grained verdict on whether a bbox is trustworthy.
-
- Stamped by :class:`BboxValidator` after extraction. ``invalid``
- means the box is geometrically broken or the field has no value.
- ``empty`` is the placeholder for fields the LLM didn't locate.
- """
+ """Coarse-grained verdict on whether a bbox is trustworthy."""
GOOD = "good"
POOR = "poor"
SUSPICIOUS = "suspicious"
INVALID = "invalid"
- EMPTY = "empty"
class BboxSource(StrEnum):
- """How the coordinates on this bbox were produced.
-
- See the module docstring for the full lifecycle. The discriminator
- lets strict callers filter to grounded-only boxes (``pdf_text`` /
- ``ocr``) and treat ``llm`` boxes as approximate region hints.
- """
+ """How the coordinates on this bbox were produced."""
LLM = "llm"
PDF_TEXT = "pdf_text"
OCR = "ocr"
- NONE = "none"
class BoundingBox(BaseModel):
"""Normalised rectangle on a single page."""
+ model_config = ConfigDict(extra="forbid")
+
xmin: float = Field(..., ge=0.0, le=1.0, description="Left edge in [0, 1].")
ymin: float = Field(..., ge=0.0, le=1.0, description="Top edge in [0, 1].")
xmax: float = Field(..., ge=0.0, le=1.0, description="Right edge in [0, 1].")
ymax: float = Field(..., ge=0.0, le=1.0, description="Bottom edge in [0, 1].")
- quality: BboxQuality | None = Field(
- default=None,
- description=(
- "Geometric verdict on whether the bbox looks plausibly real "
- "or like an LLM hallucination. Populated by the bbox "
- "validator that runs at the end of the pipeline; ``null`` "
- "means the validator hasn't run for this field yet."
- ),
- )
- quality_score: float = Field(
- default=0.0,
- ge=0.0,
- le=1.0,
- description=(
- "Continuous geometric quality score in ``[0, 1]``. Combines "
- "area, aspect ratio, and edge sanity. 0.0 for empty / "
- "missing boxes; ~1.0 for boxes that fall in a plausible "
- "text-bounding region. **Does NOT mean the box actually "
- "fences the real text** -- it only means the shape is "
- "plausible. See the module docstring on LLM bbox imprecision."
- ),
- )
- source: BboxSource | None = Field(
- default=None,
- description=(
- "Origin of the coordinates: ``llm`` (multimodal model "
- "estimate, imprecise), ``pdf_text`` (grounded via PyMuPDF "
- "against the PDF text layer, sub-pixel accurate), ``ocr`` "
- "(grounded via an OCR engine for image-PDFs and rasters), "
- "or ``none`` (no bbox produced). ``null`` means the bbox "
- "refiner has not run for this field yet."
- ),
- )
- refinement_confidence: float | None = Field(
- default=None,
- ge=0.0,
- le=1.0,
- description=(
- "Fuzzy-match score from the bbox refiner in ``[0, 1]``. "
- "Populated when ``source in {pdf_text, ocr}`` and the "
- "matcher located the value above its threshold. ``null`` "
- "for ``source in {llm, none}`` (the LLM doesn't expose "
- "a comparable score)."
- ),
- )
+ quality: BboxQuality | None = None
+ quality_score: float = Field(default=0.0, ge=0.0, le=1.0)
+ source: BboxSource | None = None
+ refinement_confidence: float | None = Field(default=None, ge=0.0, le=1.0)
@model_validator(mode="after")
def _validate_corners(self) -> BoundingBox:
@@ -123,22 +71,3 @@ def _validate_corners(self) -> BoundingBox:
if self.ymin >= self.ymax:
raise ValueError("ymin must be strictly less than ymax")
return self
-
- @classmethod
- def empty(cls) -> BoundingBox:
- """A degenerate placeholder used when a field is not found.
-
- Returning an explicit zero-area box keeps the response schema
- stable: every field carries the same shape whether or not it
- was located in the document.
- """
- # Use 0..eps so the post-validator does not reject it.
- return cls(
- xmin=0.0,
- ymin=0.0,
- xmax=1e-9,
- ymax=1e-9,
- quality=BboxQuality.EMPTY,
- quality_score=0.0,
- source=BboxSource.NONE,
- )
diff --git a/src/flydocs/interfaces/dtos/doc.py b/src/flydocs/interfaces/dtos/doc.py
deleted file mode 100644
index 89fbf5a..0000000
--- a/src/flydocs/interfaces/dtos/doc.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# Copyright 2026 Firefly Software Solutions Inc
-"""Doc-type / validator DTOs -- what each expected document looks like."""
-
-from __future__ import annotations
-
-from pydantic import BaseModel, ConfigDict, Field
-
-from flydocs.interfaces.dtos.field import FieldGroup
-
-
-class DocType(BaseModel):
- documentType: str = Field(
- ..., min_length=1, description="Stable id for this document type (e.g. ``passport``)."
- )
- description: str = ""
- country: str = Field(default="", description="ISO 3166-1 alpha-2 country code.")
-
-
-class VisualValidatorSpec(BaseModel):
- """One visual check to run against the document (e.g. signature, watermark)."""
-
- name: str = Field(..., min_length=1)
- description: str
-
-
-class ValidatorsSpec(BaseModel):
- """Bundle of validator definitions for a single document type.
-
- Currently only visual validators are exposed publicly; future
- additions (audio, structural) plug in here.
- """
-
- visual: list[VisualValidatorSpec] = Field(default_factory=list)
-
-
-class DocSpec(BaseModel):
- """One expected document type the caller is submitting fields / validators for."""
-
- model_config = ConfigDict(populate_by_name=True)
-
- docType: DocType
- fieldGroups: list[FieldGroup] = Field(..., min_length=1)
- validators: ValidatorsSpec = Field(default_factory=ValidatorsSpec)
diff --git a/src/flydocs/interfaces/dtos/document_type.py b/src/flydocs/interfaces/dtos/document_type.py
new file mode 100644
index 0000000..df6f138
--- /dev/null
+++ b/src/flydocs/interfaces/dtos/document_type.py
@@ -0,0 +1,34 @@
+# Copyright 2026 Firefly Software Solutions Inc
+"""DocumentTypeSpec -- schema template for one expected document type.
+
+Replaces the v0 ``DocSpec`` and the nested ``DocType`` envelope, flattening
+``docs[i].docType.documentType`` (three layers of "doc" stutter) into
+``document_types[i].id`` (one identifier).
+"""
+
+from __future__ import annotations
+
+from pydantic import BaseModel, ConfigDict, Field
+
+from flydocs.interfaces.dtos.field import FieldGroup
+
+
+class VisualCheck(BaseModel):
+ """One visual check to run against the document (signature, watermark, seal, ...)."""
+
+ model_config = ConfigDict(extra="forbid")
+
+ name: str = Field(..., min_length=1)
+ description: str
+
+
+class DocumentTypeSpec(BaseModel):
+ """One expected document type the caller is submitting fields for."""
+
+ model_config = ConfigDict(extra="forbid")
+
+ id: str = Field(..., min_length=1, description="Stable id (e.g. 'invoice', 'passport').")
+ description: str | None = None
+ country: str | None = Field(default=None, description="ISO 3166-1 alpha-2 country code.")
+ field_groups: list[FieldGroup] = Field(..., min_length=1)
+ visual_checks: list[VisualCheck] = Field(default_factory=list)
diff --git a/src/flydocs/interfaces/dtos/event.py b/src/flydocs/interfaces/dtos/event.py
index e3f6732..b2f339d 100644
--- a/src/flydocs/interfaces/dtos/event.py
+++ b/src/flydocs/interfaces/dtos/event.py
@@ -1,41 +1,38 @@
# Copyright 2026 Firefly Software Solutions Inc
-"""Typed EDA event envelopes published / consumed by flydocs.
-
-Every event the service publishes carries:
-
-* ``event_id`` -- a fresh UUID v4. Lets clients dedupe at-least-once
- deliveries, correlate webhook callbacks with outbox rows, and
- reference a specific notification in audit trails.
-* ``event_type`` -- the same constant string the EDA bus uses for
- routing (``IDPJobSubmitted``, ``IDPJobCompleted``,
- ``IDPBboxRefineRequested``, ``IDPBboxRefineCompleted``).
-* ``version`` -- semver-style payload version. Bump when you change
- the payload shape in a non-backwards-compatible way so consumers
- can branch on it.
-* ``occurred_at`` -- UTC ISO-8601 timestamp; when the event was
- produced by the originating service.
-* ``correlation_id`` -- request-level correlation that propagates
- through the whole pipeline. Echoes ``X-Correlation-Id``.
-* Type-specific payload fields (``job_id``, optionally ``attempt``,
- ``status``, error info, …).
-
-Consumers of the async API (webhooks, the EDA workers themselves)
-read the typed envelope rather than the raw dict — see
-``JobWorker._on_event`` and ``BboxRefineWorker._on_event``. The
-webhook payload now also embeds the envelope under
-``event`` so external clients get the same audit surface the
-internal workers see.
+"""Unified event + webhook envelope.
+
+The same :class:`EventEnvelope` shape is published over the EDA bus
+(Postgres LISTEN/NOTIFY, Kafka, Redis, in-memory) and posted to webhook
+``callback_url`` endpoints. Operators see a single mental model in logs, in broker
+UIs, and in receiving webhook handlers.
+
+Event types are dotted snake_case — the only intentional exception to the
+"flat snake_case enums" convention, because dots are the de facto routing
+convention for Kafka topics, EventBridge buses, and CloudEvents.
"""
from __future__ import annotations
import uuid
from datetime import UTC, datetime
-from typing import Annotated, Any, Literal
+from typing import Any
from pydantic import BaseModel, ConfigDict, Field
-from flydocs.interfaces.enums.job_status import JobStatus
+from flydocs.interfaces.dtos.extract import ExtractionResult
+from flydocs.interfaces.dtos.extraction import Extraction
+
+EVENT_TYPE_EXTRACTION_SUBMITTED = "extraction.submitted"
+EVENT_TYPE_EXTRACTION_COMPLETED = "extraction.completed"
+EVENT_TYPE_EXTRACTION_POST_PROCESSING_REQUESTED = "extraction.post_processing.requested"
+EVENT_TYPE_EXTRACTION_POST_PROCESSING_COMPLETED = "extraction.post_processing.completed"
+
+ALL_EVENT_TYPES = (
+ EVENT_TYPE_EXTRACTION_SUBMITTED,
+ EVENT_TYPE_EXTRACTION_COMPLETED,
+ EVENT_TYPE_EXTRACTION_POST_PROCESSING_REQUESTED,
+ EVENT_TYPE_EXTRACTION_POST_PROCESSING_COMPLETED,
+)
def _now_utc() -> datetime:
@@ -46,150 +43,43 @@ def _new_event_id() -> str:
return str(uuid.uuid4())
-class _BaseEvent(BaseModel):
- """Shared envelope: identity, lifecycle, correlation."""
-
- model_config = ConfigDict(populate_by_name=True)
-
- event_id: str = Field(
- default_factory=_new_event_id,
- description="Unique UUID v4 identifier for this event instance.",
- )
- event_type: str = Field(
- description=(
- "EDA topic constant — must match "
- ":class:`IDPSettings.jobs_event_type` / "
- "``bbox_refine_event_type`` / ``jobs_completed_event_type``."
- ),
- )
- version: str = Field(
- default="1.0.0",
- description=(
- "Semver-style payload version. Consumers should compare "
- "the major component to decide if they understand the "
- "shape; minor / patch are backwards-compatible."
- ),
- )
- occurred_at: datetime = Field(
- default_factory=_now_utc,
- description="UTC timestamp at which the producing service emitted this event.",
- )
- correlation_id: str | None = Field(
- default=None,
- description=(
- "Request-level correlation id propagated through every "
- "stage of the pipeline. Mirrors the value of the inbound "
- "``X-Correlation-Id`` / ``traceparent`` headers when set."
- ),
- )
- tenant_id: str | None = Field(
- default=None,
- description="Optional tenant identifier (echoes ``X-Tenant-Id`` when present).",
- )
-
-
-class IDPJobSubmittedEvent(_BaseEvent):
- """Published by ``SubmitJobHandler`` after the job row is persisted.
-
- Triggers ``JobWorker._on_event`` -> ``_process``.
- Re-published by ``JobWorker._delayed_publish`` with the same
- payload shape during retry back-off (``attempt > 1``).
- """
-
- event_type: Literal["IDPJobSubmitted"] = "IDPJobSubmitted"
- job_id: str = Field(description="Stable UUID of the :class:`ExtractionJob` row.")
- attempt: int = Field(
- default=1,
- ge=1,
- description="1 on first submission, increments on each retry republish.",
- )
- submitted_at: datetime = Field(
- default_factory=_now_utc,
- description=(
- "Persisted ``ExtractionJob.created_at``. May differ from "
- "``occurred_at`` when this is a retry republish (occurred "
- "is *now*, submitted is the original submission time)."
- ),
- )
-
-
-class IDPJobCompletedEvent(_BaseEvent):
- """Published when a job reaches a terminal state.
-
- Terminal here means any of ``SUCCEEDED``, ``PARTIAL_SUCCEEDED``,
- ``FAILED``, ``CANCELLED``. Webhook subscribers receive an envelope
- that wraps this event for parity with the EDA bus.
- """
+class EventEnvelope(BaseModel):
+ """Shared envelope for EDA events and webhook deliveries.
- event_type: Literal["IDPJobCompleted"] = "IDPJobCompleted"
- job_id: str
- status: JobStatus
- started_at: datetime | None = Field(
- default=None,
- description="When the worker first picked the job up.",
- )
- finished_at: datetime | None = Field(
- default=None,
- description="Terminal state timestamp.",
- )
- attempts: int = Field(
- default=1,
- ge=1,
- description="Total attempts consumed before reaching this terminal state.",
- )
- error_code: str | None = None
- error_message: str | None = None
-
-
-class IDPBboxRefineRequestedEvent(_BaseEvent):
- """Fan-out event the main worker emits when ``stages.bbox_refine`` is on.
-
- ``BboxRefineWorker._on_event`` consumes it and grounds the bboxes
- out-of-band, then publishes :class:`IDPBboxRefineCompletedEvent`
- once it finishes (success or terminal failure).
+ ``extraction`` carries a current-state snapshot of the resource.
+ ``result`` is populated only on ``extraction.completed`` events when
+ the terminal status is ``succeeded``; null otherwise.
"""
- event_type: Literal["IDPBboxRefineRequested"] = "IDPBboxRefineRequested"
- job_id: str
- attempt: int = Field(default=1, ge=1)
-
-
-class IDPBboxRefineCompletedEvent(_BaseEvent):
- """Emitted by ``BboxRefineWorker`` after refinement settles."""
+ model_config = ConfigDict(extra="forbid")
- event_type: Literal["IDPBboxRefineCompleted"] = "IDPBboxRefineCompleted"
- job_id: str
- status: Literal["succeeded", "failed"]
- started_at: datetime | None = None
- finished_at: datetime | None = None
- attempts: int = Field(default=1, ge=1)
- error_code: str | None = None
- error_message: str | None = None
-
-
-# Discriminated union of every event the service can produce or consume.
-IDPEvent = Annotated[
- IDPJobSubmittedEvent | IDPJobCompletedEvent | IDPBboxRefineRequestedEvent | IDPBboxRefineCompletedEvent,
- Field(discriminator="event_type"),
-]
+ event_id: str = Field(default_factory=_new_event_id)
+ event_type: str
+ version: str = "1.0.0"
+ occurred_at: datetime = Field(default_factory=_now_utc)
+ correlation_id: str | None = None
+ tenant_id: str | None = None
+ extraction: Extraction
+ result: ExtractionResult | None = None
+ metadata: dict[str, Any] = Field(default_factory=dict)
-def envelope_for_publish(event: _BaseEvent) -> dict[str, Any]:
- """Serialise an event for ``EventPublisher.publish(payload=...)``.
+def envelope_for_publish(env: EventEnvelope) -> dict[str, Any]:
+ """Serialise an envelope for :class:`EventPublisher.publish` payloads.
- ``mode="json"`` so the datetime turns into ISO strings, the UUID
- into its hex form, and pydantic does the right thing for the
- enums. ``by_alias=True`` so we emit the canonical field names
- even when callers register aliases on subclasses.
+ ``mode="json"`` so datetimes become ISO strings and enums become their
+ string values. ``by_alias=True`` so that any field declaring a pydantic
+ alias is emitted under that alias rather than its attribute name.
"""
- return event.model_dump(mode="json", by_alias=True)
+ return env.model_dump(mode="json", by_alias=True)
__all__ = [
- "IDPBboxRefineCompletedEvent",
- "IDPBboxRefineRequestedEvent",
- "IDPEvent",
- "IDPJobCompletedEvent",
- "IDPJobSubmittedEvent",
+ "ALL_EVENT_TYPES",
+ "EVENT_TYPE_EXTRACTION_COMPLETED",
+ "EVENT_TYPE_EXTRACTION_POST_PROCESSING_COMPLETED",
+ "EVENT_TYPE_EXTRACTION_POST_PROCESSING_REQUESTED",
+ "EVENT_TYPE_EXTRACTION_SUBMITTED",
+ "EventEnvelope",
"envelope_for_publish",
]
diff --git a/src/flydocs/interfaces/dtos/extract.py b/src/flydocs/interfaces/dtos/extract.py
index a2fd95a..3699b21 100644
--- a/src/flydocs/interfaces/dtos/extract.py
+++ b/src/flydocs/interfaces/dtos/extract.py
@@ -1,70 +1,67 @@
# Copyright 2026 Firefly Software Solutions Inc
"""Top-level request / response DTOs for the public extraction API.
-One :class:`ExtractionRequest` carries the document, the schema (one or
-more :class:`DocSpec`), optional business rules, and a set of stage
-toggles. The :class:`ExtractionResult` returned to the caller folds
-every stage's output into a single object: extracted fields with
-bounding boxes, field-validation verdicts, visual / content
-authenticity outcomes, judge verdicts, and rule results.
+One :class:`ExtractionRequest` carries the input files, the schema templates
+(one or more :class:`DocumentTypeSpec`), optional business rules, and a set
+of stage toggles. The :class:`ExtractionResult` returned to the caller folds
+every stage's output into a single object: extracted fields with bounding
+boxes, field-validation verdicts, visual / content authenticity outcomes,
+judge verdicts, and rule results.
"""
from __future__ import annotations
import base64
-import uuid
from datetime import datetime
-from typing import Any
+from typing import Any, Literal
from pydantic import BaseModel, ConfigDict, Field, field_validator
from flydocs.interfaces.dtos.authenticity import DocumentAuthenticity
-from flydocs.interfaces.dtos.doc import DocSpec
+from flydocs.interfaces.dtos.document_type import DocumentTypeSpec
from flydocs.interfaces.dtos.field import ExtractedFieldGroup
from flydocs.interfaces.dtos.rule import RuleResult, RuleSpec
from flydocs.interfaces.dtos.transformation import Transformation
# ---------------------------------------------------------------------------
-# Document input
+# FileInput (request)
# ---------------------------------------------------------------------------
-class DocumentInput(BaseModel):
- """The document payload provided by the caller (binary, base64-encoded).
+class FileInput(BaseModel):
+ """One input file for an extraction request.
- A request can carry a single file (``document``) or several
- (``documents``). When several are provided the caller may pin each
- one to a target ``document_type`` directly; otherwise the
- classifier stage decides which ``DocSpec`` applies to which file.
+ JSON mode: caller sets ``content_base64`` (raw base64 or a ``data:`` URL).
+ Multipart mode: the binary rides in a separate file part; ``content_base64``
+ is absent and ``filename`` / ``content_type`` come from the part headers.
"""
+ model_config = ConfigDict(extra="forbid")
+
filename: str = Field(..., min_length=1)
- content_base64: str = Field(
- ...,
+ content_base64: str | None = Field(
+ default=None,
description=(
- "Base64-encoded document bytes. Any media type the configured "
- "multimodal LLM accepts works (PDF, PNG, JPEG, WebP, TIFF, DOCX, ...). "
- "Data URLs (``data:application/pdf;base64,...``) are accepted -- the "
- "``data:`` prefix is stripped server-side."
+ "Base64-encoded document bytes (or ``data:;base64,...`` "
+ "data URL — the prefix is stripped server-side). Absent in multipart "
+ "mode."
),
)
- content_type: str | None = Field(
- default=None,
- description=("Optional MIME type hint. When omitted, the service sniffs from magic bytes."),
- )
- document_type: str | None = Field(
+ content_type: str | None = Field(default=None, description="MIME hint; sniffed when omitted.")
+ expected_type: str | None = Field(
default=None,
description=(
- "When the caller knows which DocSpec this file matches, set it "
- "here (e.g. ``passport``). Skips the classifier for this file. "
- "Must match a ``docs[].docType.documentType`` declared in the "
- "request -- the semantic validator rejects unknown values."
+ "Optional caller hint pointing at one of the declared "
+ "``document_types[].id`` values. Skips the classifier for this "
+ "file when the classifier stage is enabled."
),
)
@field_validator("content_base64")
@classmethod
- def _validate_base64(cls, value: str) -> str:
+ def _validate_base64(cls, value: str | None) -> str | None:
+ if value is None:
+ return None
if "," in value and value.startswith("data:"):
value = value.split(",", 1)[1]
try:
@@ -74,150 +71,79 @@ def _validate_base64(cls, value: str) -> str:
return value
def decoded_bytes(self) -> bytes:
+ if self.content_base64 is None:
+ raise ValueError("FileInput.content_base64 is not set (multipart mode)")
return base64.b64decode(self.content_base64)
# ---------------------------------------------------------------------------
-# Pipeline options
+# Options
# ---------------------------------------------------------------------------
class StageToggles(BaseModel):
- """Opt-in switches for every optional pipeline stage.
+ """Opt-in switches for every optional pipeline stage."""
- The :class:`MultimodalExtractor` is always on (it's what produces
- fields + bbox). Everything else is opt-in. Defaults are conservative
- so a vanilla request stays cheap and fast.
- """
+ model_config = ConfigDict(extra="forbid")
- splitter: bool = Field(
- default=False,
- description=(
- "Run the LLM document splitter to map each target document type "
- "to a page range. Required when ``docs`` has more than one entry "
- "and the submitted file interleaves them."
- ),
- )
- classifier: bool = Field(
- default=True,
- description=(
- "When the caller submits multiple files via ``documents[]`` and "
- "does NOT pin them with ``document_type``, this stage asks the "
- "LLM to classify each file into one of the declared DocSpecs. "
- "Cheap to leave on -- it's a no-op when every file already "
- "carries a ``document_type``."
- ),
- )
+ splitter: bool = False
+ classifier: bool = True
field_validation: bool = True
visual_authenticity: bool = False
content_authenticity: bool = False
judge: bool = False
+ judge_escalation: bool = False
+ bbox_refine: bool = False
+ transform: bool = False
rule_engine: bool = False
- judge_escalation: bool = Field(
- default=False,
- description=(
- "When the judge marks too many fields as FAIL / flag_for_review, "
- "re-run extract + judge with the escalation_model and keep the "
- "result that has the lower failure rate. Requires ``judge`` to "
- "be enabled. Threshold + model come from ``options`` or env "
- "defaults (``FLYDOCS_ESCALATION_THRESHOLD`` / "
- "``FLYDOCS_ESCALATION_MODEL``)."
- ),
- )
- bbox_refine: bool = Field(
- default=False,
- description=(
- "Replace LLM-estimated bounding boxes with grounded ones by "
- "fuzzy-matching every extracted value against the document's "
- "real text layer (PyMuPDF for PDFs with embedded text; OCR "
- "for image-only pages and raster inputs). Sub-pixel accurate "
- "for born-digital PDFs. Multilingual: script-aware tokenisation "
- "handles Latin / CJK / Arabic / etc. Adds ~50-200ms for a "
- "30-page text PDF; image-PDFs depend on the OCR engine. The "
- "bbox ``source`` discriminator distinguishes refined "
- "(``pdf_text`` / ``ocr``) from LLM-only fallbacks."
- ),
- )
- transform: bool = Field(
- default=False,
- description=(
- "Run the ``transform`` stage. The stage applies every "
- ":class:`Transformation` declared on "
- "``ExtractionOptions.transformations`` -- declarative entity "
- "resolution and/or free-form LLM transformations -- after "
- "extract+judge and before rules/assemble. No-op when the "
- "list is empty even with the toggle on."
- ),
- )
+
+
+class EscalationConfig(BaseModel):
+ """Configuration for the judge_escalation stage.
+
+ Left null on :class:`ExtractionOptions` when judge_escalation is off. When
+ set AND ``stages.judge_escalation`` is true, the rerun fires once the judge's
+ fail-rate crosses ``threshold``.
+ """
+
+ model_config = ConfigDict(extra="forbid")
+
+ threshold: float = Field(..., ge=0.0, le=1.0)
+ model: str = Field(..., min_length=1)
class ExtractionOptions(BaseModel):
- """Per-request knobs."""
+ """Per-request pipeline knobs."""
+
+ model_config = ConfigDict(extra="forbid")
- return_bboxes: bool = True
- language_hint: str | None = Field(default=None, max_length=16)
model: str | None = None
+ language_hint: str | None = Field(default=None, max_length=16)
+ return_bboxes: bool = True
declared_media_type: str | None = None
stages: StageToggles = Field(default_factory=StageToggles)
- escalation_threshold: float | None = Field(
- default=None,
- ge=0.0,
- le=1.0,
- description=(
- "Failure-rate threshold (0.0–1.0) above which the judge "
- "escalation re-run fires. Overrides "
- "``FLYDOCS_ESCALATION_THRESHOLD`` for this request."
- ),
- )
- escalation_model: str | None = Field(
- default=None,
- description=(
- "Model id used for the escalation re-run "
- "(e.g. ``anthropic:claude-opus-4-7``). Overrides "
- "``FLYDOCS_ESCALATION_MODEL`` for this request."
- ),
- )
- transformations: list[Transformation] = Field(
- default_factory=list,
- description=(
- "Post-extraction transformations applied by the ``transform`` "
- "stage. See :mod:`flydocs.interfaces.dtos.transformation` "
- "for the discriminated union of available types. Empty list "
- "means the stage is a no-op even when ``stages.transform`` "
- "is true."
- ),
- )
+ escalation: EscalationConfig | None = None
+ transformations: list[Transformation] = Field(default_factory=list)
# ---------------------------------------------------------------------------
-# The request itself
+# Request
# ---------------------------------------------------------------------------
class ExtractionRequest(BaseModel):
"""One IDP extraction request.
- Every request carries ``documents`` (a non-empty list). A single
- file is just a one-element list; the pipeline never needs to
- branch on cardinality.
+ Every request carries a non-empty ``files`` list and a non-empty
+ ``document_types`` list. A single-file request is just a one-element
+ ``files`` list; the pipeline never branches on cardinality.
"""
- request_id: uuid.UUID = Field(default_factory=uuid.uuid4)
- intention: str = Field(
- default="Extract structured data from the document.",
- description="Free-form prompt that nuances every node's behaviour (search, judge, rules).",
- )
- documents: list[DocumentInput] = Field(
- ...,
- min_length=1,
- description=(
- "Input files. Each file is processed independently. A file "
- "may optionally pin its target type via ``document_type``; "
- "otherwise the classifier stage matches it to one of the "
- "declared ``docs`` entries."
- ),
- )
- docs: list[DocSpec] = Field(..., min_length=1)
+ model_config = ConfigDict(extra="forbid")
+
+ intention: str = "Extract structured data from the document."
+ files: list[FileInput] = Field(..., min_length=1)
+ document_types: list[DocumentTypeSpec] = Field(..., min_length=1)
rules: list[RuleSpec] = Field(default_factory=list)
options: ExtractionOptions = Field(default_factory=ExtractionOptions)
@@ -228,98 +154,92 @@ class ExtractionRequest(BaseModel):
class ClassificationInfo(BaseModel):
- """Per-file classifier verdict surfaced in the response.
+ """Per-file classifier verdict surfaced in the response."""
- Populated only when the classifier ran on this file (multi-file
- request with no caller pin and ``stages.classifier`` enabled).
- ``matched=False`` means the file did not fit any declared
- ``DocSpec`` -- the file ends up in ``additional_documents`` with
- ``document_type='unmatched'``.
- """
+ model_config = ConfigDict(extra="forbid")
document_type: str
matched: bool = True
confidence: float = Field(default=0.0, ge=0.0, le=1.0)
- description: str = ""
- notes: str = ""
+ description: str | None = None
+ notes: str | None = None
-class DocumentInfo(BaseModel):
- """Summary of one input file. One entry per submitted document."""
+class FileSummary(BaseModel):
+ """Summary of one input file."""
+
+ model_config = ConfigDict(extra="forbid")
filename: str
media_type: str
page_count: int
bytes: int
- document_type: str | None = Field(
- default=None,
- description=(
- "Final document type assigned to this file: the caller's pin "
- "when one was given, the classifier's verdict otherwise. "
- "``null`` when neither the caller nor the classifier could "
- "settle on a type."
- ),
- )
- classification: ClassificationInfo | None = Field(
+ matched_type: str | None = Field(
default=None,
description=(
- "Classifier output for this file. ``null`` when the caller "
- "pinned a ``document_type`` (classifier was skipped) or when "
- "the classifier stage was disabled."
+ "Final document type assigned to this file: the caller's "
+ "``expected_type`` when one was given, the classifier's verdict "
+ "otherwise. Null when neither resolved."
),
)
+ classification: ClassificationInfo | None = None
-class ExtractedDocument(BaseModel):
- """Result for one document instance (one DocSpec resolved on one file)."""
+class Document(BaseModel):
+ """Result for one extracted document instance."""
- document_type: str
+ model_config = ConfigDict(extra="forbid")
+
+ type: str
+ source_file: str | None = None
missing: bool = False
pages: list[int] = Field(default_factory=list)
- description: str = ""
confidence: float = Field(default=0.0, ge=0.0, le=1.0)
- fields: list[ExtractedFieldGroup] = Field(default_factory=list)
- authenticity: DocumentAuthenticity = Field(default_factory=DocumentAuthenticity)
+ description: str | None = None
notes: str | None = None
- source_file: str | None = Field(
- default=None,
- description="Filename of the input file this extracted document came from.",
- )
+ field_groups: list[ExtractedFieldGroup] = Field(default_factory=list)
+ authenticity: DocumentAuthenticity = Field(default_factory=DocumentAuthenticity)
+
+
+class TraceEntry(BaseModel):
+ """One node's execution in the pipeline DAG."""
+
+ model_config = ConfigDict(extra="forbid")
+
+ node: str
+ started_at: datetime
+ completed_at: datetime
+ latency_ms: float
+ status: Literal["success", "failed", "skipped"]
+
+
+class PipelineError(BaseModel):
+ """Non-fatal per-node failure surfaced in the response."""
+
+ model_config = ConfigDict(extra="forbid")
+
+ node: str
+ code: str
+ message: str
class EscalationInfo(BaseModel):
- """Audit block for the judge-driven escalation re-run.
+ """Audit block for the judge-driven escalation re-run."""
- Populated only when ``stages.judge_escalation`` is enabled and the
- judge's first pass exceeded the configured failure threshold.
- """
+ model_config = ConfigDict(extra="forbid")
triggered: bool = False
primary_model: str | None = None
escalation_model: str | None = None
primary_fail_rate: float = Field(default=0.0, ge=0.0, le=1.0)
escalation_fail_rate: float = Field(default=0.0, ge=0.0, le=1.0)
- accepted: bool = Field(
- default=False,
- description=(
- "True when the escalation re-run produced fewer judge "
- "failures than the primary and was kept as the response."
- ),
- )
+ accepted: bool = False
class UsageBreakdown(BaseModel):
- """Aggregated token usage and cost across every LLM call of one request.
+ """Aggregated token usage and cost across every LLM call of one request."""
- Populated by the orchestrator from the framework's per-call
- :class:`UsageRecord`s, scoped to the request via ``correlation_id``.
- Mirrors :class:`fireflyframework_agentic.observability.UsageSummary`.
-
- ``by_agent`` keys are the internal agent names (e.g.
- ``flydocs-extractor``, ``flydocs-classifier``,
- ``flydocs-splitter``). ``by_model`` keys are the fully-qualified
- model ids (e.g. ``anthropic:claude-opus-4-7``).
- """
+ model_config = ConfigDict(extra="forbid")
total_input_tokens: int = 0
total_output_tokens: int = 0
@@ -334,72 +254,29 @@ class UsageBreakdown(BaseModel):
by_model: dict[str, dict[str, Any]] = Field(default_factory=dict)
-class TraceEntry(BaseModel):
- """One node's execution in the pipeline DAG."""
+class PipelineMeta(BaseModel):
+ """Pipeline-level instrumentation metadata for one extraction."""
- node: str
- started_at: datetime
- completed_at: datetime
- latency_ms: float
- status: str = Field(description="``success`` | ``failed`` | ``skipped``.")
+ model_config = ConfigDict(extra="forbid")
+
+ model: str
+ latency_ms: int = Field(..., ge=0)
+ trace: list[TraceEntry] = Field(default_factory=list)
+ errors: list[PipelineError] = Field(default_factory=list)
+ escalation: EscalationInfo | None = None
+ usage: UsageBreakdown | None = None
class ExtractionResult(BaseModel):
- """Top-level response."""
+ """Top-level response shape (sync /extract, async /extractions/{id}/result)."""
- model_config = ConfigDict(populate_by_name=True)
+ model_config = ConfigDict(extra="forbid")
- request_id: uuid.UUID
- files: list[DocumentInfo] = Field(
- default_factory=list,
- description="Per-file summary for every input file the request carried.",
- )
- documents: list[ExtractedDocument] = Field(default_factory=list)
- additional_documents: list[ExtractedDocument] = Field(
- default_factory=list,
- description="Documents found in the source PDF that don't match any requested doc type.",
- )
+ id: str
+ status: Literal["success", "partial"] = "success"
+ files: list[FileSummary] = Field(default_factory=list)
+ documents: list[Document] = Field(default_factory=list)
+ discovered_documents: list[Document] = Field(default_factory=list)
rule_results: list[RuleResult] = Field(default_factory=list)
- request_transformations: list[ExtractedFieldGroup] = Field(
- default_factory=list,
- description=(
- "Output of every ``scope=request`` transformation applied "
- "by the ``transform`` stage. Each entry is a consolidated, "
- "post-transformation field group keyed by the "
- "``output_group`` name from the originating "
- ":class:`Transformation` (or the ``target_group`` when "
- "``output_group`` is null). Empty list when no "
- "request-scope transformation ran or when none of them "
- "produced output."
- ),
- )
- model: str
- latency_ms: int = Field(..., ge=0)
- pipeline_errors: list[dict[str, Any]] = Field(
- default_factory=list,
- description="Non-fatal per-node failures: ``[{code, message, node}]``.",
- )
- escalation: EscalationInfo | None = Field(
- default=None,
- description=(
- "Audit block populated when judge_escalation runs. ``null`` "
- "when escalation is disabled or didn't fire."
- ),
- )
- usage: UsageBreakdown | None = Field(
- default=None,
- description=(
- "Aggregated token usage and estimated USD cost across every "
- "LLM call this request made (extract, classifier, splitter, "
- "judge, visual, content, rules). ``null`` when cost tracking "
- "is disabled or no LLM calls fired."
- ),
- )
- trace: list[TraceEntry] = Field(
- default_factory=list,
- description=(
- "Per-stage execution trace as the orchestrator's DAG ran it. "
- "One entry per executed node with start/end timestamps, "
- "latency, and status."
- ),
- )
+ request_transformations: list[ExtractedFieldGroup] = Field(default_factory=list)
+ pipeline: PipelineMeta
diff --git a/src/flydocs/interfaces/dtos/extraction.py b/src/flydocs/interfaces/dtos/extraction.py
new file mode 100644
index 0000000..2c15278
--- /dev/null
+++ b/src/flydocs/interfaces/dtos/extraction.py
@@ -0,0 +1,114 @@
+# Copyright 2026 Firefly Software Solutions Inc
+"""DTOs for the async extraction lifecycle.
+
+Endpoints:
+
+* ``POST /api/v1/extractions`` -- submit
+* ``GET /api/v1/extractions`` -- list
+* ``GET /api/v1/extractions/{id}`` -- status
+* ``GET /api/v1/extractions/{id}/result`` -- final result envelope
+* ``DELETE /api/v1/extractions/{id}`` -- cancel (only while queued)
+
+The main lifecycle is linear: ``queued -> running -> succeeded | failed |
+cancelled``. Post-processing (bbox refinement today, more tomorrow) lives in
+the additive :class:`PostProcessing` block with its own
+:class:`PostProcessingStatus` lifecycle that does not gate the main status.
+"""
+
+from __future__ import annotations
+
+from datetime import datetime
+from typing import Any
+
+from pydantic import AnyHttpUrl, BaseModel, ConfigDict, Field
+
+from flydocs.interfaces.dtos.extract import ExtractionRequest, ExtractionResult
+from flydocs.interfaces.enums.extraction_status import ExtractionStatus, PostProcessingStatus
+
+
+class SubmitExtractionRequest(ExtractionRequest):
+ """Submit shape: full extraction request plus async-only fields."""
+
+ callback_url: AnyHttpUrl | None = Field(
+ default=None,
+ description="If set, the worker POSTs an EventEnvelope here on terminal status.",
+ )
+ metadata: dict[str, Any] = Field(default_factory=dict)
+
+
+class ExtractionError(BaseModel):
+ """Terminal-state error info for a failed extraction."""
+
+ model_config = ConfigDict(extra="forbid")
+
+ code: str
+ message: str
+
+
+class BboxRefinementInfo(BaseModel):
+ """Lifecycle info for the bbox-refinement post-processing leg."""
+
+ model_config = ConfigDict(extra="forbid")
+
+ status: PostProcessingStatus
+ started_at: datetime | None = None
+ finished_at: datetime | None = None
+ attempts: int = 0
+ error: ExtractionError | None = None
+
+
+class PostProcessing(BaseModel):
+ """Container for post-processing legs attached to a succeeded extraction."""
+
+ model_config = ConfigDict(extra="forbid")
+
+ bbox_refinement: BboxRefinementInfo | None = None
+
+
+class Extraction(BaseModel):
+ """Current state snapshot of an async extraction job."""
+
+ model_config = ConfigDict(extra="forbid")
+
+ id: str
+ status: ExtractionStatus
+ submitted_at: datetime
+ started_at: datetime | None = None
+ finished_at: datetime | None = None
+ attempts: int = 0
+ error: ExtractionError | None = None
+ post_processing: PostProcessing | None = None
+
+
+class ExtractionResultEnvelope(BaseModel):
+ """``GET /extractions/{id}/result`` body."""
+
+ model_config = ConfigDict(extra="forbid")
+
+ id: str
+ result: ExtractionResult
+
+
+class ExtractionListQuery(BaseModel):
+ """Query parameters for ``GET /api/v1/extractions``."""
+
+ model_config = ConfigDict(extra="forbid")
+
+ statuses: list[ExtractionStatus] = Field(default_factory=list)
+ post_processing_statuses: list[PostProcessingStatus] = Field(default_factory=list)
+ created_after: datetime | None = None
+ created_before: datetime | None = None
+ idempotency_key: str | None = None
+ limit: int = Field(default=50, ge=1, le=500)
+ offset: int = Field(default=0, ge=0)
+
+
+class ExtractionListResponse(BaseModel):
+ """Paginated list response."""
+
+ model_config = ConfigDict(extra="forbid")
+
+ items: list[Extraction]
+ total: int
+ limit: int
+ offset: int
diff --git a/src/flydocs/interfaces/dtos/field.py b/src/flydocs/interfaces/dtos/field.py
index b6afac1..763b7fe 100644
--- a/src/flydocs/interfaces/dtos/field.py
+++ b/src/flydocs/interfaces/dtos/field.py
@@ -1,127 +1,134 @@
# Copyright 2026 Firefly Software Solutions Inc
"""Field-level DTOs -- schema in, extraction out.
-The request side groups fields under a named :class:`FieldGroup` (for
-example ``personal``, ``billing``) and supports JSON-Schema-style
-constraints plus an extensible :class:`StandardValidatorSpec` list per
-field. The response side carries the parallel :class:`ExtractedField`
-structure with confidence, page, bounding box, judge verdict, and
-field-validation result. Array fields (repeating rows of sub-fields)
-are supported recursively.
+One recursive :class:`Field` handles primitives, arrays, and nested
+objects. Arrays require ``items`` (a single ``Field`` describing the row
+shape, typically of type ``object``); objects require ``fields`` (a
+non-empty list of ``Field`` members); primitives forbid both.
+
+The response side carries :class:`ExtractedField` with the same recursion.
"""
from __future__ import annotations
from typing import Any
-from pydantic import BaseModel, ConfigDict, Field, model_validator
+from pydantic import BaseModel, ConfigDict, model_validator
+from pydantic import Field as _PydField
from flydocs.interfaces.dtos.bbox import BoundingBox
-from flydocs.interfaces.dtos.standard_validator import StandardValidatorSpec
+from flydocs.interfaces.dtos.validator import ValidatorSpec
from flydocs.interfaces.enums.field_type import FieldType, StandardFormat
from flydocs.interfaces.enums.status import JudgeStatus, ValidationRule
# ---------------------------------------------------------------------------
-# REQUEST side -- the schema the caller submits
+# REQUEST SIDE -- the schema the caller submits
# ---------------------------------------------------------------------------
-class FieldItem(BaseModel):
- """One sub-field inside an array field (e.g. a column of a line-items table)."""
-
- fieldName: str = Field(..., min_length=1)
- fieldDescription: str = ""
- fieldType: FieldType = FieldType.STRING
+class Field(BaseModel):
+ """One field in a schema. Recursive for arrays and objects."""
- pattern: str | None = None
- format: StandardFormat | None = None
- enum: list[Any] | None = None
- minimum: float | None = None
- maximum: float | None = None
- standard_validators: list[StandardValidatorSpec] = Field(default_factory=list)
+ model_config = ConfigDict(extra="forbid")
-
-class FieldSpec(BaseModel):
- """One field the caller wants extracted.
-
- For ``fieldType == array`` the ``items`` list describes the columns
- of every repeating row. For primitive types ``items`` must be
- empty / null.
- """
-
- model_config = ConfigDict(populate_by_name=True)
-
- fieldName: str = Field(..., min_length=1, alias="name")
- fieldDescription: str = Field(default="", alias="description")
- fieldType: FieldType = Field(default=FieldType.STRING, alias="type")
+ name: str = _PydField(..., min_length=1)
+ description: str | None = None
+ type: FieldType = FieldType.STRING
required: bool = False
-
pattern: str | None = None
format: StandardFormat | None = None
enum: list[Any] | None = None
minimum: float | None = None
maximum: float | None = None
- items: list[FieldItem] | None = None
- standard_validators: list[StandardValidatorSpec] = Field(default_factory=list)
+ items: Field | None = None
+ fields: list[Field] | None = None
+ validators: list[ValidatorSpec] = _PydField(default_factory=list)
@model_validator(mode="after")
- def _check_constraints(self) -> FieldSpec:
+ def _check_constraints(self) -> Field:
if self.minimum is not None and self.maximum is not None and self.minimum > self.maximum:
raise ValueError("minimum must be <= maximum")
- if self.fieldType != FieldType.ARRAY and self.items:
- raise ValueError("items is only valid when fieldType is array")
+
+ if self.type == FieldType.ARRAY:
+ if self.items is None:
+ raise ValueError("type 'array' requires items")
+ if self.fields is not None:
+ raise ValueError("type 'array' must not set fields")
+ elif self.type == FieldType.OBJECT:
+ if not self.fields:
+ raise ValueError("type 'object' requires fields (non-empty list)")
+ if self.items is not None:
+ raise ValueError("type 'object' must not set items")
+ else:
+ if self.items is not None:
+ raise ValueError(f"type '{self.type.value}' must not set items")
+ if self.fields is not None:
+ raise ValueError(f"type '{self.type.value}' must not set fields")
return self
+Field.model_rebuild()
+
+
class FieldGroup(BaseModel):
- fieldGroupName: str = Field(..., min_length=1)
- fieldGroupDesc: str = ""
- fieldGroupFields: list[FieldSpec] = Field(..., min_length=1)
+ """A named bundle of fields the service extracts together."""
+
+ model_config = ConfigDict(extra="forbid")
+
+ name: str = _PydField(..., min_length=1)
+ description: str | None = None
+ fields: list[Field] = _PydField(..., min_length=1)
# ---------------------------------------------------------------------------
-# RESPONSE side -- the structure returned alongside each extracted value
+# RESPONSE SIDE -- structure returned alongside each extracted value
# ---------------------------------------------------------------------------
class FieldValidationError(BaseModel):
+ model_config = ConfigDict(extra="forbid")
+
rule: ValidationRule
message: str
class FieldValidation(BaseModel):
+ model_config = ConfigDict(extra="forbid")
+
valid: bool = True
- errors: list[FieldValidationError] = Field(default_factory=list)
+ errors: list[FieldValidationError] = _PydField(default_factory=list)
class JudgeOutcome(BaseModel):
+ model_config = ConfigDict(extra="forbid")
+
status: JudgeStatus = JudgeStatus.UNCERTAIN
- confidence: float = Field(default=0.0, ge=0.0, le=1.0)
- evidence: str = ""
- notes: str = ""
+ confidence: float = _PydField(default=0.0, ge=0.0, le=1.0)
+ evidence: str | None = None
+ notes: str | None = None
flag_for_review: bool = False
class ExtractedField(BaseModel):
- """One extracted field. Recursive: array fields contain rows of sub-fields."""
+ """One extracted field. Recursive for arrays and objects."""
- model_config = ConfigDict(populate_by_name=True)
+ model_config = ConfigDict(extra="forbid")
- fieldName: str = Field(..., alias="name")
- fieldValueFound: str | int | float | bool | list[ExtractedField] | None = Field(
- default=None, alias="value"
- )
- pagesFound: list[int] = Field(default_factory=list)
- confidence: float = Field(default=0.0, ge=0.0, le=1.0)
- bbox: BoundingBox = Field(default_factory=BoundingBox.empty)
+ name: str
+ value: str | int | float | bool | list[ExtractedField] | None = None
+ pages: list[int] = _PydField(default_factory=list)
+ confidence: float = _PydField(default=0.0, ge=0.0, le=1.0)
+ bbox: BoundingBox | None = None
+ validation: FieldValidation = _PydField(default_factory=FieldValidation)
+ judge: JudgeOutcome = _PydField(default_factory=JudgeOutcome)
notes: str | None = None
- judge: JudgeOutcome = Field(default_factory=JudgeOutcome)
- field_validation: FieldValidation = Field(default_factory=FieldValidation)
-class ExtractedFieldGroup(BaseModel):
- fieldGroupName: str
- fieldGroupFields: list[ExtractedField]
+ExtractedField.model_rebuild()
-ExtractedField.model_rebuild()
+class ExtractedFieldGroup(BaseModel):
+ model_config = ConfigDict(extra="forbid")
+
+ name: str
+ fields: list[ExtractedField]
diff --git a/src/flydocs/interfaces/dtos/job.py b/src/flydocs/interfaces/dtos/job.py
deleted file mode 100644
index 4e164e1..0000000
--- a/src/flydocs/interfaces/dtos/job.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Copyright 2026 Firefly Software Solutions Inc
-"""DTOs for the async (queue-backed) API."""
-
-from __future__ import annotations
-
-from datetime import datetime
-from typing import Any
-
-from pydantic import AnyHttpUrl, BaseModel, Field
-
-from flydocs.interfaces.dtos.doc import DocSpec
-from flydocs.interfaces.dtos.extract import DocumentInput, ExtractionOptions, ExtractionResult
-from flydocs.interfaces.dtos.rule import RuleSpec
-from flydocs.interfaces.enums.job_status import JobStatus
-
-
-class SubmitJobRequest(BaseModel):
- """Async-job submit payload.
-
- Mirrors :class:`flydocs.interfaces.dtos.extract.ExtractionRequest`:
- every submission carries a non-empty ``documents`` list. A single
- file is just a one-element list — the worker pipeline never branches
- on cardinality.
- """
-
- intention: str = "Extract structured data from the document."
- documents: list[DocumentInput] = Field(
- ...,
- min_length=1,
- description="Input files. Each entry is processed independently by the pipeline.",
- )
- docs: list[DocSpec] = Field(..., min_length=1)
- rules: list[RuleSpec] = Field(default_factory=list)
- options: ExtractionOptions = Field(default_factory=ExtractionOptions)
- callback_url: AnyHttpUrl | None = Field(
- default=None,
- description="If set, the worker POSTs a JobWebhookPayload here on terminal status.",
- )
- metadata: dict[str, Any] = Field(default_factory=dict)
-
-
-class SubmitJobResponse(BaseModel):
- job_id: str
- status: JobStatus
- submitted_at: datetime
-
-
-class JobStatusResponse(BaseModel):
- job_id: str
- status: JobStatus
- submitted_at: datetime
- started_at: datetime | None = None
- finished_at: datetime | None = None
- attempts: int = 0
- error_code: str | None = None
- error_message: str | None = None
- bbox_refine_status: str | None = Field(
- default=None,
- description=(
- "Sub-state of the bbox-refine leg when ``options.stages.bbox_refine`` "
- "was enabled at submit time. One of ``pending`` (event published, "
- "worker hasn't picked it up), ``running``, ``succeeded``, ``failed``. "
- "``null`` when the job didn't ask for refinement."
- ),
- )
- bbox_refine_attempts: int = 0
- bbox_refine_started_at: datetime | None = None
- bbox_refine_finished_at: datetime | None = None
- bbox_refine_error_code: str | None = None
- bbox_refine_error_message: str | None = None
-
-
-class JobResult(BaseModel):
- job_id: str
- result: ExtractionResult
-
-
-class JobListQuery(BaseModel):
- """Query parameters for ``GET /api/v1/jobs``.
-
- All filters are optional and combine with ``AND``. ``statuses`` and
- ``bbox_refine_statuses`` are repeated query params (e.g.
- ``?status=SUCCEEDED&status=PARTIAL_SUCCEEDED``). ``created_after`` /
- ``created_before`` are RFC 3339 timestamps inclusive on both ends.
- """
-
- statuses: list[JobStatus] = Field(default_factory=list)
- bbox_refine_statuses: list[str] = Field(default_factory=list)
- created_after: datetime | None = None
- created_before: datetime | None = None
- idempotency_key: str | None = None
- limit: int = Field(default=50, ge=1, le=500)
- offset: int = Field(default=0, ge=0)
-
-
-class JobListResponse(BaseModel):
- """Paginated list of jobs."""
-
- items: list[JobStatusResponse]
- total: int
- limit: int
- offset: int
diff --git a/src/flydocs/interfaces/dtos/rule.py b/src/flydocs/interfaces/dtos/rule.py
index c696134..e90da50 100644
--- a/src/flydocs/interfaces/dtos/rule.py
+++ b/src/flydocs/interfaces/dtos/rule.py
@@ -2,59 +2,65 @@
"""Business-rule DTOs.
Rules express boolean / categorical decisions over extracted fields,
-validator outcomes, and other rules' results. They form a DAG: a rule
-that depends on another rule's output is evaluated *after* the parent.
-Cycles are rejected at request validation time by :class:`RuleEngine`.
+validator outcomes, and other rules' results. They form a DAG; cycles are
+rejected at request validation time by ``RequestValidator`` /
+``RuleEngine``.
+
+The :class:`RuleParent` discriminator is ``kind`` (not ``type``) to avoid
+collision with :attr:`Field.type` / :attr:`RuleOutputSpec.type` when
+walking a request by literal key name.
"""
from __future__ import annotations
from typing import Annotated, Literal
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, ConfigDict, Field
+
+
+class _BaseParent(BaseModel):
+ model_config = ConfigDict(extra="forbid")
-class RuleFieldParent(BaseModel):
- parentType: Literal["field"] = "field"
- documentType: str
- fieldNames: list[str] = Field(..., min_length=1)
+class RuleFieldParent(_BaseParent):
+ kind: Literal["field"] = "field"
+ document_type: str
+ fields: list[str] = Field(..., min_length=1)
-class RuleValidatorParent(BaseModel):
- parentType: Literal["validator"] = "validator"
- documentType: str
- validatorName: str
+class RuleValidatorParent(_BaseParent):
+ kind: Literal["validator"] = "validator"
+ document_type: str
+ validator: str
-class RuleRuleParent(BaseModel):
- parentType: Literal["rule"] = "rule"
- ruleId: str
+class RuleRuleParent(_BaseParent):
+ kind: Literal["rule"] = "rule"
+ rule: str
RuleParent = Annotated[
RuleFieldParent | RuleValidatorParent | RuleRuleParent,
- Field(discriminator="parentType"),
+ Field(discriminator="kind"),
]
class RuleOutputSpec(BaseModel):
"""How the rule's output is interpreted."""
- type: str = Field(default="boolean", description="``boolean``, ``string``, or ``number``.")
- valid_outputs: list[str] | None = Field(
- default=None,
- description=(
- "Optional closed set of valid output strings. The rule engine "
- "treats anything outside this set as ``flag_for_review``."
- ),
- )
+ model_config = ConfigDict(extra="forbid")
+
+ type: str = Field(default="boolean", description="'boolean' | 'string' | 'number'.")
+ valid_outputs: list[str] | None = None
class RuleSpec(BaseModel):
"""One business rule."""
+ model_config = ConfigDict(extra="forbid")
+
id: str = Field(..., min_length=1)
- predicate: str = Field(..., min_length=1, description="Natural-language predicate evaluated by the LLM.")
+ predicate: str = Field(..., min_length=1)
parents: list[RuleParent] = Field(default_factory=list)
output: RuleOutputSpec = Field(default_factory=RuleOutputSpec)
@@ -62,11 +68,11 @@ class RuleSpec(BaseModel):
class RuleResult(BaseModel):
"""Per-rule outcome returned in the response."""
+ model_config = ConfigDict(extra="forbid")
+
rule_id: str
predicate: str
- output: str = Field(
- default="", description="The resolved output value (string form -- ``true``/``false``/...)."
- )
- summary: str = ""
+ output: str = ""
+ summary: str | None = None
notes: list[str] = Field(default_factory=list)
- human_revision: str = Field(default="", description="Instructions for a human reviewer if needed.")
+ human_revision: str | None = None
diff --git a/src/flydocs/interfaces/dtos/standard_validator.py b/src/flydocs/interfaces/dtos/standard_validator.py
deleted file mode 100644
index b87940c..0000000
--- a/src/flydocs/interfaces/dtos/standard_validator.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright 2026 Firefly Software Solutions Inc
-"""``StandardValidatorSpec`` -- request-side declaration for one built-in check."""
-
-from __future__ import annotations
-
-from typing import Any, Literal
-
-from pydantic import BaseModel, Field
-
-from flydocs.interfaces.enums.standard_validator import StandardValidatorType
-
-
-class StandardValidatorSpec(BaseModel):
- """One named built-in validator applied to a field.
-
- Examples::
-
- {"type": "iban"} # generic
- {"type": "phone_e164", "params": {"country": "ES"}}
- {"type": "vat_id", "params": {"country": "ES"}, "severity": "warning"}
-
- ``severity`` distinguishes hard errors (``error`` -- field is
- ``valid=false``) from soft warnings (``warning`` -- error is
- recorded but the field stays ``valid=true``). ``error`` is the
- default.
- """
-
- type: StandardValidatorType
- params: dict[str, Any] = Field(default_factory=dict)
- severity: Literal["error", "warning"] = "error"
diff --git a/src/flydocs/interfaces/dtos/transformation.py b/src/flydocs/interfaces/dtos/transformation.py
index 5099a9b..334aaa3 100644
--- a/src/flydocs/interfaces/dtos/transformation.py
+++ b/src/flydocs/interfaces/dtos/transformation.py
@@ -2,26 +2,21 @@
"""Public DTOs for the ``transform`` pipeline stage.
The transformation stage runs **after** every other LLM stage (extract,
-judge, judge_escalation) and **before** ``rules`` / ``assemble``. It
-lets callers express *post-extraction* logic without pushing it into
-their own application code.
+judge, judge_escalation) and **before** rules / assemble. It lets callers
+express post-extraction logic without pushing it into their own application
+code.
Two transformation types ship in-tree:
* :class:`EntityResolutionTransformation` -- declarative, free,
- millisecond-scale. Deduplicates rows of an array field group across
- documents using accent-fold + token-subset matching. The
- ``bastanteo-poderes-poc`` previously did this work outside the
- service; this stage subsumes it.
+ millisecond-scale. Deduplicates rows of an array/object field group
+ across documents using accent-fold + token-subset matching.
* :class:`LlmTransformation` -- free-form. Caller supplies an
``intention`` (a one-sentence goal in any language) and the engine
runs a focused LLM call against the target group, returning a
transformed list of rows in the same shape.
-Both types are dispatched by
-:class:`TransformationEngine` based on the discriminator ``type``.
-Future declarative types (format normalisation, aggregation, role
-mapping...) add to the discriminated union without changing the API.
+The discriminator is ``type``. New declarative types extend the union.
"""
from __future__ import annotations
@@ -37,130 +32,34 @@ class TransformationScope(StrEnum):
"""Whether a transformation applies per-document or across the whole request."""
TASK = "task"
- """The transformation runs once per ``(segment, DocSpec)`` task and
- mutates that task's extracted groups in place. This is the right
- scope for transformations that only consider one document at a
- time (format normalisation, single-doc dedup)."""
-
REQUEST = "request"
- """Groups with the matching ``fieldGroupName`` are concatenated
- across every task, the transformation runs once over the
- consolidated rows, and the result is emitted as a top-level
- ``request_transformations`` entry on :class:`ExtractionResult`.
- Use this for cross-document entity resolution: the same person
- appearing in multiple deeds collapses into a single canonical row."""
class _BaseTransformation(BaseModel):
- """Common fields every transformation carries."""
-
- model_config = ConfigDict(populate_by_name=True)
+ model_config = ConfigDict(extra="forbid")
id: str = Field(default_factory=lambda: str(uuid.uuid4()))
- target_group: str = Field(
- ...,
- description=(
- "``fieldGroupName`` the transformation operates on (for "
- "example ``personas``, ``line_items``). Must match a group "
- "that the extractor produces; the stage is a no-op if no "
- "matching group is found in the task."
- ),
- )
- output_group: str | None = Field(
- default=None,
- description=(
- "Optional new group name. When set, the original group is "
- "left untouched and the transformation output is appended "
- "as a new group; useful when you want both the raw and the "
- "transformed view in the response. When ``None`` (default), "
- "the target group is replaced in place."
- ),
- )
+ target_group: str = Field(..., min_length=1)
+ output_group: str | None = None
scope: TransformationScope = TransformationScope.TASK
class EntityResolutionTransformation(_BaseTransformation):
- """Deterministic deduplication of an array field group's rows.
-
- The matcher operates in two phases:
-
- 1. **DNI-first**: rows whose normalised DNI (NFKD-fold, alnum only,
- upper) collide are merged unconditionally.
- 2. **Name-variant**: for rows without DNI, names are NFKD-folded
- and tokenised; two rows match when one token set is a subset
- of the other and they share at least ``min_shared_tokens``
- tokens. This handles ``"Andrés Contreras"`` vs
- ``"Andres Contreras Guillen"`` without merging unrelated
- people who happen to share a single first name.
-
- When rows are merged, the canonical row is built by picking the
- most complete value for each field (longest token-count for names,
- first non-empty for everything else).
- """
+ """Deterministic deduplication of an array field group's rows."""
type: Literal["entity_resolution"] = "entity_resolution"
- match_by: list[str] = Field(
- ...,
- min_length=1,
- description=(
- "Field names to consider for matching, in priority order. "
- "The first field in the list whose values are non-empty on "
- "both rows acts as the key. Typical: "
- "``['dni', 'nombre']`` -- match by DNI first, then by name "
- "for rows that lack DNI."
- ),
- )
- min_shared_tokens: int = Field(
- default=2,
- ge=1,
- description=(
- "Minimum shared name tokens required for a name-variant "
- "match. A single-token name (e.g. just ``Andrés``) is "
- "rarely unique enough to merge two rows; 2 is a safe "
- "default that bridges accent / surname variants without "
- "collapsing strangers."
- ),
- )
+ match_by: list[str] = Field(..., min_length=1)
+ min_shared_tokens: int = Field(default=2, ge=1)
class LlmTransformation(_BaseTransformation):
- """Free-form LLM transformation of an array field group's rows.
+ """Free-form LLM transformation of an array field group's rows."""
- Use this for anything the declarative types can't do: role
- classification, summarisation, free-text normalisation, language
- translation, schema migration between extraction passes, etc.
+ type: Literal["llm"] = "llm"
+ intention: str = Field(..., min_length=10)
+ prompt_id: str | None = None
- The transformer serialises the target group's rows to JSON, hands
- them to a focused LLM call with the caller's ``intention``, and
- expects the LLM to return a list of rows in the same shape. The
- response replaces (or, with ``output_group``, augments) the
- original group.
- """
- type: Literal["llm"] = "llm"
- intention: str = Field(
- ...,
- min_length=10,
- description=(
- "One-sentence goal in any language. Example: "
- '``"Normaliza cada cargo a una taxonomía cerrada: '
- '{administrador_unico, consejero, apoderado, otros}"``.'
- ),
- )
- prompt_id: str | None = Field(
- default=None,
- description=(
- "Optional named prompt template id from the catalog. When "
- "omitted, the default transform prompt is used and the "
- "``intention`` is interpolated into it."
- ),
- )
-
-
-# Pydantic discriminated union. Adding a new declarative type later
-# (e.g. ``FormatNormalisationTransformation``) is a single-line union
-# extension here plus a new branch in
-# :class:`TransformationEngine.apply`.
Transformation = Annotated[
EntityResolutionTransformation | LlmTransformation,
Field(discriminator="type"),
diff --git a/src/flydocs/interfaces/dtos/validator.py b/src/flydocs/interfaces/dtos/validator.py
new file mode 100644
index 0000000..e53259a
--- /dev/null
+++ b/src/flydocs/interfaces/dtos/validator.py
@@ -0,0 +1,36 @@
+# Copyright 2026 Firefly Software Solutions Inc
+"""ValidatorSpec -- request-side declaration for one built-in check.
+
+Replaces the v0 ``StandardValidatorSpec``. ``name`` is the dispatch key
+(was ``type`` in v0 — renamed to avoid collision with :attr:`Field.type`
+when both appear in the same parent envelope).
+"""
+
+from __future__ import annotations
+
+from typing import Any, Literal
+
+from pydantic import BaseModel, ConfigDict, Field
+
+from flydocs.interfaces.enums.validator import ValidatorType
+
+
+class ValidatorSpec(BaseModel):
+    """One named built-in validator applied to a field.
+
+    Examples::
+
+        {"name": "iban"}
+        {"name": "phone_e164", "params": {"country": "ES"}}
+        {"name": "vat_id", "params": {"country": "ES"}, "severity": "warning"}
+
+    ``severity`` distinguishes hard errors (``error`` -- field is
+    ``valid=false``) from soft warnings (``warning`` -- error is recorded
+    but the field stays ``valid=true``). ``error`` is the default.
+    """
+
+    model_config = ConfigDict(extra="forbid")  # unknown keys fail validation
+
+    name: ValidatorType  # dispatch key: which built-in check to run
+    params: dict[str, Any] = Field(default_factory=dict)  # per-check options, e.g. {"country": "ES"}
+    severity: Literal["error", "warning"] = "error"  # "warning" keeps the field valid
diff --git a/src/flydocs/interfaces/dtos/webhook.py b/src/flydocs/interfaces/dtos/webhook.py
deleted file mode 100644
index 3a32b8e..0000000
--- a/src/flydocs/interfaces/dtos/webhook.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# Copyright 2026 Firefly Software Solutions Inc
-"""Outbound webhook payload delivered to ``callback_url``.
-
-Sent by :class:`flydocs.core.services.webhook.webhook_publisher.WebhookPublisher`
-when an async job reaches a terminal state. Signed with HMAC-SHA256 in
-the ``X-Flydocs-Signature`` header when ``FLYDOCS_WEBHOOK_HMAC_SECRET``
-is configured.
-"""
-
-from __future__ import annotations
-
-import uuid
-from datetime import datetime
-from typing import Any
-
-from pydantic import BaseModel, Field
-
-from flydocs.interfaces.dtos.extract import ExtractionResult
-from flydocs.interfaces.enums.job_status import JobStatus
-
-
-def _new_event_id() -> str:
- return str(uuid.uuid4())
-
-
-class JobWebhookPayload(BaseModel):
- """Webhook envelope mirroring the typed EDA events.
-
- Carries identity (``event_id``, ``job_id``), lifecycle
- (``occurred_at``, ``started_at``, ``finished_at``), correlation
- (``correlation_id``), and the terminal status. Consumers should
- dedupe by ``event_id`` since the webhook publisher retries on
- delivery failures.
- """
-
- event_id: str = Field(
- default_factory=_new_event_id,
- description="Unique UUID v4 for this webhook delivery. Use to dedupe on the client.",
- )
- event_type: str = Field(
- default="IDPJobCompleted",
- description="Mirrors the EDA event type that triggered this delivery.",
- )
- version: str = Field(default="1.0.0", description="Payload schema version (semver).")
- job_id: str
- status: JobStatus
- occurred_at: datetime
- started_at: datetime | None = Field(
- default=None,
- description="When the worker first picked the job up.",
- )
- finished_at: datetime | None = Field(
- default=None,
- description="Terminal-state timestamp; mirrors ``ExtractionJob.finished_at``.",
- )
- attempts: int = Field(default=1, ge=1, description="Worker attempts consumed.")
- correlation_id: str | None = Field(
- default=None,
- description="Request-level correlation id propagated through every stage.",
- )
- tenant_id: str | None = None
- metadata: dict[str, Any] = Field(default_factory=dict)
- result: ExtractionResult | None = Field(
- default=None,
- description="Populated when status is SUCCEEDED. Null on FAILED / CANCELLED.",
- )
- error_code: str | None = None
- error_message: str | None = None
diff --git a/src/flydocs/interfaces/enums/__init__.py b/src/flydocs/interfaces/enums/__init__.py
index ce0cfd0..9bde76c 100644
--- a/src/flydocs/interfaces/enums/__init__.py
+++ b/src/flydocs/interfaces/enums/__init__.py
@@ -1,23 +1,24 @@
# Copyright 2026 Firefly Software Solutions Inc
"""Enumerations referenced by public DTOs."""
+from flydocs.interfaces.enums.extraction_status import ExtractionStatus, PostProcessingStatus
from flydocs.interfaces.enums.field_type import FieldType, StandardFormat
-from flydocs.interfaces.enums.job_status import JobStatus
-from flydocs.interfaces.enums.standard_validator import StandardValidatorType
from flydocs.interfaces.enums.status import (
CheckStatus,
ContentIntegrityStatus,
JudgeStatus,
ValidationRule,
)
+from flydocs.interfaces.enums.validator import ValidatorType
__all__ = [
"CheckStatus",
"ContentIntegrityStatus",
+ "ExtractionStatus",
"FieldType",
- "JobStatus",
"JudgeStatus",
+ "PostProcessingStatus",
"StandardFormat",
- "StandardValidatorType",
"ValidationRule",
+ "ValidatorType",
]
diff --git a/src/flydocs/interfaces/enums/extraction_status.py b/src/flydocs/interfaces/enums/extraction_status.py
new file mode 100644
index 0000000..2ee5cf4
--- /dev/null
+++ b/src/flydocs/interfaces/enums/extraction_status.py
@@ -0,0 +1,53 @@
+# Copyright 2026 Firefly Software Solutions Inc
+"""Async extraction lifecycle states.
+
+One linear state machine: queued -> running -> succeeded | failed | cancelled.
+Post-processing (bbox refinement today, more tomorrow) lives in a separate
+block on the Extraction with its own PostProcessingStatus lifecycle.
+
+Replaces the legacy JobStatus / BboxRefineStatus pair from v0. The two-phase
+machine (PARTIAL_SUCCEEDED -> REFINING_BBOXES -> SUCCEEDED) is gone: a job
+reaches "succeeded" the moment the main pipeline finishes, and bbox
+refinement runs as an additive post-processing step that does not gate the
+main lifecycle.
+"""
+
+from __future__ import annotations
+
+from enum import StrEnum
+
+
+class ExtractionStatus(StrEnum):
+    QUEUED = "queued"  # entry state of the lifecycle
+    RUNNING = "running"  # main pipeline in progress
+    SUCCEEDED = "succeeded"  # main pipeline finished; post-processing does not gate this
+    FAILED = "failed"  # terminal
+    CANCELLED = "cancelled"  # terminal
+
+    @property
+    def is_terminal(self) -> bool:
+        """True when no further state transition is expected for this extraction."""
+        return self in (ExtractionStatus.SUCCEEDED, ExtractionStatus.FAILED, ExtractionStatus.CANCELLED)
+
+    @property
+    def has_result(self) -> bool:
+        """True when the extraction carries a readable ExtractionResult.
+
+        Only ``succeeded`` does; partial / refining states no longer exist in
+        v1 — refinement runs as additive post-processing on a fully-succeeded
+        result.
+        """
+        return self is ExtractionStatus.SUCCEEDED
+
+
+class PostProcessingStatus(StrEnum):
+    """Sub-state for additive post-processing legs (bbox refinement today)."""
+
+    PENDING = "pending"  # leg requested but not started yet
+    RUNNING = "running"  # leg in progress
+    SUCCEEDED = "succeeded"  # terminal
+    FAILED = "failed"  # terminal
+
+    @property
+    def is_terminal(self) -> bool:  # True once the leg succeeded or failed; no "cancelled" sub-state
+        return self in (PostProcessingStatus.SUCCEEDED, PostProcessingStatus.FAILED)
diff --git a/src/flydocs/interfaces/enums/field_type.py b/src/flydocs/interfaces/enums/field_type.py
index ed98c64..913e62b 100644
--- a/src/flydocs/interfaces/enums/field_type.py
+++ b/src/flydocs/interfaces/enums/field_type.py
@@ -7,18 +7,23 @@
class FieldType(StrEnum):
+ """JSON-Schema-aligned primitive set for the public Field model."""
+
STRING = "string"
NUMBER = "number"
INTEGER = "integer"
BOOLEAN = "boolean"
ARRAY = "array"
+ OBJECT = "object"
class StandardFormat(StrEnum):
- """JSON Schema-style standard formats applied at validation time."""
+ """Standard format hints applied to typed field values at validation time."""
DATE = "date"
DATE_TIME = "date-time"
+ TIME = "time"
EMAIL = "email"
URI = "uri"
UUID = "uuid"
+ CURRENCY = "currency"
diff --git a/src/flydocs/interfaces/enums/job_status.py b/src/flydocs/interfaces/enums/job_status.py
deleted file mode 100644
index 76c3191..0000000
--- a/src/flydocs/interfaces/enums/job_status.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# Copyright 2026 Firefly Software Solutions Inc
-"""Async-job lifecycle states.
-
-Two state machines live in parallel:
-
-**Default flow** (``options.stages.bbox_refine == false``)::
-
- QUEUED -> RUNNING -> SUCCEEDED | FAILED
- QUEUED -> CANCELLED (only while still QUEUED)
-
-**Bbox-refine flow** (``options.stages.bbox_refine == true``)::
-
- QUEUED -> RUNNING -> PARTIAL_SUCCEEDED -> REFINING_BBOXES -> SUCCEEDED
- \\-> (stays PARTIAL_SUCCEEDED if
- bbox refine fails -- the
- LLM-bbox result is still
- readable; bbox_refine_status
- column carries the failure)
-
-A job that has already started cannot be cancelled. ``PARTIAL_SUCCEEDED``
-results are queryable via ``GET /api/v1/jobs/{id}/result`` -- they carry
-the full extraction with LLM-estimated bboxes; the grounded bboxes land
-once the bbox refiner finishes and the status transitions to ``SUCCEEDED``.
-"""
-
-from __future__ import annotations
-
-from enum import StrEnum
-
-
-class JobStatus(StrEnum):
- QUEUED = "QUEUED"
- RUNNING = "RUNNING"
- PARTIAL_SUCCEEDED = "PARTIAL_SUCCEEDED"
- REFINING_BBOXES = "REFINING_BBOXES"
- SUCCEEDED = "SUCCEEDED"
- FAILED = "FAILED"
- CANCELLED = "CANCELLED"
-
- @property
- def is_terminal(self) -> bool:
- """True when no further state transition is expected for this job."""
- return self in (JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.CANCELLED)
-
- @property
- def has_result(self) -> bool:
- """True when the job carries a readable ExtractionResult.
-
- ``PARTIAL_SUCCEEDED`` and ``REFINING_BBOXES`` are readable too --
- the LLM-bbox version of the result is already persisted; bbox
- grounding is an additive overlay that lands later.
- """
- return self in (
- JobStatus.PARTIAL_SUCCEEDED,
- JobStatus.REFINING_BBOXES,
- JobStatus.SUCCEEDED,
- )
-
-
-class BboxRefineStatus(StrEnum):
- """Sub-state for the out-of-band bbox refinement leg.
-
- Populated only when ``options.stages.bbox_refine == true``. ``null``
- on the job row means the refiner was never requested (default flow).
- """
-
- PENDING = "pending" # event published, worker has not picked it up
- RUNNING = "running" # worker is grounding bboxes right now
- SUCCEEDED = "succeeded" # bboxes grounded, job is now ``SUCCEEDED``
- FAILED = "failed" # refiner failed; job stays ``PARTIAL_SUCCEEDED``
diff --git a/src/flydocs/interfaces/enums/standard_validator.py b/src/flydocs/interfaces/enums/standard_validator.py
deleted file mode 100644
index 4ec499d..0000000
--- a/src/flydocs/interfaces/enums/standard_validator.py
+++ /dev/null
@@ -1,64 +0,0 @@
-# Copyright 2026 Firefly Software Solutions Inc
-"""Built-in :class:`StandardValidator` types.
-
-Standard validators are pure-Python value checks that the
-:class:`FieldValidator` runs after extraction. They sit on top of the
-simple ``pattern`` / ``format`` / ``enum`` / ``min`` / ``max`` constraints
-and cover the high-value cases ("is this an IBAN?", "is this a valid
-Spanish NIE?", "is this a credit card number?") that callers would
-otherwise hand-roll.
-
-Adding a new validator: append a member here, then implement
-``_check_`` in :mod:`flydocs.core.services.validation.standard_validator_registry`.
-"""
-
-from __future__ import annotations
-
-from enum import StrEnum
-
-
-class StandardValidatorType(StrEnum):
- # --- network / web --------------------------------------------------
- EMAIL = "email"
- URI = "uri"
- IPV4 = "ipv4"
- IPV6 = "ipv6"
- DOMAIN = "domain"
- SLUG = "slug"
- URL = "url"
-
- # --- temporal --------------------------------------------------------
- DATE = "date"
- DATETIME = "datetime"
- TIME = "time"
- ISO_8601 = "iso_8601"
-
- # --- identifiers -----------------------------------------------------
- UUID = "uuid"
- JSON = "json"
- HEX_COLOR = "hex_color"
-
- # --- finance ---------------------------------------------------------
- IBAN = "iban" # ISO 13616
- BIC = "bic" # ISO 9362 (SWIFT)
- CREDIT_CARD = "credit_card" # Luhn check
- CURRENCY_CODE = "currency_code" # ISO 4217
- AMOUNT = "amount" # numeric > 0
-
- # --- telephony -------------------------------------------------------
- PHONE_E164 = "phone_e164" # ``+``
-
- # --- geographic ------------------------------------------------------
- COUNTRY_CODE = "country_code" # ISO 3166-1 alpha-2
- LANGUAGE_CODE = "language_code" # ISO 639-1
- POSTAL_CODE = "postal_code" # generic, country-aware
- LATITUDE = "latitude"
- LONGITUDE = "longitude"
-
- # --- national identifiers (require country param when ambiguous) ----
- NIF = "nif" # ES -- person tax id
- NIE = "nie" # ES -- foreign person tax id
- CIF = "cif" # ES -- legacy company tax id
- VAT_ID = "vat_id" # EU VAT number
- SSN = "ssn" # US SSN
- PASSPORT_NUMBER = "passport_number" # ICAO 9303 (length / charset only)
diff --git a/src/flydocs/interfaces/enums/status.py b/src/flydocs/interfaces/enums/status.py
index 1a4c4cb..a774e47 100644
--- a/src/flydocs/interfaces/enums/status.py
+++ b/src/flydocs/interfaces/enums/status.py
@@ -1,6 +1,9 @@
# Copyright 2026 Firefly Software Solutions Inc
-"""Status enums shared across nodes (validation rules, judge verdicts,
-content-authenticity verdicts)."""
+"""Status enums shared across pipeline nodes (validation rules, judge verdicts,
+content-authenticity verdicts).
+
+All values are lowercase snake_case to match the universal v1 enum convention.
+"""
from __future__ import annotations
@@ -16,26 +19,25 @@ class ValidationRule(StrEnum):
ENUM = "enum"
MINIMUM = "minimum"
MAXIMUM = "maximum"
- STANDARD = "standard"
+ VALIDATOR = "validator"
class JudgeStatus(StrEnum):
- PASS = "PASS"
- FAIL = "FAIL"
- UNCERTAIN = "UNCERTAIN"
+ PASS = "pass"
+ FAIL = "fail"
+ UNCERTAIN = "uncertain"
class ContentIntegrityStatus(StrEnum):
- VALID = "VALID"
- INVALID = "INVALID"
- UNCERTAIN = "UNCERTAIN"
+ VALID = "valid"
+ INVALID = "invalid"
+ UNCERTAIN = "uncertain"
class CheckStatus(StrEnum):
- PASS = "PASS"
- FAIL = "FAIL"
- UNCERTAIN = "UNCERTAIN"
+ PASS = "pass"
+ FAIL = "fail"
+ UNCERTAIN = "uncertain"
-# Re-export so importers can grab everything from .status
__all__ = ["CheckStatus", "ContentIntegrityStatus", "JudgeStatus", "ValidationRule"]
diff --git a/src/flydocs/interfaces/enums/validator.py b/src/flydocs/interfaces/enums/validator.py
new file mode 100644
index 0000000..75ad8a6
--- /dev/null
+++ b/src/flydocs/interfaces/enums/validator.py
@@ -0,0 +1,58 @@
+# Copyright 2026 Firefly Software Solutions Inc
+"""Built-in validator catalogue applied to extracted field values.
+
+Replaces the v0 ``StandardValidatorType`` — the "standard" prefix carried no
+semantic distinction since there is only one validator catalogue in the
+public API.
+"""
+
+from __future__ import annotations
+
+from enum import StrEnum
+
+
+class ValidatorType(StrEnum):
+    # Network / web
+    EMAIL = "email"
+    URI = "uri"
+    URL = "url"
+    DOMAIN = "domain"
+    SLUG = "slug"
+    IPV4 = "ipv4"
+    IPV6 = "ipv6"
+
+    # Temporal
+    DATE = "date"
+    DATETIME = "datetime"
+    TIME = "time"
+    ISO_8601 = "iso_8601"
+
+    # Identifiers
+    UUID = "uuid"
+    JSON = "json"
+    HEX_COLOR = "hex_color"
+
+    # Finance
+    IBAN = "iban"  # ISO 13616
+    BIC = "bic"  # ISO 9362 (SWIFT)
+    CREDIT_CARD = "credit_card"  # Luhn check in v0 -- confirm unchanged
+    CURRENCY_CODE = "currency_code"  # ISO 4217
+    AMOUNT = "amount"  # numeric > 0 in v0 -- confirm unchanged
+
+    # Telephony
+    PHONE_E164 = "phone_e164"  # E.164 international format
+
+    # Geographic
+    COUNTRY_CODE = "country_code"  # ISO 3166-1 alpha-2 (per v0)
+    LANGUAGE_CODE = "language_code"  # ISO 639-1 (per v0)
+    POSTAL_CODE = "postal_code"  # generic, country-aware in v0 -- confirm unchanged
+    LATITUDE = "latitude"
+    LONGITUDE = "longitude"
+
+    # National IDs (v0 required a ``country`` param when ambiguous -- confirm for v1)
+    NIF = "nif"  # ES -- person tax id
+    NIE = "nie"  # ES -- foreign person tax id
+    CIF = "cif"  # ES -- legacy company tax id
+    VAT_ID = "vat_id"  # EU VAT number
+    SSN = "ssn"  # US SSN
+    PASSPORT_NUMBER = "passport_number"  # ICAO 9303; length / charset only in v0 -- confirm
diff --git a/src/flydocs/models/__init__.py b/src/flydocs/models/__init__.py
index 352dfbd..2d57ff2 100644
--- a/src/flydocs/models/__init__.py
+++ b/src/flydocs/models/__init__.py
@@ -1,6 +1,6 @@
# Copyright 2026 Firefly Software Solutions Inc
"""Persistence layer -- SQLAlchemy entities + async repositories."""
-from flydocs.models.entities.extraction_job import Base, ExtractionJob
+from flydocs.models.entities.extraction import Base, Extraction
-__all__ = ["Base", "ExtractionJob"]
+__all__ = ["Base", "Extraction"]
diff --git a/src/flydocs/models/entities/__init__.py b/src/flydocs/models/entities/__init__.py
index ad13e23..ed255a0 100644
--- a/src/flydocs/models/entities/__init__.py
+++ b/src/flydocs/models/entities/__init__.py
@@ -1,6 +1,6 @@
# Copyright 2026 Firefly Software Solutions Inc
"""SQLAlchemy entity classes."""
-from flydocs.models.entities.extraction_job import Base, ExtractionJob
+from flydocs.models.entities.extraction import Base, Extraction
-__all__ = ["Base", "ExtractionJob"]
+__all__ = ["Base", "Extraction"]
diff --git a/src/flydocs/models/entities/extraction.py b/src/flydocs/models/entities/extraction.py
new file mode 100644
index 0000000..4cb95d6
--- /dev/null
+++ b/src/flydocs/models/entities/extraction.py
@@ -0,0 +1,93 @@
+# Copyright 2026 Firefly Software Solutions Inc
+"""``Extraction`` -- persistent state for asynchronous extraction jobs.
+
+Stores everything the worker needs to resume an extraction after a restart
+and everything callers can later query through
+``GET /api/v1/extractions/{id}``. We deliberately do NOT store the document
+bytes -- the payload is only held in memory during processing (the
+``schema_json`` column carries enough to re-render the request when
+needed).
+
+The v1 lifecycle collapses to ``queued | running | succeeded | failed |
+cancelled`` on the main status column. Post-processing (bbox refinement
+today) lives in columns prefixed ``post_processing_bbox_*`` -- still
+atomic at the SQL level, projected to a ``post_processing`` JSON object
+in the public DTO. Keeping them as columns lets ``UPDATE ... RETURNING``
+remain race-safe without needing ``jsonb_set`` games.
+"""
+
+from __future__ import annotations
+
+import uuid
+from datetime import datetime
+from typing import Any
+
+from sqlalchemy import JSON, CheckConstraint, DateTime, Index, Integer, String, Text, func
+from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
+
+
+class Base(DeclarativeBase):
+ """Declarative base shared by every entity in this service."""
+
+
+class Extraction(Base):
+    __tablename__ = "extractions"
+
+    id: Mapped[str] = mapped_column(  # "ext_" + first 26 hex chars of a UUID4, upper-cased
+        String(48),  # generated ids are 30 chars long; 48 leaves headroom
+        primary_key=True,
+        default=lambda: f"ext_{uuid.uuid4().hex[:26].upper()}",
+        doc="Public extraction id (prefixed: ``ext_<26-hex>``).",
+    )
+    idempotency_key: Mapped[str | None] = mapped_column(String(128), nullable=True)
+    status: Mapped[str] = mapped_column(String(16), nullable=False, index=True, default="queued")
+    filename: Mapped[str] = mapped_column(String(255), nullable=False)
+    content_sha256: Mapped[str] = mapped_column(String(64), nullable=False, index=True)
+    content_bytes: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
+
+    schema_json: Mapped[dict[str, Any]] = mapped_column(JSON, nullable=False)
+    options_json: Mapped[dict[str, Any]] = mapped_column(JSON, nullable=False, default=dict)
+    callback_url: Mapped[str | None] = mapped_column(Text, nullable=True)
+    metadata_json: Mapped[dict[str, Any]] = mapped_column(JSON, nullable=False, default=dict)
+
+    result_json: Mapped[dict[str, Any] | None] = mapped_column(JSON, nullable=True)
+    error_code: Mapped[str | None] = mapped_column(String(64), nullable=True)
+    error_message: Mapped[str | None] = mapped_column(Text, nullable=True)
+
+    attempts: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
+    submitted_at: Mapped[datetime] = mapped_column(  # filled in by the database at insert time
+        DateTime(timezone=True), nullable=False, server_default=func.now()
+    )
+    started_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
+    finished_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
+
+    # Post-processing: bbox refinement leg. ``null`` until/unless requested.
+    post_processing_bbox_status: Mapped[str | None] = mapped_column(String(16), nullable=True)
+    post_processing_bbox_attempts: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
+    post_processing_bbox_started_at: Mapped[datetime | None] = mapped_column(
+        DateTime(timezone=True), nullable=True
+    )
+    post_processing_bbox_finished_at: Mapped[datetime | None] = mapped_column(
+        DateTime(timezone=True), nullable=True
+    )
+    post_processing_bbox_error_code: Mapped[str | None] = mapped_column(String(64), nullable=True)
+    post_processing_bbox_error_message: Mapped[str | None] = mapped_column(Text, nullable=True)
+
+    __table_args__ = (
+        CheckConstraint(  # same value set as the ExtractionStatus enum
+            "status IN ('queued', 'running', 'succeeded', 'failed', 'cancelled')",
+            name="ck_extractions_status",
+        ),
+        CheckConstraint(  # NULL (leg never requested) or a PostProcessingStatus value
+            "post_processing_bbox_status IS NULL OR "
+            "post_processing_bbox_status IN ('pending', 'running', 'succeeded', 'failed')",
+            name="ck_extractions_post_processing_bbox_status",
+        ),
+        Index(  # partial unique index: uniqueness applies only to non-NULL keys
+            "uq_extractions_idempotency_key",
+            "idempotency_key",
+            unique=True,
+            postgresql_where=(idempotency_key.is_not(None)),
+            sqlite_where=(idempotency_key.is_not(None)),
+        ),
+    )
diff --git a/src/flydocs/models/entities/extraction_job.py b/src/flydocs/models/entities/extraction_job.py
deleted file mode 100644
index 5722965..0000000
--- a/src/flydocs/models/entities/extraction_job.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# Copyright 2026 Firefly Software Solutions Inc
-"""``ExtractionJob`` -- persistent state for asynchronous extraction jobs.
-
-Stores everything the worker needs to resume a job after a restart and
-everything callers can later query through ``GET /api/v1/jobs/{id}``.
-We deliberately do NOT store the document bytes -- the payload is only
-held in memory during processing.
-"""
-
-from __future__ import annotations
-
-import uuid
-from datetime import datetime
-from typing import Any
-
-from sqlalchemy import JSON, DateTime, Index, Integer, String, Text, func
-from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
-
-
-class Base(DeclarativeBase):
- """Declarative base shared by every entity in this service."""
-
-
-class ExtractionJob(Base):
- __tablename__ = "extraction_jobs"
-
- id: Mapped[str] = mapped_column(
- String(36),
- primary_key=True,
- default=lambda: str(uuid.uuid4()),
- doc="Stable string UUID used as the public job id.",
- )
- # ``UNIQUE WHERE NOT NULL`` is enforced by the partial index below.
- idempotency_key: Mapped[str | None] = mapped_column(String(128), nullable=True)
- # ``String(24)`` -- fits ``PARTIAL_SUCCEEDED`` (17 chars) and ``REFINING_BBOXES`` (15);
- # the original ``String(16)`` truncated the former and crashed asyncpg on commit.
- status: Mapped[str] = mapped_column(String(24), nullable=False, index=True)
- filename: Mapped[str] = mapped_column(String(255), nullable=False)
- content_sha256: Mapped[str] = mapped_column(String(64), nullable=False, index=True)
- content_bytes: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
-
- # Inputs.
- schema_json: Mapped[dict[str, Any]] = mapped_column(JSON, nullable=False)
- options_json: Mapped[dict[str, Any]] = mapped_column(JSON, nullable=False, default=dict)
- callback_url: Mapped[str | None] = mapped_column(Text, nullable=True)
- metadata_json: Mapped[dict[str, Any]] = mapped_column(JSON, nullable=False, default=dict)
-
- # Outputs.
- result_json: Mapped[dict[str, Any] | None] = mapped_column(JSON, nullable=True)
- error_code: Mapped[str | None] = mapped_column(String(64), nullable=True)
- error_message: Mapped[str | None] = mapped_column(Text, nullable=True)
-
- # Bookkeeping.
- attempts: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
- created_at: Mapped[datetime] = mapped_column(
- DateTime(timezone=True), nullable=False, server_default=func.now()
- )
- started_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
- finished_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
-
- # Bbox-refine leg state -- populated only when the caller enabled
- # ``options.stages.bbox_refine``. ``null`` for jobs that never asked
- # for grounding. See ``interfaces/enums/job_status.py::BboxRefineStatus``.
- bbox_refine_status: Mapped[str | None] = mapped_column(String(24), nullable=True)
- bbox_refine_attempts: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
- bbox_refine_started_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
- bbox_refine_finished_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
- bbox_refine_error_code: Mapped[str | None] = mapped_column(String(64), nullable=True)
- bbox_refine_error_message: Mapped[str | None] = mapped_column(Text, nullable=True)
-
- __table_args__ = (
- Index(
- "uq_extraction_jobs_idempotency_key",
- "idempotency_key",
- unique=True,
- postgresql_where=(idempotency_key.is_not(None)),
- sqlite_where=(idempotency_key.is_not(None)),
- ),
- )
diff --git a/src/flydocs/models/repositories/__init__.py b/src/flydocs/models/repositories/__init__.py
index ef1f000..5bb4a73 100644
--- a/src/flydocs/models/repositories/__init__.py
+++ b/src/flydocs/models/repositories/__init__.py
@@ -1,6 +1,6 @@
# Copyright 2026 Firefly Software Solutions Inc
"""Async repositories for the persistence layer."""
-from flydocs.models.repositories.extraction_job_repository import ExtractionJobRepository
+from flydocs.models.repositories.extraction_repository import ExtractionRepository
-__all__ = ["ExtractionJobRepository"]
+__all__ = ["ExtractionRepository"]
diff --git a/src/flydocs/models/repositories/extraction_job_repository.py b/src/flydocs/models/repositories/extraction_job_repository.py
deleted file mode 100644
index ec18718..0000000
--- a/src/flydocs/models/repositories/extraction_job_repository.py
+++ /dev/null
@@ -1,562 +0,0 @@
-# Copyright 2026 Firefly Software Solutions Inc
-"""Async repository for :class:`ExtractionJob`.
-
-Wraps an ``AsyncSession`` factory so callers can be ignorant of the
-transaction boundaries. Each method opens its own short-lived session +
-transaction.
-
-Concurrency model
-=================
-
-Every state-changing method is a **single conditional UPDATE** with an
-explicit precondition on ``status`` (and where relevant, a lease
-threshold on ``started_at``). The query returns the new row only when
-the precondition matched; otherwise it returns ``None`` so the caller
-can detect "I lost the race". This pattern makes the transitions safe
-under concurrent delivery from multiple worker replicas without needing
-``SELECT ... FOR UPDATE`` or serialisable isolation -- two writers
-trying the same transition will be serialised by Postgres' row-level
-lock on UPDATE, and the loser's ``WHERE`` clause won't match the
-already-updated row.
-
-The legal predecessor set for each transition is documented inline.
-"""
-
-from __future__ import annotations
-
-from collections.abc import Callable
-from contextlib import asynccontextmanager
-from datetime import UTC, datetime, timedelta
-from typing import Any
-
-from sqlalchemy import and_, func, or_, select, update
-from sqlalchemy.exc import IntegrityError
-from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
-
-from flydocs.models.entities.extraction_job import ExtractionJob
-
-
-class ExtractionJobRepository:
- """Async repository for ``extraction_jobs``."""
-
- def __init__(
- self,
- session_factory: async_sessionmaker[AsyncSession],
- *,
- engine: Any = None,
- ) -> None:
- self._session_factory = session_factory
- self._engine = engine
-
- @property
- def engine(self) -> Any:
- """Underlying ``AsyncEngine``. Used by the actuator health probe."""
- return self._engine
-
- # -- factories -----------------------------------------------------
-
- @classmethod
- def from_url(cls, database_url: str, *, echo: bool = False) -> ExtractionJobRepository:
- engine = create_async_engine(database_url, echo=echo, future=True, pool_pre_ping=True)
- factory = async_sessionmaker(engine, expire_on_commit=False)
- return cls(factory, engine=engine)
-
- @asynccontextmanager
- async def session(self):
- """Open a session for callers that need to compose multiple operations."""
- async with self._session_factory() as session:
- yield session
- await session.commit()
-
- # -- queries -------------------------------------------------------
-
- async def get(self, job_id: str) -> ExtractionJob | None:
- async with self._session_factory() as session:
- return await session.get(ExtractionJob, job_id)
-
- async def get_by_idempotency_key(self, key: str) -> ExtractionJob | None:
- async with self._session_factory() as session:
- result = await session.execute(select(ExtractionJob).where(ExtractionJob.idempotency_key == key))
- return result.scalars().first()
-
- async def list_jobs(
- self,
- *,
- statuses: list[str] | None = None,
- bbox_refine_statuses: list[str] | None = None,
- created_after: datetime | None = None,
- created_before: datetime | None = None,
- idempotency_key: str | None = None,
- limit: int = 50,
- offset: int = 0,
- ) -> tuple[list[ExtractionJob], int]:
- """Return ``(rows, total_count)`` for jobs matching the filters.
-
- Rows are ordered ``created_at DESC`` (newest first). The total
- count reflects the filtered set, not the page; callers paginate
- with ``limit`` / ``offset`` against that total.
- """
- conditions: list[Any] = []
- if statuses:
- conditions.append(ExtractionJob.status.in_(statuses))
- if bbox_refine_statuses:
- conditions.append(ExtractionJob.bbox_refine_status.in_(bbox_refine_statuses))
- if created_after is not None:
- conditions.append(ExtractionJob.created_at >= created_after)
- if created_before is not None:
- conditions.append(ExtractionJob.created_at <= created_before)
- if idempotency_key:
- conditions.append(ExtractionJob.idempotency_key == idempotency_key)
- async with self._session_factory() as session:
- count_stmt = select(func.count()).select_from(ExtractionJob)
- data_stmt = select(ExtractionJob).order_by(ExtractionJob.created_at.desc())
- for c in conditions:
- count_stmt = count_stmt.where(c)
- data_stmt = data_stmt.where(c)
- data_stmt = data_stmt.limit(limit).offset(offset)
- total = int((await session.execute(count_stmt)).scalar_one() or 0)
- rows = list((await session.execute(data_stmt)).scalars().all())
- return rows, total
-
- # -- reaper helpers ------------------------------------------------
- #
- # The :class:`flydocs.core.services.workers.JobReaper` /
- # :class:`BboxReaper` use these to find rows that are stuck in
- # non-terminal states because their triggering event was lost or
- # because the claimant crashed past its lease window. The reaper
- # republishes a fresh EDA event for each id returned; the atomic
- # ``mark_*`` claims dedupe duplicate publishes.
-
- async def find_stale_running(
- self,
- *,
- lease_seconds: int,
- limit: int = 100,
- ) -> list[str]:
- """Job ids where status='RUNNING' AND started_at < now()-lease."""
- cutoff = _utcnow() - timedelta(seconds=max(0, lease_seconds))
- async with self._session_factory() as session:
- result = await session.execute(
- select(ExtractionJob.id)
- .where(
- ExtractionJob.status == "RUNNING",
- ExtractionJob.started_at < cutoff,
- )
- .limit(limit)
- )
- return list(result.scalars().all())
-
- async def find_stale_queued(
- self,
- *,
- older_than_seconds: int,
- limit: int = 100,
- ) -> list[str]:
- """Job ids where status='QUEUED' AND nothing's happened in a while.
-
- Covers both submit-crash orphans (started_at IS NULL → fall back
- to created_at) and retry-publish orphans (started_at IS NOT NULL,
- from the prior failed run). ``COALESCE`` picks the most recent
- timestamp so jobs that were briefly running and got requeued
- aren't reaped any earlier than necessary.
- """
- cutoff = _utcnow() - timedelta(seconds=max(0, older_than_seconds))
- async with self._session_factory() as session:
- result = await session.execute(
- select(ExtractionJob.id)
- .where(
- ExtractionJob.status == "QUEUED",
- func.coalesce(
- ExtractionJob.started_at,
- ExtractionJob.created_at,
- )
- < cutoff,
- )
- .limit(limit)
- )
- return list(result.scalars().all())
-
- async def find_stale_refining_bboxes(
- self,
- *,
- lease_seconds: int,
- limit: int = 100,
- ) -> list[str]:
- """Bbox-leg analogue of ``find_stale_running``."""
- cutoff = _utcnow() - timedelta(seconds=max(0, lease_seconds))
- async with self._session_factory() as session:
- result = await session.execute(
- select(ExtractionJob.id)
- .where(
- ExtractionJob.status == "REFINING_BBOXES",
- ExtractionJob.bbox_refine_started_at < cutoff,
- )
- .limit(limit)
- )
- return list(result.scalars().all())
-
- async def find_pending_bbox_revive(
- self,
- *,
- partial_threshold_seconds: int,
- bbox_lease_seconds: int,
- limit: int = 100,
- ) -> list[str]:
- """PARTIAL_SUCCEEDED rows whose bbox event needs republishing.
-
- Two sub-cases unified into one query:
-
- * ``bbox_refine_started_at IS NULL`` -- the initial publish never
- landed (worker crashed between ``mark_partial_succeeded`` and
- ``publish``). The clock starts at ``started_at`` (the main
- extraction's claim time).
- * ``bbox_refine_started_at IS NOT NULL`` -- the row was
- previously REFINING_BBOXES, the bbox worker requeued itself
- via ``requeue_bbox_refine``, and the delayed-publish task was
- lost. The clock starts at ``bbox_refine_started_at`` (the
- previous refine attempt's claim time).
- """
- now = _utcnow()
- main_cutoff = now - timedelta(seconds=max(0, partial_threshold_seconds))
- refine_cutoff = now - timedelta(seconds=max(0, bbox_lease_seconds))
- async with self._session_factory() as session:
- result = await session.execute(
- select(ExtractionJob.id)
- .where(
- ExtractionJob.status == "PARTIAL_SUCCEEDED",
- ExtractionJob.bbox_refine_status == "pending",
- or_(
- and_(
- ExtractionJob.bbox_refine_started_at.is_(None),
- ExtractionJob.started_at < main_cutoff,
- ),
- and_(
- ExtractionJob.bbox_refine_started_at.is_not(None),
- ExtractionJob.bbox_refine_started_at < refine_cutoff,
- ),
- ),
- )
- .limit(limit)
- )
- return list(result.scalars().all())
-
- # -- mutations -----------------------------------------------------
-
- async def add(self, job: ExtractionJob) -> ExtractionJob:
- async with self._session_factory() as session:
- session.add(job)
- await session.commit()
- await session.refresh(job)
- return job
-
- # ``IntegrityError`` re-export so callers can ``except`` it without
- # importing SQLAlchemy themselves -- keeps the repository the single
- # ORM-facing boundary.
- IntegrityError = IntegrityError
-
- async def update(self, job_id: str, **changes: Any) -> ExtractionJob | None:
- """Unconditional field update.
-
- WARNING: this is a read-modify-write and is NOT safe for
- status transitions or any other field where concurrent writers
- can race. Use it only for fields no other writer touches (or
- idempotent overwrites). Status transitions go through the
- ``mark_*`` methods below.
- """
- async with self._session_factory() as session:
- job = await session.get(ExtractionJob, job_id)
- if job is None:
- return None
- for key, value in changes.items():
- setattr(job, key, value)
- await session.commit()
- await session.refresh(job)
- return job
-
- # ------------------------------------------------------------------
- # Atomic state transitions.
- #
- # Each method runs a single ``UPDATE ... WHERE id=? AND status IN
- # (legal_predecessors)`` against Postgres. The row-level lock on
- # UPDATE serialises concurrent writers; the WHERE precondition then
- # decides who wins. A return value of ``None`` means "the row was
- # already past this transition" -- the caller should treat the work
- # as having been done by someone else.
- # ------------------------------------------------------------------
-
- async def _atomic_update(
- self,
- *,
- job_id: str,
- where: Any,
- values: dict[str, Any],
- ) -> ExtractionJob | None:
- """Execute ``UPDATE ... WHERE id AND RETURNING *``.
-
- ``RETURNING`` works on Postgres (always) and SQLite >= 3.35 (the
- Python 3.13 stdlib ships >=3.45). Combining row-level UPDATE
- locking with the WHERE-precondition predicate gives us a
- compare-and-swap without ``FOR UPDATE``.
- """
- async with self._session_factory() as session:
- stmt = (
- update(ExtractionJob)
- .where(ExtractionJob.id == job_id, where)
- .values(**values)
- .returning(ExtractionJob)
- .execution_options(synchronize_session=False)
- )
- result = await session.execute(stmt)
- row = result.scalar_one_or_none()
- await session.commit()
- return row
-
- async def mark_running(
- self,
- job_id: str,
- *,
- lease_seconds: int,
- ) -> ExtractionJob | None:
- """Atomically claim a QUEUED job and transition it to RUNNING.
-
- Legal predecessors:
-
- * ``QUEUED`` -- first delivery (or the worker requeued itself
- for a retry).
- * ``RUNNING`` with a stale ``started_at`` -- the worker that
- owned the previous claim crashed (``lease_seconds`` window
- elapsed), so another worker can pick the orphan up.
-
- Returns the post-claim row when the claim won, ``None`` when
- the job is no longer claimable (e.g. cancelled, succeeded, or
- another worker holds a fresh lease).
- """
- now = _utcnow()
- stale_cutoff = now - timedelta(seconds=max(0, lease_seconds))
- return await self._atomic_update(
- job_id=job_id,
- where=or_(
- ExtractionJob.status == "QUEUED",
- and_(
- ExtractionJob.status == "RUNNING",
- # ``started_at < cutoff`` evaluates NULL → row excluded,
- # so jobs that somehow have RUNNING without started_at
- # are not reclaimed (defensive).
- ExtractionJob.started_at < stale_cutoff,
- ),
- ),
- values={
- "status": "RUNNING",
- "started_at": now,
- "attempts": ExtractionJob.attempts + 1,
- },
- )
-
- async def mark_succeeded(
- self,
- job_id: str,
- *,
- result: dict[str, Any],
- ) -> ExtractionJob | None:
- """RUNNING (or REFINING_BBOXES) -> SUCCEEDED with the final result.
-
- ``REFINING_BBOXES`` is allowed because the bbox-refine leg
- terminates by writing SUCCEEDED through this method.
- """
- return await self._atomic_update(
- job_id=job_id,
- where=ExtractionJob.status.in_(("RUNNING", "REFINING_BBOXES")),
- values={
- "status": "SUCCEEDED",
- "finished_at": _utcnow(),
- "result_json": result,
- "error_code": None,
- "error_message": None,
- },
- )
-
- async def mark_failed(
- self,
- job_id: str,
- *,
- code: str,
- message: str,
- ) -> ExtractionJob | None:
- """RUNNING -> FAILED (terminal). No-op if the row already moved on."""
- return await self._atomic_update(
- job_id=job_id,
- where=ExtractionJob.status == "RUNNING",
- values={
- "status": "FAILED",
- "finished_at": _utcnow(),
- "error_code": code,
- "error_message": message,
- },
- )
-
- async def mark_cancelled(self, job_id: str) -> ExtractionJob | None:
- """QUEUED -> CANCELLED. Returns None if the worker has already started.
-
- We deliberately do NOT permit cancelling a RUNNING job — there's
- no mid-flight cancellation hook in the orchestrator today. The
- atomic precondition guarantees a cancel sent the same instant
- the worker claims the row will either: (a) win and the worker's
- ``mark_running`` returns ``None``, or (b) lose and the caller
- gets a ``JobNotCancellable`` response. There is no third state.
- """
- return await self._atomic_update(
- job_id=job_id,
- where=ExtractionJob.status == "QUEUED",
- values={
- "status": "CANCELLED",
- "finished_at": _utcnow(),
- },
- )
-
- async def requeue_for_retry(self, job_id: str) -> ExtractionJob | None:
- """RUNNING -> QUEUED. Used by the worker's retry path.
-
- Atomic: a cancel arriving in the same instant can race with this.
- The retry's WHERE matches RUNNING; the cancel's WHERE matches
- QUEUED. They cannot both succeed -- one will return None and
- bail. (Cancel cannot match RUNNING anyway, so concretely:
- requeue wins, cancel was already rejected upstream.)
- """
- return await self._atomic_update(
- job_id=job_id,
- where=ExtractionJob.status == "RUNNING",
- values={"status": "QUEUED"},
- )
-
- # -- bbox-refine leg ----------------------------------------------
-
- async def mark_partial_succeeded(
- self,
- job_id: str,
- *,
- result: dict[str, Any],
- ) -> ExtractionJob | None:
- """RUNNING -> PARTIAL_SUCCEEDED. Main extraction done; bbox pending.
-
- Persists the LLM-bbox result, transitions the job, and stamps
- the bbox leg as ``pending``. Callers reading
- ``GET /api/v1/jobs/{id}/result`` get the ungrounded result
- immediately; grounded coordinates land once the refine worker
- finishes.
- """
- return await self._atomic_update(
- job_id=job_id,
- where=ExtractionJob.status == "RUNNING",
- values={
- "status": "PARTIAL_SUCCEEDED",
- "result_json": result,
- "error_code": None,
- "error_message": None,
- "bbox_refine_status": "pending",
- },
- )
-
- async def mark_bbox_refining(
- self,
- job_id: str,
- *,
- lease_seconds: int,
- ) -> ExtractionJob | None:
- """Atomically claim a PARTIAL_SUCCEEDED job for bbox refinement.
-
- Legal predecessors:
-
- * ``PARTIAL_SUCCEEDED`` -- the main extraction just finished
- and published the refine event.
- * ``REFINING_BBOXES`` with stale ``bbox_refine_started_at`` --
- the previous bbox-worker crashed; reclaim is allowed.
-
- Returns ``None`` when another worker holds a fresh lease, or
- when the job advanced to SUCCEEDED / FAILED in the meantime.
- """
- now = _utcnow()
- stale_cutoff = now - timedelta(seconds=max(0, lease_seconds))
- return await self._atomic_update(
- job_id=job_id,
- where=or_(
- ExtractionJob.status == "PARTIAL_SUCCEEDED",
- and_(
- ExtractionJob.status == "REFINING_BBOXES",
- ExtractionJob.bbox_refine_started_at < stale_cutoff,
- ),
- ),
- values={
- "status": "REFINING_BBOXES",
- "bbox_refine_status": "running",
- "bbox_refine_started_at": now,
- "bbox_refine_attempts": ExtractionJob.bbox_refine_attempts + 1,
- },
- )
-
- async def requeue_bbox_refine(self, job_id: str) -> ExtractionJob | None:
- """REFINING_BBOXES -> PARTIAL_SUCCEEDED (with status=pending).
-
- Used by the bbox worker's retry path: revert the leg so the next
- delivery's claim precondition matches again.
- """
- return await self._atomic_update(
- job_id=job_id,
- where=ExtractionJob.status == "REFINING_BBOXES",
- values={
- "status": "PARTIAL_SUCCEEDED",
- "bbox_refine_status": "pending",
- },
- )
-
- async def mark_bbox_refined(
- self,
- job_id: str,
- *,
- result: dict[str, Any],
- ) -> ExtractionJob | None:
- """REFINING_BBOXES -> SUCCEEDED with grounded coordinates."""
- return await self._atomic_update(
- job_id=job_id,
- where=ExtractionJob.status == "REFINING_BBOXES",
- values={
- "status": "SUCCEEDED",
- "finished_at": _utcnow(),
- "result_json": result,
- "bbox_refine_status": "succeeded",
- "bbox_refine_finished_at": _utcnow(),
- "bbox_refine_error_code": None,
- "bbox_refine_error_message": None,
- },
- )
-
- async def mark_bbox_refine_failed(
- self,
- job_id: str,
- *,
- code: str,
- message: str,
- ) -> ExtractionJob | None:
- """REFINING_BBOXES -> PARTIAL_SUCCEEDED with a failure record.
-
- The LLM-bbox result stays readable; only the grounded overlay
- is missing. The caller (bbox worker) already published a
- partial webhook -- nothing new to deliver.
- """
- return await self._atomic_update(
- job_id=job_id,
- where=ExtractionJob.status == "REFINING_BBOXES",
- values={
- "status": "PARTIAL_SUCCEEDED",
- "bbox_refine_status": "failed",
- "bbox_refine_finished_at": _utcnow(),
- "bbox_refine_error_code": code,
- "bbox_refine_error_message": message,
- },
- )
-
-
-def _utcnow() -> datetime:
- return datetime.now(UTC)
-
-
-__all__: list[Callable[..., Any] | str] = ["ExtractionJobRepository"]
diff --git a/src/flydocs/models/repositories/extraction_repository.py b/src/flydocs/models/repositories/extraction_repository.py
new file mode 100644
index 0000000..09dd472
--- /dev/null
+++ b/src/flydocs/models/repositories/extraction_repository.py
@@ -0,0 +1,443 @@
+# Copyright 2026 Firefly Software Solutions Inc
+"""Async repository for :class:`Extraction`.
+
+Wraps an ``AsyncSession`` factory so callers can be ignorant of
+transaction boundaries. Each method opens its own short-lived session +
+transaction.
+
+Concurrency model
+=================
+
+Every state-changing method is a **single conditional UPDATE** with an
+explicit precondition on ``status`` (and where relevant, a lease
+threshold on a timestamp). The query returns the new row only when the
+precondition matched; otherwise it returns ``None`` so the caller can
+detect "I lost the race". This pattern makes the transitions safe under
+concurrent delivery from multiple worker replicas without needing
+``SELECT ... FOR UPDATE`` or serialisable isolation -- two writers
+trying the same transition will be serialised by Postgres' row-level
+lock on UPDATE, and the loser's ``WHERE`` clause won't match the
+already-updated row.
+
+The v1 lifecycle is linear (``queued -> running -> succeeded | failed``);
+``cancelled`` is reachable only from ``queued``, and a retry moves ``running``
+back to ``queued``. The bbox refinement leg is additive: a job is ``succeeded``
+once the main pipeline completes; ``post_processing_bbox_*`` tracks grounding.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+from contextlib import asynccontextmanager
+from datetime import UTC, datetime, timedelta
+from typing import Any
+
+from sqlalchemy import and_, func, or_, select, update
+from sqlalchemy.exc import IntegrityError
+from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
+
+from flydocs.models.entities.extraction import Extraction
+
+
+class ExtractionRepository:
+ """Async repository for ``extractions``."""
+
+ def __init__(
+ self,
+ session_factory: async_sessionmaker[AsyncSession],
+ *,
+ engine: Any = None,
+ ) -> None:
+ self._session_factory = session_factory
+ self._engine = engine
+
+ @property
+ def engine(self) -> Any:
+ """Underlying ``AsyncEngine`` -- used by the actuator health probe."""
+ return self._engine
+
+ @classmethod
+ def from_url(cls, database_url: str, *, echo: bool = False) -> ExtractionRepository:
+ engine = create_async_engine(database_url, echo=echo, future=True, pool_pre_ping=True)
+ factory = async_sessionmaker(engine, expire_on_commit=False)
+ return cls(factory, engine=engine)
+
+ @asynccontextmanager
+ async def session(self):
+ async with self._session_factory() as session:
+ yield session
+ await session.commit()
+
+ # ---- queries -------------------------------------------------------
+
+ async def get(self, ext_id: str) -> Extraction | None:
+ async with self._session_factory() as session:
+ return await session.get(Extraction, ext_id)
+
+ async def get_by_idempotency_key(self, key: str) -> Extraction | None:
+ async with self._session_factory() as session:
+ result = await session.execute(select(Extraction).where(Extraction.idempotency_key == key))
+ return result.scalars().first()
+
+ async def list_extractions(
+ self,
+ *,
+ statuses: list[str] | None = None,
+ post_processing_bbox_statuses: list[str] | None = None,
+ created_after: datetime | None = None,
+ created_before: datetime | None = None,
+ idempotency_key: str | None = None,
+ limit: int = 50,
+ offset: int = 0,
+ ) -> tuple[list[Extraction], int]:
+ """Return ``(rows, total_count)`` for extractions matching the filters."""
+ conditions: list[Any] = []
+ if statuses:
+ conditions.append(Extraction.status.in_(statuses))
+ if post_processing_bbox_statuses:
+ conditions.append(Extraction.post_processing_bbox_status.in_(post_processing_bbox_statuses))
+ if created_after is not None:
+ conditions.append(Extraction.submitted_at >= created_after)
+ if created_before is not None:
+ conditions.append(Extraction.submitted_at <= created_before)
+ if idempotency_key:
+ conditions.append(Extraction.idempotency_key == idempotency_key)
+ async with self._session_factory() as session:
+ count_stmt = select(func.count()).select_from(Extraction)
+ data_stmt = select(Extraction).order_by(Extraction.submitted_at.desc())
+ for c in conditions:
+ count_stmt = count_stmt.where(c)
+ data_stmt = data_stmt.where(c)
+ data_stmt = data_stmt.limit(limit).offset(offset)
+ total = int((await session.execute(count_stmt)).scalar_one() or 0)
+ rows = list((await session.execute(data_stmt)).scalars().all())
+ return rows, total
+
+ # ---- reaper helpers ------------------------------------------------
+
+ async def find_stale_running(
+ self,
+ *,
+ lease_seconds: int,
+ limit: int = 100,
+ ) -> list[str]:
+ cutoff = _utcnow() - timedelta(seconds=max(0, lease_seconds))
+ async with self._session_factory() as session:
+ result = await session.execute(
+ select(Extraction.id)
+ .where(
+ Extraction.status == "running",
+ Extraction.started_at < cutoff,
+ )
+ .limit(limit)
+ )
+ return list(result.scalars().all())
+
+ async def find_stale_queued(
+ self,
+ *,
+ older_than_seconds: int,
+ limit: int = 100,
+ ) -> list[str]:
+ cutoff = _utcnow() - timedelta(seconds=max(0, older_than_seconds))
+ async with self._session_factory() as session:
+ result = await session.execute(
+ select(Extraction.id)
+ .where(
+ Extraction.status == "queued",
+ func.coalesce(Extraction.started_at, Extraction.submitted_at) < cutoff,
+ )
+ .limit(limit)
+ )
+ return list(result.scalars().all())
+
+ async def find_stale_bbox_refining(
+ self,
+ *,
+ lease_seconds: int,
+ limit: int = 100,
+ ) -> list[str]:
+ cutoff = _utcnow() - timedelta(seconds=max(0, lease_seconds))
+ async with self._session_factory() as session:
+ result = await session.execute(
+ select(Extraction.id)
+ .where(
+ Extraction.post_processing_bbox_status == "running",
+ Extraction.post_processing_bbox_started_at < cutoff,
+ )
+ .limit(limit)
+ )
+ return list(result.scalars().all())
+
+ async def find_pending_bbox_revive(
+ self,
+ *,
+ pending_threshold_seconds: int,
+ bbox_lease_seconds: int,
+ limit: int = 100,
+ ) -> list[str]:
+ """Succeeded extractions whose bbox refinement event needs republishing.
+
+ Two sub-cases:
+ * ``post_processing_bbox_started_at IS NULL`` -- the initial publish
+ never landed; clock starts at the main extraction's ``finished_at``.
+ * ``post_processing_bbox_started_at IS NOT NULL`` -- the previous
+ refining attempt's worker crashed or requeued itself and the
+ delayed-publish task was lost; clock starts at that prior
+ attempt's start.
+ """
+ now = _utcnow()
+ main_cutoff = now - timedelta(seconds=max(0, pending_threshold_seconds))
+ refine_cutoff = now - timedelta(seconds=max(0, bbox_lease_seconds))
+ async with self._session_factory() as session:
+ result = await session.execute(
+ select(Extraction.id)
+ .where(
+ Extraction.status == "succeeded",
+ Extraction.post_processing_bbox_status == "pending",
+ or_(
+ and_(
+ Extraction.post_processing_bbox_started_at.is_(None),
+ Extraction.finished_at < main_cutoff,
+ ),
+ and_(
+ Extraction.post_processing_bbox_started_at.is_not(None),
+ Extraction.post_processing_bbox_started_at < refine_cutoff,
+ ),
+ ),
+ )
+ .limit(limit)
+ )
+ return list(result.scalars().all())
+
+ # ---- mutations -----------------------------------------------------
+
+ async def add(self, ext: Extraction) -> Extraction:
+ async with self._session_factory() as session:
+ session.add(ext)
+ await session.commit()
+ await session.refresh(ext)
+ return ext
+
+ IntegrityError = IntegrityError
+
+ async def update(self, ext_id: str, **changes: Any) -> Extraction | None:
+ """Unconditional field update.
+
+ WARNING: read-modify-write; NOT safe for status transitions. Use the
+ ``mark_*`` / ``requeue_for_retry`` / ``*_bbox_refinement`` atomic
+ methods below for any field where concurrent writers can race.
+ """
+ async with self._session_factory() as session:
+ ext = await session.get(Extraction, ext_id)
+ if ext is None:
+ return None
+ for key, value in changes.items():
+ setattr(ext, key, value)
+ await session.commit()
+ await session.refresh(ext)
+ return ext
+
+ async def _atomic_update(
+ self,
+ *,
+ ext_id: str,
+ where: Any,
+ values: dict[str, Any],
+ ) -> Extraction | None:
+ """Execute ``UPDATE ... WHERE id AND RETURNING *``."""
+ async with self._session_factory() as session:
+ stmt = (
+ update(Extraction)
+ .where(Extraction.id == ext_id, where)
+ .values(**values)
+ .returning(Extraction)
+ .execution_options(synchronize_session=False)
+ )
+ result = await session.execute(stmt)
+ row = result.scalar_one_or_none()
+ await session.commit()
+ return row
+
+ # ---- main lifecycle transitions ------------------------------------
+
+ async def mark_running(
+ self,
+ ext_id: str,
+ *,
+ lease_seconds: int,
+ ) -> Extraction | None:
+ """queued (or stale running) -> running."""
+ now = _utcnow()
+ stale_cutoff = now - timedelta(seconds=max(0, lease_seconds))
+ return await self._atomic_update(
+ ext_id=ext_id,
+ where=or_(
+ Extraction.status == "queued",
+ and_(
+ Extraction.status == "running",
+ Extraction.started_at < stale_cutoff,
+ ),
+ ),
+ values={
+ "status": "running",
+ "started_at": now,
+ "attempts": Extraction.attempts + 1,
+ },
+ )
+
+ async def mark_succeeded(
+ self,
+ ext_id: str,
+ *,
+ result: dict[str, Any],
+ request_bbox_refinement: bool = False,
+ ) -> Extraction | None:
+ """running -> succeeded with the final result.
+
+ When ``request_bbox_refinement`` is True, simultaneously sets the
+ post-processing leg to ``pending`` so the refine event publisher
+ can pick it up. This single atomic update is the only place where
+ the bbox leg starts.
+ """
+ values: dict[str, Any] = {
+ "status": "succeeded",
+ "finished_at": _utcnow(),
+ "result_json": result,
+ "error_code": None,
+ "error_message": None,
+ }
+ if request_bbox_refinement:
+ values["post_processing_bbox_status"] = "pending"
+ return await self._atomic_update(
+ ext_id=ext_id,
+ where=Extraction.status == "running",
+ values=values,
+ )
+
+ async def mark_failed(
+ self,
+ ext_id: str,
+ *,
+ code: str,
+ message: str,
+ ) -> Extraction | None:
+ return await self._atomic_update(
+ ext_id=ext_id,
+ where=Extraction.status == "running",
+ values={
+ "status": "failed",
+ "finished_at": _utcnow(),
+ "error_code": code,
+ "error_message": message,
+ },
+ )
+
+ async def mark_cancelled(self, ext_id: str) -> Extraction | None:
+ return await self._atomic_update(
+ ext_id=ext_id,
+ where=Extraction.status == "queued",
+ values={
+ "status": "cancelled",
+ "finished_at": _utcnow(),
+ },
+ )
+
+ async def requeue_for_retry(self, ext_id: str) -> Extraction | None:
+ return await self._atomic_update(
+ ext_id=ext_id,
+ where=Extraction.status == "running",
+ values={"status": "queued"},
+ )
+
+ # ---- bbox-refinement leg -------------------------------------------
+
+ async def claim_bbox_refinement(
+ self,
+ ext_id: str,
+ *,
+ lease_seconds: int,
+ ) -> Extraction | None:
+ """pending (or stale running) -> running. Bbox-leg sub-status only."""
+ now = _utcnow()
+ stale_cutoff = now - timedelta(seconds=max(0, lease_seconds))
+ return await self._atomic_update(
+ ext_id=ext_id,
+ where=and_(
+ Extraction.status == "succeeded",
+ or_(
+ Extraction.post_processing_bbox_status == "pending",
+ and_(
+ Extraction.post_processing_bbox_status == "running",
+ Extraction.post_processing_bbox_started_at < stale_cutoff,
+ ),
+ ),
+ ),
+ values={
+ "post_processing_bbox_status": "running",
+ "post_processing_bbox_started_at": now,
+ "post_processing_bbox_attempts": Extraction.post_processing_bbox_attempts + 1,
+ },
+ )
+
+ async def requeue_bbox_refinement(self, ext_id: str) -> Extraction | None:
+ """running -> pending on the bbox-leg sub-status."""
+ return await self._atomic_update(
+ ext_id=ext_id,
+ where=and_(
+ Extraction.status == "succeeded",
+ Extraction.post_processing_bbox_status == "running",
+ ),
+ values={"post_processing_bbox_status": "pending"},
+ )
+
+ async def complete_bbox_refinement(
+ self,
+ ext_id: str,
+ *,
+ result: dict[str, Any],
+ ) -> Extraction | None:
+ """running -> succeeded on the bbox-leg sub-status, with refined result."""
+ return await self._atomic_update(
+ ext_id=ext_id,
+ where=and_(
+ Extraction.status == "succeeded",
+ Extraction.post_processing_bbox_status == "running",
+ ),
+ values={
+ "result_json": result,
+ "post_processing_bbox_status": "succeeded",
+ "post_processing_bbox_finished_at": _utcnow(),
+ "post_processing_bbox_error_code": None,
+ "post_processing_bbox_error_message": None,
+ },
+ )
+
+ async def fail_bbox_refinement(
+ self,
+ ext_id: str,
+ *,
+ code: str,
+ message: str,
+ ) -> Extraction | None:
+ """running -> failed on the bbox-leg sub-status (main result is unchanged)."""
+ return await self._atomic_update(
+ ext_id=ext_id,
+ where=and_(
+ Extraction.status == "succeeded",
+ Extraction.post_processing_bbox_status == "running",
+ ),
+ values={
+ "post_processing_bbox_status": "failed",
+ "post_processing_bbox_finished_at": _utcnow(),
+ "post_processing_bbox_error_code": code,
+ "post_processing_bbox_error_message": message,
+ },
+ )
+
+
+def _utcnow() -> datetime:
+ return datetime.now(UTC)
+
+
+__all__: list[Callable[..., Any] | str] = ["ExtractionRepository"]
diff --git a/src/flydocs/resources/prompts/transform.yaml b/src/flydocs/resources/prompts/transform.yaml
index 7472519..f02b0be 100644
--- a/src/flydocs/resources/prompts/transform.yaml
+++ b/src/flydocs/resources/prompts/transform.yaml
@@ -1,10 +1,13 @@
name: flydocs/transform
-version: 1.0.0
+version: "1.0.0"
description: >-
Post-extraction LLM transformation of an array field group. The caller's
``intention`` is the goal in one sentence; the LLM rewrites the rows of
the target group in the same shape.
-
+required_variables:
+ - target_group
+ - intention
+ - rows_json
system_template: |
You are a deterministic post-extraction transformer. Given:
diff --git a/src/flydocs/web/advice/exception_advice.py b/src/flydocs/web/advice/exception_advice.py
index a1ddc6c..4eeb11a 100644
--- a/src/flydocs/web/advice/exception_advice.py
+++ b/src/flydocs/web/advice/exception_advice.py
@@ -13,32 +13,58 @@
from pyfly.web import controller_advice, exception_handler
from flydocs.core.services.binary import BinaryNormalizationError
-from flydocs.core.services.jobs.cancel_job_handler import JobNotCancellable
-from flydocs.core.services.jobs.get_job_result_handler import JobNotReady
+from flydocs.core.services.extract.extract_handler import ExtractionTimedOutError
+from flydocs.core.services.extractions.cancel_extraction_handler import (
+ ExtractionNotCancellable,
+)
+from flydocs.core.services.extractions.get_extraction_result_handler import (
+ ExtractionNotReady,
+)
from flydocs.interfaces.dtos.error import ProblemDetails
@controller_advice
class ExceptionAdvice:
- @exception_handler(JobNotReady)
- async def job_not_ready(self, exc: JobNotReady) -> dict[str, Any]:
+ @exception_handler(ExtractionNotReady)
+ async def extraction_not_ready(self, exc: ExtractionNotReady) -> dict[str, Any]:
problem = ProblemDetails(
- type="https://flydocs.dev/problems/job-not-ready",
- title="Job not ready",
+ type="https://flydocs.dev/problems/not-ready",
+ title="Extraction not ready",
status=409,
- code="job_not_ready",
+ code="not_ready",
detail=str(exc),
- extensions={"job_id": exc.job_id, "status": exc.status.value},
+ extensions={
+ "extraction_id": exc.extraction_id,
+ "status": exc.status.value,
+ },
)
return problem.model_dump(exclude_none=True)
- @exception_handler(JobNotCancellable)
- async def job_not_cancellable(self, exc: JobNotCancellable) -> dict[str, Any]:
+    @exception_handler(ExtractionTimedOutError)
+    async def extraction_timed_out(self, exc: ExtractionTimedOutError) -> dict[str, Any]:
+        """Render ``ExtractionTimedOutError`` as an HTTP 408 ``timeout`` problem-detail.
+
+        ``str(exc)`` becomes ``detail`` and ``exc.timeout_s`` is echoed under
+        ``extensions``; fields that are ``None`` are left out of the returned dict.
+        The error signals that the synchronous ceiling (``FLYDOCS_SYNC_TIMEOUT_S``)
+        was exceeded, so long-running work belongs on ``POST /api/v1/extractions``.
+        """
+        return ProblemDetails(
+            type="https://flydocs.dev/problems/timeout",
+            title="Extraction timed out",
+            status=408,
+            code="timeout",
+            detail=str(exc),
+            extensions={"timeout_s": exc.timeout_s},
+        ).model_dump(exclude_none=True)
+
+ @exception_handler(ExtractionNotCancellable)
+ async def extraction_not_cancellable(self, exc: ExtractionNotCancellable) -> dict[str, Any]:
problem = ProblemDetails(
- type="https://flydocs.dev/problems/job-not-cancellable",
- title="Job cannot be cancelled",
+ type="https://flydocs.dev/problems/not-cancellable",
+ title="Extraction cannot be cancelled",
status=409,
- code="job_not_cancellable",
+ code="not_cancellable",
detail=str(exc),
)
return problem.model_dump(exclude_none=True)
diff --git a/src/flydocs/web/controllers/extract_controller.py b/src/flydocs/web/controllers/extract_controller.py
index 3f6a3df..302ddc0 100644
--- a/src/flydocs/web/controllers/extract_controller.py
+++ b/src/flydocs/web/controllers/extract_controller.py
@@ -17,6 +17,7 @@
from flydocs.config import IDPSettings
from flydocs.core.services.extract import ExtractCommand
+from flydocs.core.services.extract.extract_handler import ExtractionTimedOutError
from flydocs.core.services.validation import RequestValidator, ValidationReport
from flydocs.interfaces.dtos.extract import ExtractionRequest, ExtractionResult
@@ -27,7 +28,7 @@ class ValidationResponse(BaseModel):
Returned by ``POST /api/v1/extract:validate`` -- always status 200,
even when errors are present. The caller inspects ``ok`` to decide
whether to submit the payload to the real ``POST /api/v1/extract``
- or ``POST /api/v1/jobs`` endpoints.
+ or ``POST /api/v1/extractions`` endpoints.
"""
ok: bool = Field(description="True when the report has zero errors.")
@@ -49,8 +50,8 @@ class ExtractController:
Beyond the per-stage timeouts enforced inside the orchestrator,
the handler wraps the whole call in
``asyncio.wait_for(FLYDOCS_SYNC_TIMEOUT_S)``; if that elapses
- the caller gets a 408 ``extraction_timeout`` problem-detail and is
- expected to retry through ``POST /api/v1/jobs``.
+ the caller gets a 408 ``timeout`` problem-detail and is
+ expected to retry through ``POST /api/v1/extractions``.
Two gates run *before* the request enters the pipeline so a
malformed call never reaches the LLM provider:
@@ -105,43 +106,46 @@ async def extract(self, request: Valid[Body[ExtractionRequest]]) -> ExtractionRe
Use this endpoint when you can wait for the answer
(sub-minute, single document). For long-running or fire-and-forget
- workloads, prefer ``POST /api/v1/jobs``.
+ workloads, prefer ``POST /api/v1/extractions``.
Errors map to RFC 7807 problem-details:
- ``408 extraction_timeout`` (pipeline exceeded the sync ceiling),
- ``413 document_too_large`` (document over ``FLYDOCS_MAX_BYTES``),
+ ``408 timeout`` (pipeline exceeded the sync ceiling),
+ ``413 file_too_large`` (file over ``FLYDOCS_MAX_BYTES``),
``422 invalid_base64`` (``content_base64`` failed strict parsing),
- ``422 invalid_request`` (semantic mismatch detected by the
+ ``422 validation_failed`` (semantic mismatch detected by the
:class:`RequestValidator`, e.g. a rule referencing an unknown
- documentType -- the response includes a list of every issue
+ document type -- the response includes a list of every issue
found so the caller can fix them all at once).
"""
_enforce_size_limits(request, max_bytes=self._settings.max_bytes)
_enforce_semantic_validation(request, self._validator)
try:
return await self._commands.send(ExtractCommand(request=request))
+ except ExtractionTimedOutError as exc:
+ raise _http_problem(408, "timeout", "Extraction timed out", str(exc)) from exc
except TimeoutError as exc:
- raise _http_problem(408, "extraction_timeout", "Extraction timed out", str(exc)) from exc
+ raise _http_problem(408, "timeout", "Extraction timed out", str(exc)) from exc
def _enforce_size_limits(request: ExtractionRequest, *, max_bytes: int) -> None:
"""Per-file size + base64 sanity."""
- for file in request.documents:
- encoded = file.content_base64
+ for file in request.files:
+ encoded = file.content_base64 or ""
decoded_size = (len(encoded) * 3) // 4
if decoded_size > max_bytes:
raise _http_problem(
413,
- "document_too_large",
- "Document too large",
+ "file_too_large",
+ "File too large",
f"{file.filename} is {decoded_size} bytes (max {max_bytes})",
)
- try:
- base64.b64decode(encoded, validate=True)
- except Exception as exc: # noqa: BLE001
- raise _http_problem(
- 422, "invalid_base64", "Invalid base64 content", f"{file.filename}: {exc}"
- ) from exc
+ if encoded:
+ try:
+ base64.b64decode(encoded, validate=True)
+ except Exception as exc: # noqa: BLE001
+ raise _http_problem(
+ 422, "invalid_base64", "Invalid base64 content", f"{file.filename}: {exc}"
+ ) from exc
def _enforce_semantic_validation(request: ExtractionRequest, validator: RequestValidator) -> None:
@@ -150,7 +154,7 @@ def _enforce_semantic_validation(request: ExtractionRequest, validator: RequestV
if report.has_errors:
raise _http_problem_with_payload(
status_code=422,
- code="invalid_request",
+ code="validation_failed",
title="Request failed semantic validation",
detail=(
f"{len(report.errors)} error(s) and {len(report.warnings)} "
diff --git a/src/flydocs/web/controllers/extractions_controller.py b/src/flydocs/web/controllers/extractions_controller.py
new file mode 100644
index 0000000..104ac3e
--- /dev/null
+++ b/src/flydocs/web/controllers/extractions_controller.py
@@ -0,0 +1,269 @@
+# Copyright 2026 Firefly Software Solutions Inc
+"""Asynchronous extraction endpoints -- ``POST /api/v1/extractions`` + lifecycle."""
+
+from __future__ import annotations
+
+import logging
+from datetime import datetime
+
+from pyfly.container import rest_controller
+
+# Depend on the concrete bus classes -- pyfly's container resolves by
+# exact type and the CQRS auto-config registers DefaultCommandBus /
+# DefaultQueryBus (the Protocols are not registered as bean types).
+from pyfly.cqrs import DefaultCommandBus, DefaultQueryBus
+from pyfly.kernel import ResourceNotFoundException
+from pyfly.web import (
+ Body,
+ Header,
+ PathVar,
+ QueryParam,
+ Valid,
+ delete_mapping,
+ get_mapping,
+ post_mapping,
+ request_mapping,
+)
+
+from flydocs.core.services.extractions import (
+ CancelExtractionCommand,
+ GetExtractionQuery,
+ GetExtractionResultQuery,
+ ListExtractionsQuery,
+ SubmitExtractionCommand,
+)
+from flydocs.core.services.extractions.cancel_extraction_handler import (
+ ExtractionNotCancellable,
+)
+from flydocs.core.services.extractions.get_extraction_result_handler import (
+ ExtractionNotReady,
+)
+from flydocs.core.services.extractions.submit_extraction_handler import (
+ InvalidRequestError,
+)
+from flydocs.interfaces.dtos.extraction import (
+ Extraction,
+ ExtractionListResponse,
+ ExtractionResultEnvelope,
+ SubmitExtractionRequest,
+)
+from flydocs.interfaces.enums.extraction_status import (
+ ExtractionStatus,
+ PostProcessingStatus,
+)
+
+logger = logging.getLogger(__name__)
+
+
+@rest_controller
+@request_mapping("/api/v1/extractions")
+class ExtractionsController:
+ """REST adapter for the asynchronous, queue-backed extraction API.
+
+ The five endpoints cover the full lifecycle: submit (returns an
+ extraction id and 202), list, poll status, fetch the final result
+ envelope, cancel.
+
+ Submit honours an ``Idempotency-Key`` header so a retried submission
+ returns the original response instead of a duplicate row.
+ """
+
+    def __init__(self, commands: DefaultCommandBus, queries: DefaultQueryBus) -> None:
+        # Keep both CQRS buses: submit/cancel go through ``commands``, reads through ``queries``.
+        self._commands, self._queries = commands, queries
+
+    @post_mapping("", status_code=202)
+    async def submit(
+        self,
+        request: Valid[Body[SubmitExtractionRequest]],
+        idempotency_key: Header[str] = "",
+    ) -> Extraction:
+        """Queue an extraction; the mapping answers ``202`` on success.
+
+        The validated body (same shape as ``POST /api/v1/extract`` plus the
+        optional ``callback_url`` and ``metadata`` fields) and the
+        ``Idempotency-Key`` header are wrapped in a ``SubmitExtractionCommand``
+        and sent over the command bus; an empty header is passed on as ``None``.
+        Persisting the row, publishing it for the worker and replaying an
+        earlier submission for a repeated key are not done in this method --
+        presumably they happen behind the command bus.
+
+        ``InvalidRequestError`` from the handler (semantic validation failed,
+        e.g. a rule referencing an unknown document type) is translated to
+        ``422 validation_failed`` carrying the error/warning counts plus the
+        validator report payload.
+        """
+        try:
+            command = SubmitExtractionCommand(request=request, idempotency_key=idempotency_key or None)
+            return await self._commands.send(command)
+        except InvalidRequestError as exc:
+            report = exc.report
+            summary = (
+                f"{len(report.errors)} error(s) and "
+                f"{len(report.warnings)} warning(s) detected before queueing."
+            )
+            raise _http_problem_with_payload(
+                status_code=422,
+                code="validation_failed",
+                title="Request failed semantic validation",
+                detail=summary,
+                extra=report.to_payload(),
+            ) from exc
+
+    @get_mapping("")
+    async def list_extractions(
+        self,
+        status: QueryParam[str] = "",
+        post_processing_status: QueryParam[str] = "",
+        idempotency_key: QueryParam[str] = "",
+        created_after: QueryParam[str] = "",
+        created_before: QueryParam[str] = "",
+        limit: QueryParam[int] = 50,
+        offset: QueryParam[int] = 0,
+    ) -> ExtractionListResponse:
+        """Paginated listing of extractions; the optional filters combine with ``AND``:
+
+        * ``status`` -- comma-separated statuses (e.g. ``?status=succeeded,failed``); empty = any.
+        * ``post_processing_status`` -- comma-separated post-processing bbox sub-states:
+          ``pending``, ``running``, ``succeeded``, ``failed``.
+        * ``idempotency_key`` -- exact match against the submit-time key.
+        * ``created_after`` / ``created_before`` -- RFC 3339 timestamps, both inclusive.
+
+        Rows are returned ``submitted_at DESC`` (newest first) with ``total`` reflecting the filtered
+        set; ``limit`` is documented as capped at 500 (the cap is not applied in this method). An
+        unknown status value or a malformed timestamp is answered with ``422 validation_failed``.
+        """
+        try:
+            statuses = tuple(ExtractionStatus(s) for s in _split_csv(status))
+            pp_statuses = tuple(PostProcessingStatus(s) for s in _split_csv(post_processing_status))
+            after, before = _parse_iso(created_after), _parse_iso(created_before)
+        except ValueError as exc:
+            # Enum lookup / ``fromisoformat`` raise ValueError on bad input: report it as a client error.
+            raise _http_problem(422, "validation_failed", "Invalid query parameter", str(exc)) from exc
+        return await self._queries.query(
+            ListExtractionsQuery(
+                statuses=statuses,
+                post_processing_statuses=pp_statuses,
+                created_after=after,
+                created_before=before,
+                idempotency_key=idempotency_key or None,
+                limit=int(limit),
+                offset=int(offset),
+            )
+        )
+
+    @get_mapping("/{extraction_id}")
+    async def get_status(self, extraction_id: PathVar[str]) -> Extraction:
+        """Look up one extraction by id and return its current state.
+
+        The ``Extraction`` payload produced by ``GetExtractionQuery`` is returned
+        unchanged (lifecycle status ``queued`` / ``running`` / ``succeeded`` /
+        ``failed`` / ``cancelled``, attempt counter, timestamps and, when
+        applicable, the ``post_processing`` block). When the query yields ``None``
+        a ``ResourceNotFoundException`` with code ``not_found`` is raised (``404``).
+        """
+        found = await self._queries.query(GetExtractionQuery(extraction_id=extraction_id))
+        if found is not None:
+            return found
+        raise ResourceNotFoundException(
+            f"Extraction {extraction_id!r} not found",
+            code="not_found",
+            context={"extraction_id": extraction_id},
+        )
+
+    @get_mapping("/{extraction_id}/result")
+    async def get_result(
+        self,
+        extraction_id: PathVar[str],
+        wait_for_post_processing: QueryParam[bool] = False,
+        timeout: QueryParam[float] = 60.0,
+    ) -> ExtractionResultEnvelope:
+        """Return the result envelope of a succeeded extraction.
+
+        The path id, ``wait_for_post_processing`` (coerced with ``bool``) and
+        ``timeout`` (seconds, coerced with ``float``, default 60) travel in a
+        ``GetExtractionResultQuery``. ``ExtractionNotReady`` from the query bus is
+        reported as ``409 not_ready`` (``queued`` / ``running`` / ``failed`` /
+        ``cancelled`` extractions); a ``None`` answer becomes a ``not_found``
+        ``ResourceNotFoundException`` (``404``).
+
+        With ``wait_for_post_processing=true`` the handler long-polls until the
+        bbox refinement leg reaches ``succeeded`` / ``failed`` or ``timeout``
+        elapses, then returns whatever bbox coordinates are persisted.
+        """
+        try:
+            query = GetExtractionResultQuery(
+                extraction_id=extraction_id,
+                wait_for_post_processing=bool(wait_for_post_processing),
+                timeout_s=float(timeout),
+            )
+            envelope = await self._queries.query(query)
+        except ExtractionNotReady as exc:
+            raise _http_problem(409, "not_ready", "Extraction not ready", str(exc)) from exc
+        if envelope is not None:
+            return envelope
+        raise ResourceNotFoundException(
+            f"Extraction {extraction_id!r} not found",
+            code="not_found",
+            context={"extraction_id": extraction_id},
+        )
+
+    @delete_mapping("/{extraction_id}")
+    async def cancel(self, extraction_id: PathVar[str]) -> Extraction:
+        """Cancel an extraction that is still waiting in the queue.
+
+        Sends a ``CancelExtractionCommand``. Cancellation is only valid while
+        ``status == queued``; once a worker has started there is no mid-flight
+        hook and ``ExtractionNotCancellable`` is reported as ``409 not_cancellable``.
+        A ``None`` answer (unknown id) raises a ``not_found`` error (``404``).
+        """
+        try:
+            outcome = await self._commands.send(CancelExtractionCommand(extraction_id=extraction_id))
+        except ExtractionNotCancellable as exc:
+            raise _http_problem(409, "not_cancellable", "Extraction cannot be cancelled", str(exc)) from exc
+        if outcome is not None:
+            return outcome
+        raise ResourceNotFoundException(
+            f"Extraction {extraction_id!r} not found",
+            code="not_found",
+            context={"extraction_id": extraction_id},
+        )
+
+
+def _split_csv(value: str) -> list[str]:
+    """Return the comma-separated pieces of ``value``, stripped, skipping blank ones."""
+    # ``filter(None, ...)`` discards pieces that are empty once whitespace is stripped,
+    # so an empty (or otherwise falsy) ``value`` produces an empty list.
+    return list(filter(None, (token.strip() for token in (value or "").split(","))))
+
+
+def _parse_iso(value: str) -> datetime | None:
+    """Turn an RFC 3339 timestamp string into a ``datetime``; empty input gives ``None``."""
+    if value:
+        # Every ``Z`` is rewritten to ``+00:00`` because ``fromisoformat`` only accepts ``Z`` from 3.11.
+        return datetime.fromisoformat(value.replace("Z", "+00:00"))
+    return None
+
+
+def _http_problem(status_code: int, code: str, title: str, detail: str) -> Exception:
+    """Build (without raising) an ``HTTPException`` carrying code/title/detail as its body."""
+    # fastapi is imported at call time, inside the function.
+    from fastapi import HTTPException
+
+    payload = {"code": code, "title": title, "detail": detail}
+    return HTTPException(status_code=status_code, detail=payload)
+
+
+def _http_problem_with_payload(
+    *,
+    status_code: int,
+    code: str,
+    title: str,
+    detail: str,
+    extra: dict,
+) -> Exception:
+    """Build an ``HTTPException`` body from code/title/detail plus ``extra`` (its keys win on a clash)."""
+    from fastapi import HTTPException
+    merged = {"code": code, "title": title, "detail": detail}
+    merged.update(extra)
+    return HTTPException(status_code=status_code, detail=merged)
diff --git a/src/flydocs/web/controllers/jobs_controller.py b/src/flydocs/web/controllers/jobs_controller.py
deleted file mode 100644
index ba3243e..0000000
--- a/src/flydocs/web/controllers/jobs_controller.py
+++ /dev/null
@@ -1,250 +0,0 @@
-# Copyright 2026 Firefly Software Solutions Inc
-"""Asynchronous job endpoints -- ``POST /api/v1/jobs`` + lifecycle."""
-
-from __future__ import annotations
-
-import logging
-from datetime import datetime
-
-from pyfly.container import rest_controller
-
-# Depend on the concrete bus classes -- pyfly's container resolves by
-# exact type and the CQRS auto-config registers DefaultCommandBus /
-# DefaultQueryBus (the Protocols are not registered as bean types).
-from pyfly.cqrs import DefaultCommandBus, DefaultQueryBus
-from pyfly.kernel import ResourceNotFoundException
-from pyfly.web import (
- Body,
- Header,
- PathVar,
- QueryParam,
- Valid,
- delete_mapping,
- get_mapping,
- post_mapping,
- request_mapping,
-)
-
-from flydocs.core.services.jobs import (
- CancelJobCommand,
- GetJobQuery,
- GetJobResultQuery,
- ListJobsQuery,
- SubmitJobCommand,
-)
-from flydocs.core.services.jobs.cancel_job_handler import JobNotCancellable
-from flydocs.core.services.jobs.get_job_result_handler import JobNotReady
-from flydocs.core.services.jobs.submit_job_handler import InvalidRequestError
-from flydocs.interfaces.dtos.job import (
- JobListResponse,
- JobResult,
- JobStatusResponse,
- SubmitJobRequest,
- SubmitJobResponse,
-)
-from flydocs.interfaces.enums.job_status import JobStatus
-
-logger = logging.getLogger(__name__)
-
-
-@rest_controller
-@request_mapping("/api/v1/jobs")
-class JobsController:
- """REST adapter for the asynchronous, queue-backed extraction API.
-
- The four endpoints cover the full job lifecycle: submit (returns
- a job id and 202), poll status, fetch the final result, cancel.
- Submit honours an ``Idempotency-Key`` header so a retried submission
- returns the original response instead of a duplicate job.
- """
-
- def __init__(self, commands: DefaultCommandBus, queries: DefaultQueryBus) -> None:
- self._commands = commands
- self._queries = queries
-
- @post_mapping("", status_code=202)
- async def submit(
- self,
- request: Valid[Body[SubmitJobRequest]],
- idempotency_key: Header[str] = "",
- ) -> SubmitJobResponse:
- """Submit a queued extraction job.
-
- The request body is the same as ``POST /api/v1/extract`` plus
- the optional ``callback_url`` and ``metadata`` fields. The
- endpoint persists the job, publishes it to the queue, and
- returns ``202 Accepted`` with the new ``job_id`` and the
- initial ``QUEUED`` status. The worker drives the same pipeline
- as the sync endpoint behind the scenes.
-
- Send the same ``Idempotency-Key`` header to replay an existing
- submission instead of creating a duplicate job. The handler also
- runs the semantic ``RequestValidator`` before persisting the job;
- a mismatch -- e.g. a rule referencing an unknown documentType --
- returns ``422 invalid_request`` with every issue surfaced, and
- nothing is written to Postgres or Redis.
- """
- try:
- return await self._commands.send(
- SubmitJobCommand(request=request, idempotency_key=idempotency_key or None)
- )
- except InvalidRequestError as exc:
- raise _http_problem_with_payload(
- status_code=422,
- code="invalid_request",
- title="Request failed semantic validation",
- detail=(
- f"{len(exc.report.errors)} error(s) and "
- f"{len(exc.report.warnings)} warning(s) detected before queueing."
- ),
- extra=exc.report.to_payload(),
- ) from exc
-
- @get_mapping("")
- async def list_jobs(
- self,
- status: QueryParam[str] = "",
- bbox_refine_status: QueryParam[str] = "",
- idempotency_key: QueryParam[str] = "",
- created_after: QueryParam[str] = "",
- created_before: QueryParam[str] = "",
- limit: QueryParam[int] = 50,
- offset: QueryParam[int] = 0,
- ) -> JobListResponse:
- """Paginated, filterable listing of extraction jobs.
-
- Filters are optional and combine with ``AND``:
-
- * ``status`` -- comma-separated list of statuses (e.g.
- ``?status=SUCCEEDED,PARTIAL_SUCCEEDED``). Empty = any status.
- * ``bbox_refine_status`` -- comma-separated list of refine
- sub-states: ``pending``, ``running``, ``succeeded``, ``failed``.
- * ``idempotency_key`` -- exact match against the submit-time key.
- * ``created_after`` / ``created_before`` -- RFC 3339 timestamps,
- both inclusive.
-
- Rows are returned ``created_at DESC`` (newest first) with
- ``total`` reflecting the filtered set so the caller can paginate.
- ``limit`` is capped at 500.
- """
- return await self._queries.query(
- ListJobsQuery(
- statuses=tuple(JobStatus(s) for s in _split_csv(status)),
- bbox_refine_statuses=tuple(_split_csv(bbox_refine_status)),
- created_after=_parse_iso(created_after),
- created_before=_parse_iso(created_before),
- idempotency_key=idempotency_key or None,
- limit=int(limit),
- offset=int(offset),
- )
- )
-
- @get_mapping("/{job_id}")
- async def get_status(self, job_id: PathVar[str]) -> JobStatusResponse:
- """Read the current status of a job.
-
- Returns the job's lifecycle metadata (``QUEUED`` / ``RUNNING``
- / ``SUCCEEDED`` / ``FAILED`` / ``CANCELLED``), the attempt
- counter, and the timestamps for submission / start / finish.
- Returns ``404`` for an unknown ``job_id``.
- """
- status = await self._queries.query(GetJobQuery(job_id=job_id))
- if status is None:
- raise ResourceNotFoundException(
- f"Job {job_id!r} not found", code="JOB_NOT_FOUND", context={"job_id": job_id}
- )
- return status
-
- @get_mapping("/{job_id}/result")
- async def get_result(
- self,
- job_id: PathVar[str],
- wait_for_bboxes: QueryParam[bool] = False,
- timeout: QueryParam[float] = 60.0,
- ) -> JobResult:
- """Fetch the ``ExtractionResult`` of a finished or partial job.
-
- Returns the result when the job is in ``SUCCEEDED``,
- ``PARTIAL_SUCCEEDED``, or ``REFINING_BBOXES``. ``QUEUED`` /
- ``RUNNING`` / ``FAILED`` / ``CANCELLED`` return
- ``409 job_not_ready``. Unknown ``job_id`` returns ``404``.
-
- ``wait_for_bboxes=true`` long-polls the row until the bbox refiner
- finishes (status -> ``SUCCEEDED``) or ``timeout`` (seconds, default
- 60) elapses; on timeout the partial result is returned with the
- LLM-bbox version intact. Useful for callers that submitted async
- but need grounded coordinates inline.
- """
- try:
- result = await self._queries.query(
- GetJobResultQuery(
- job_id=job_id,
- wait_for_bboxes=bool(wait_for_bboxes),
- timeout_s=float(timeout),
- )
- )
- except JobNotReady as exc:
- raise _http_problem(409, "job_not_ready", "Job not ready", str(exc)) from exc
- if result is None:
- raise ResourceNotFoundException(
- f"Job {job_id!r} not found", code="JOB_NOT_FOUND", context={"job_id": job_id}
- )
- return result
-
- @delete_mapping("/{job_id}")
- async def cancel(self, job_id: PathVar[str]) -> JobStatusResponse:
- """Cancel a job that hasn't started yet.
-
- Only valid while ``status == QUEUED``. After the worker has
- started on a job there is no mid-flight cancellation hook --
- the endpoint returns ``409 job_not_cancellable``. Unknown
- ``job_id`` returns ``404``.
- """
- try:
- cancelled = await self._commands.send(CancelJobCommand(job_id=job_id))
- except JobNotCancellable as exc:
- raise _http_problem(409, "job_not_cancellable", "Job cannot be cancelled", str(exc)) from exc
- if cancelled is None:
- raise ResourceNotFoundException(
- f"Job {job_id!r} not found", code="JOB_NOT_FOUND", context={"job_id": job_id}
- )
- return cancelled
-
-
-def _split_csv(value: str) -> list[str]:
- """Split a comma-separated query value into trimmed non-empty tokens."""
- if not value:
- return []
- return [piece.strip() for piece in value.split(",") if piece.strip()]
-
-
-def _parse_iso(value: str) -> datetime | None:
- """Parse an RFC 3339 timestamp; return ``None`` for empty input."""
- if not value:
- return None
- # Python's ``fromisoformat`` accepts ``Z`` from 3.11+ but be defensive.
- return datetime.fromisoformat(value.replace("Z", "+00:00"))
-
-
-def _http_problem(status_code: int, code: str, title: str, detail: str) -> Exception:
- from fastapi import HTTPException
-
- return HTTPException(
- status_code=status_code,
- detail={"code": code, "title": title, "detail": detail},
- )
-
-
-def _http_problem_with_payload(
- *,
- status_code: int,
- code: str,
- title: str,
- detail: str,
- extra: dict,
-) -> Exception:
- """RFC 7807-ish problem-detail that also surfaces the validator findings."""
- from fastapi import HTTPException
-
- body = {"code": code, "title": title, "detail": detail, **extra}
- return HTTPException(status_code=status_code, detail=body)
diff --git a/tests/integration/test_postgres_concurrency.py b/tests/integration/test_postgres_concurrency.py
index c09eb7b..3749c28 100644
--- a/tests/integration/test_postgres_concurrency.py
+++ b/tests/integration/test_postgres_concurrency.py
@@ -2,7 +2,7 @@
"""Real-Postgres integration tests for the concurrency fixes.
These tests exercise the same contracts as
-``tests/unit/test_extraction_job_repository.py`` and
+``tests/unit/test_extraction_repository.py`` and
``tests/unit/test_worker_concurrency.py``, but against a live Postgres
server -- the production substrate. SQLite serialises writers at the
file level so it can hide cases where Postgres' row-level locking +
@@ -21,8 +21,8 @@
import pytest
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
-from flydocs.models.entities.extraction_job import Base, ExtractionJob
-from flydocs.models.repositories import ExtractionJobRepository
+from flydocs.models.entities.extraction import Base, Extraction
+from flydocs.models.repositories import ExtractionRepository
_PG_URL = os.environ.get("FLYDOCS_TEST_PG_URL")
@@ -32,22 +32,22 @@
@pytest.fixture
-async def pg_repo() -> ExtractionJobRepository:
- """Fresh Postgres engine with a clean ``extraction_jobs`` table per test."""
+async def pg_repo() -> ExtractionRepository:
+ """Fresh Postgres engine with a clean ``extractions`` table per test."""
engine = create_async_engine(_PG_URL, future=True) # type: ignore[arg-type]
async with engine.begin() as conn:
await conn.run_sync(Base.metadata.drop_all)
await conn.run_sync(Base.metadata.create_all)
factory = async_sessionmaker(engine, expire_on_commit=False)
- repo = ExtractionJobRepository(factory, engine=engine)
+ repo = ExtractionRepository(factory, engine=engine)
yield repo
await engine.dispose()
-async def _seed(repo: ExtractionJobRepository, **overrides) -> ExtractionJob:
- job = ExtractionJob(
+async def _seed(repo: ExtractionRepository, **overrides) -> Extraction:
+ ext = Extraction(
idempotency_key=overrides.get("idempotency_key"),
- status=overrides.get("status", "QUEUED"),
+ status=overrides.get("status", "queued"),
filename=overrides.get("filename", "test.pdf"),
content_sha256=overrides.get("content_sha256", "0" * 64),
content_bytes=overrides.get("content_bytes", 1),
@@ -56,14 +56,14 @@ async def _seed(repo: ExtractionJobRepository, **overrides) -> ExtractionJob:
metadata_json=overrides.get("metadata_json", {}),
attempts=overrides.get("attempts", 0),
started_at=overrides.get("started_at"),
- bbox_refine_status=overrides.get("bbox_refine_status"),
- bbox_refine_started_at=overrides.get("bbox_refine_started_at"),
+ post_processing_bbox_status=overrides.get("post_processing_bbox_status"),
+ post_processing_bbox_started_at=overrides.get("post_processing_bbox_started_at"),
)
- return await repo.add(job)
+ return await repo.add(ext)
@pytest.mark.asyncio
-async def test_postgres_atomic_claim_single_winner(pg_repo: ExtractionJobRepository) -> None:
+async def test_postgres_atomic_claim_single_winner(pg_repo: ExtractionRepository) -> None:
"""Under real Postgres row-level locking: exactly one of N concurrent
``mark_running`` calls wins, ``attempts`` increments exactly once."""
seeded = await _seed(pg_repo)
@@ -76,7 +76,7 @@ async def test_postgres_atomic_claim_single_winner(pg_repo: ExtractionJobReposit
@pytest.mark.asyncio
-async def test_postgres_concurrent_cancel_vs_claim(pg_repo: ExtractionJobRepository) -> None:
+async def test_postgres_concurrent_cancel_vs_claim(pg_repo: ExtractionRepository) -> None:
"""Cancel + worker-claim race: exactly one wins, the other gets None."""
seeded = await _seed(pg_repo)
@@ -94,8 +94,8 @@ async def test_postgres_concurrent_cancel_vs_claim(pg_repo: ExtractionJobReposit
@pytest.mark.asyncio
-async def test_postgres_stale_lease_reclaim(pg_repo: ExtractionJobRepository) -> None:
- """A RUNNING job past its lease window is reclaimable; a fresh one isn't."""
+async def test_postgres_stale_lease_reclaim(pg_repo: ExtractionRepository) -> None:
+ """A running extraction past its lease window is reclaimable; a fresh one isn't."""
seeded = await _seed(pg_repo)
first = await pg_repo.mark_running(seeded.id, lease_seconds=300)
assert first is not None
@@ -111,8 +111,8 @@ async def test_postgres_stale_lease_reclaim(pg_repo: ExtractionJobRepository) ->
async with pg_repo._session_factory() as session: # type: ignore[attr-defined]
await session.execute(
- update(ExtractionJob)
- .where(ExtractionJob.id == seeded.id)
+ update(Extraction)
+ .where(Extraction.id == seeded.id)
.values(started_at=datetime.now(UTC) - timedelta(seconds=600))
)
await session.commit()
@@ -124,9 +124,9 @@ async def test_postgres_stale_lease_reclaim(pg_repo: ExtractionJobRepository) ->
@pytest.mark.asyncio
-async def test_postgres_finalisation_idempotent(pg_repo: ExtractionJobRepository) -> None:
+async def test_postgres_finalisation_idempotent(pg_repo: ExtractionRepository) -> None:
"""Two concurrent ``mark_succeeded`` calls: one wins, one returns None."""
- seeded = await _seed(pg_repo, status="RUNNING", started_at=datetime.now(UTC))
+ seeded = await _seed(pg_repo, status="running", started_at=datetime.now(UTC))
results = await asyncio.gather(
pg_repo.mark_succeeded(seeded.id, result={"first": True}),
diff --git a/tests/integration/test_reaper_postgres.py b/tests/integration/test_reaper_postgres.py
index fd2b5f7..82626a9 100644
--- a/tests/integration/test_reaper_postgres.py
+++ b/tests/integration/test_reaper_postgres.py
@@ -4,16 +4,16 @@
Demonstrates that the five orphan classes identified in the second
audit are actually revived by the reaper + worker claim cycle:
-1. ``QUEUED`` orphan (submit-publish crashed) -> JobReaper revives.
-2. ``RUNNING`` orphan (worker crashed past its lease) -> JobReaper.
-3. ``QUEUED`` orphan (retry-publish crashed) -> JobReaper.
-4. ``PARTIAL_SUCCEEDED`` orphan (bbox-publish crashed) -> BboxReaper.
-5. ``REFINING_BBOXES`` orphan (bbox worker crashed) -> BboxReaper.
+1. ``queued`` orphan (submit-publish crashed) -> ExtractionReaper revives.
+2. ``running`` orphan (worker crashed past its lease) -> ExtractionReaper.
+3. ``queued`` orphan (retry-publish crashed) -> ExtractionReaper.
+4. ``succeeded`` orphan with bbox sub-status ``pending`` -> BboxReaper.
+5. ``succeeded`` orphan with stale bbox sub-status ``running`` -> BboxReaper.
In each case we seed the row directly in the stuck state, run a single
reaper sweep, and assert that a fresh event was published with the
-right job id. The actual claim-then-process is covered by the
-``test_extraction_job_repository.py`` atomic-transition tests, so we
+right extraction id. The actual claim-then-process is covered by the
+``test_extraction_repository.py`` atomic-transition tests, so we
stop at "event published" here -- the rest of the chain is
identical to the happy path.
"""
@@ -30,9 +30,9 @@
from flydocs.config import IDPSettings
from flydocs.core.services.workers.bbox_reaper import BboxReaper
-from flydocs.core.services.workers.job_reaper import JobReaper
-from flydocs.models.entities.extraction_job import Base, ExtractionJob
-from flydocs.models.repositories import ExtractionJobRepository
+from flydocs.core.services.workers.job_reaper import ExtractionReaper
+from flydocs.models.entities.extraction import Base, Extraction
+from flydocs.models.repositories import ExtractionRepository
_PG_URL = os.environ.get("FLYDOCS_TEST_PG_URL")
@@ -42,20 +42,20 @@
@pytest.fixture
-async def pg_repo() -> ExtractionJobRepository:
+async def pg_repo() -> ExtractionRepository:
engine = create_async_engine(_PG_URL, future=True) # type: ignore[arg-type]
async with engine.begin() as conn:
await conn.run_sync(Base.metadata.drop_all)
await conn.run_sync(Base.metadata.create_all)
factory = async_sessionmaker(engine, expire_on_commit=False)
- repo = ExtractionJobRepository(factory, engine=engine)
+ repo = ExtractionRepository(factory, engine=engine)
yield repo
await engine.dispose()
-async def _seed(repo: ExtractionJobRepository, **overrides) -> ExtractionJob:
- job = ExtractionJob(
- status=overrides.get("status", "QUEUED"),
+async def _seed(repo: ExtractionRepository, **overrides) -> Extraction:
+ ext = Extraction(
+ status=overrides.get("status", "queued"),
filename=overrides.get("filename", "test.pdf"),
content_sha256=overrides.get("content_sha256", "0" * 64),
content_bytes=overrides.get("content_bytes", 1),
@@ -64,20 +64,21 @@ async def _seed(repo: ExtractionJobRepository, **overrides) -> ExtractionJob:
metadata_json=overrides.get("metadata_json", {}),
attempts=overrides.get("attempts", 0),
started_at=overrides.get("started_at"),
- bbox_refine_status=overrides.get("bbox_refine_status"),
- bbox_refine_started_at=overrides.get("bbox_refine_started_at"),
+ finished_at=overrides.get("finished_at"),
+ post_processing_bbox_status=overrides.get("post_processing_bbox_status"),
+ post_processing_bbox_started_at=overrides.get("post_processing_bbox_started_at"),
)
- job = await repo.add(job)
- if "created_at" in overrides:
+ ext = await repo.add(ext)
+ if "submitted_at" in overrides:
async with repo._session_factory() as session: # type: ignore[attr-defined]
await session.execute(
- update(ExtractionJob)
- .where(ExtractionJob.id == job.id)
- .values(created_at=overrides["created_at"])
+ update(Extraction)
+ .where(Extraction.id == ext.id)
+ .values(submitted_at=overrides["submitted_at"])
)
await session.commit()
- job = await repo.get(job.id) # type: ignore[assignment]
- return job
+ ext = await repo.get(ext.id) # type: ignore[assignment]
+ return ext
def _publisher() -> MagicMock:
@@ -87,36 +88,36 @@ def _publisher() -> MagicMock:
@pytest.mark.asyncio
-async def test_job_reaper_revives_all_three_job_orphan_classes(
- pg_repo: ExtractionJobRepository,
+async def test_extraction_reaper_revives_all_three_orphan_classes(
+ pg_repo: ExtractionRepository,
) -> None:
"""End-to-end: seed stuck rows, sweep, verify republish for each."""
now = datetime.now(UTC)
- # Orphan 1: QUEUED, submit-publish crashed.
+ # Orphan 1: queued, submit-publish crashed.
submit_orphan = await _seed(
pg_repo,
- status="QUEUED",
- created_at=now - timedelta(seconds=1200),
+ status="queued",
+ submitted_at=now - timedelta(seconds=1200),
)
- # Orphan 2: RUNNING, worker crashed past lease.
+ # Orphan 2: running, worker crashed past lease.
crashed_runner = await _seed(
pg_repo,
- status="RUNNING",
+ status="running",
started_at=now - timedelta(seconds=2000),
attempts=1,
)
- # Orphan 3: QUEUED after requeue, delayed-publish task killed.
+ # Orphan 3: queued after requeue, delayed-publish task killed.
retry_orphan = await _seed(
pg_repo,
- status="QUEUED",
+ status="queued",
started_at=now - timedelta(seconds=1200),
attempts=1,
)
- # Negative control: a fresh QUEUED row should NOT be reaped.
- fresh = await _seed(pg_repo, status="QUEUED")
+ # Negative control: a fresh queued row should NOT be reaped.
+ fresh = await _seed(pg_repo, status="queued")
publisher = _publisher()
- reaper = JobReaper(
+ reaper = ExtractionReaper(
repository=pg_repo,
event_publisher=publisher,
settings=IDPSettings(
@@ -127,7 +128,7 @@ async def test_job_reaper_revives_all_three_job_orphan_classes(
await reaper._sweep()
- published_ids = [c.kwargs["payload"]["job_id"] for c in publisher.publish.await_args_list]
+ published_ids = [c.kwargs["payload"]["extraction"]["id"] for c in publisher.publish.await_args_list]
assert submit_orphan.id in published_ids
assert crashed_runner.id in published_ids
assert retry_orphan.id in published_ids
@@ -136,29 +137,29 @@ async def test_job_reaper_revives_all_three_job_orphan_classes(
@pytest.mark.asyncio
async def test_bbox_reaper_revives_both_bbox_orphan_classes(
- pg_repo: ExtractionJobRepository,
+ pg_repo: ExtractionRepository,
) -> None:
now = datetime.now(UTC)
- # Orphan A: PARTIAL_SUCCEEDED, main-worker bbox-publish crashed.
+ # Orphan A: succeeded, main-worker bbox-publish crashed.
publish_orphan = await _seed(
pg_repo,
- status="PARTIAL_SUCCEEDED",
- bbox_refine_status="pending",
- started_at=now - timedelta(seconds=2000),
+ status="succeeded",
+ post_processing_bbox_status="pending",
+ finished_at=now - timedelta(seconds=2000),
)
- # Orphan B: REFINING_BBOXES, bbox-worker crashed past lease.
+ # Orphan B: succeeded, bbox-worker crashed past lease.
crashed_bbox = await _seed(
pg_repo,
- status="REFINING_BBOXES",
- bbox_refine_status="running",
- bbox_refine_started_at=now - timedelta(seconds=2000),
+ status="succeeded",
+ post_processing_bbox_status="running",
+ post_processing_bbox_started_at=now - timedelta(seconds=2000),
)
- # Negative control: a fresh REFINING_BBOXES claim.
+ # Negative control: a fresh bbox-leg claim should NOT be reaped.
fresh = await _seed(
pg_repo,
- status="REFINING_BBOXES",
- bbox_refine_status="running",
- bbox_refine_started_at=now,
+ status="succeeded",
+ post_processing_bbox_status="running",
+ post_processing_bbox_started_at=now,
)
publisher = _publisher()
@@ -173,7 +174,7 @@ async def test_bbox_reaper_revives_both_bbox_orphan_classes(
await reaper._sweep()
- published_ids = [c.kwargs["payload"]["job_id"] for c in publisher.publish.await_args_list]
+ published_ids = [c.kwargs["payload"]["extraction"]["id"] for c in publisher.publish.await_args_list]
assert publish_orphan.id in published_ids
assert crashed_bbox.id in published_ids
assert fresh.id not in published_ids
@@ -181,13 +182,13 @@ async def test_bbox_reaper_revives_both_bbox_orphan_classes(
@pytest.mark.asyncio
async def test_reaper_republish_revives_through_full_claim_cycle(
- pg_repo: ExtractionJobRepository,
+ pg_repo: ExtractionRepository,
) -> None:
- """Crash-recovery proof: stale RUNNING is reclaimable after the lease.
+ """Crash-recovery proof: a stale ``running`` row is reclaimable once its lease expires.
Sequence:
- 1. Worker A claims a QUEUED job (status=RUNNING, fresh lease).
- 2. Worker A "crashes" -- we leave the row in RUNNING.
+ 1. Worker A claims a queued extraction (status=running, fresh lease).
+ 2. Worker A "crashes" -- we leave the row in ``running``.
3. Reaper sees the row is past its lease (we backdate started_at).
4. Reaper publishes a fresh event (verified via publisher mock).
5. A "fresh" worker calls mark_running with the same lease and wins
@@ -195,19 +196,19 @@ async def test_reaper_republish_revives_through_full_claim_cycle(
"""
seeded = await _seed(pg_repo)
first_claim = await pg_repo.mark_running(seeded.id, lease_seconds=1260)
- assert first_claim is not None and first_claim.status == "RUNNING"
+ assert first_claim is not None and first_claim.status == "running"
# Backdate started_at to past the lease window.
async with pg_repo._session_factory() as session: # type: ignore[attr-defined]
await session.execute(
- update(ExtractionJob)
- .where(ExtractionJob.id == seeded.id)
+ update(Extraction)
+ .where(Extraction.id == seeded.id)
.values(started_at=datetime.now(UTC) - timedelta(seconds=2000))
)
await session.commit()
publisher = _publisher()
- reaper = JobReaper(
+ reaper = ExtractionReaper(
repository=pg_repo,
event_publisher=publisher,
settings=IDPSettings(job_run_lease_s=1260, queued_orphan_threshold_s=600),
diff --git a/tests/llm/test_real_claude_extraction.py b/tests/llm/test_real_claude_extraction.py
index a958300..17b5bb7 100644
--- a/tests/llm/test_real_claude_extraction.py
+++ b/tests/llm/test_real_claude_extraction.py
@@ -12,8 +12,8 @@
Exercises:
- multimodal extraction with bounding boxes,
-- pure-Python field validation including :class:`StandardValidator`
- checks (NIF / NIE),
+- pure-Python field validation including built-in validator checks
+ (NIF / NIE),
- visual-authenticity LLM check,
- LLM judge re-evaluation,
- the business :class:`RuleEngine` with cross-field predicates.
@@ -33,18 +33,18 @@
import pytest
-from flydocs.interfaces.dtos.doc import DocSpec, DocType, ValidatorsSpec, VisualValidatorSpec
+from flydocs.interfaces.dtos.document_type import DocumentTypeSpec, VisualCheck
from flydocs.interfaces.dtos.extract import (
- DocumentInput,
ExtractionOptions,
ExtractionRequest,
+ FileInput,
StageToggles,
)
-from flydocs.interfaces.dtos.field import FieldGroup, FieldSpec
+from flydocs.interfaces.dtos.field import Field, FieldGroup
from flydocs.interfaces.dtos.rule import RuleFieldParent, RuleOutputSpec, RuleSpec
-from flydocs.interfaces.dtos.standard_validator import StandardValidatorSpec
+from flydocs.interfaces.dtos.validator import ValidatorSpec
from flydocs.interfaces.enums.field_type import FieldType
-from flydocs.interfaces.enums.standard_validator import StandardValidatorType
+from flydocs.interfaces.enums.validator import ValidatorType
PDF_PATH = Path.home() / "Downloads" / "escritura_poderes_2025.pdf"
MODEL = os.environ.get("FLYDOCS_TEST_MODEL", "anthropic:claude-opus-4-7")
@@ -56,59 +56,59 @@
_DOC_TYPE = "escritura_poderes"
-_FIELD_SPECS = [
- FieldSpec(
- fieldName="numero_protocolo",
- fieldDescription="Número de protocolo notarial.",
- fieldType=FieldType.STRING,
+_FIELDS = [
+ Field(
+ name="numero_protocolo",
+ description="Número de protocolo notarial.",
+ type=FieldType.STRING,
),
- FieldSpec(
- fieldName="fecha",
- fieldDescription="Fecha del otorgamiento en formato ISO YYYY-MM-DD.",
- fieldType=FieldType.STRING,
- standard_validators=[StandardValidatorSpec(type=StandardValidatorType.DATE)],
+ Field(
+ name="fecha",
+ description="Fecha del otorgamiento en formato ISO YYYY-MM-DD.",
+ type=FieldType.STRING,
+ validators=[ValidatorSpec(name=ValidatorType.DATE)],
),
- FieldSpec(
- fieldName="notario",
- fieldDescription="Nombre completo del notario que autoriza.",
- fieldType=FieldType.STRING,
+ Field(
+ name="notario",
+ description="Nombre completo del notario que autoriza.",
+ type=FieldType.STRING,
),
- FieldSpec(
- fieldName="otorgante_nombre",
- fieldDescription="Nombre completo del otorgante (poderdante).",
- fieldType=FieldType.STRING,
+ Field(
+ name="otorgante_nombre",
+ description="Nombre completo del otorgante (poderdante).",
+ type=FieldType.STRING,
),
- FieldSpec(
- fieldName="otorgante_dni_nie",
- fieldDescription="DNI o NIE del otorgante (8 dígitos + letra, o letra + 7 dígitos + letra).",
- fieldType=FieldType.STRING,
- standard_validators=[
- StandardValidatorSpec(type=StandardValidatorType.NIF, severity="warning"),
- StandardValidatorSpec(type=StandardValidatorType.NIE, severity="warning"),
+ Field(
+ name="otorgante_dni_nie",
+ description="DNI o NIE del otorgante (8 dígitos + letra, o letra + 7 dígitos + letra).",
+ type=FieldType.STRING,
+ validators=[
+ ValidatorSpec(name=ValidatorType.NIF, severity="warning"),
+ ValidatorSpec(name=ValidatorType.NIE, severity="warning"),
],
),
- FieldSpec(
- fieldName="apoderado_nombre",
- fieldDescription="Nombre completo del apoderado.",
- fieldType=FieldType.STRING,
+ Field(
+ name="apoderado_nombre",
+ description="Nombre completo del apoderado.",
+ type=FieldType.STRING,
),
- FieldSpec(
- fieldName="apoderado_dni_nie",
- fieldDescription="DNI o NIE del apoderado.",
- fieldType=FieldType.STRING,
- standard_validators=[
- StandardValidatorSpec(type=StandardValidatorType.NIF, severity="warning"),
- StandardValidatorSpec(type=StandardValidatorType.NIE, severity="warning"),
+ Field(
+ name="apoderado_dni_nie",
+ description="DNI o NIE del apoderado.",
+ type=FieldType.STRING,
+ validators=[
+ ValidatorSpec(name=ValidatorType.NIF, severity="warning"),
+ ValidatorSpec(name=ValidatorType.NIE, severity="warning"),
],
),
]
-_VISUAL_VALIDATORS = [
- VisualValidatorSpec(
+_VISUAL_CHECKS = [
+ VisualCheck(
name="firma_notario",
description="The notary's handwritten signature is present.",
),
- VisualValidatorSpec(
+ VisualCheck(
name="sello_notarial",
description="The notary's official stamp / seal is present.",
),
@@ -124,9 +124,9 @@
),
parents=[
RuleFieldParent(
- parentType="field",
- documentType=_DOC_TYPE,
- fieldNames=[
+ kind="field",
+ document_type=_DOC_TYPE,
+ fields=[
"otorgante_nombre",
"apoderado_nombre",
"otorgante_dni_nie",
@@ -145,9 +145,9 @@
),
parents=[
RuleFieldParent(
- parentType="field",
- documentType=_DOC_TYPE,
- fieldNames=["otorgante_nombre", "apoderado_nombre"],
+ kind="field",
+ document_type=_DOC_TYPE,
+ fields=["otorgante_nombre", "apoderado_nombre"],
),
],
output=RuleOutputSpec(type="boolean", valid_outputs=["true", "false"]),
@@ -160,9 +160,9 @@
),
parents=[
RuleFieldParent(
- parentType="field",
- documentType=_DOC_TYPE,
- fieldNames=["fecha"],
+ kind="field",
+ document_type=_DOC_TYPE,
+ fields=["fecha"],
),
],
output=RuleOutputSpec(type="boolean", valid_outputs=["true", "false"]),
@@ -178,35 +178,35 @@
def _render(result, request) -> str:
out: list[str] = []
out.append("=" * 70)
- out.append(f" flydocs -- real Claude run ({result.model})")
+ out.append(f" flydocs -- real Claude run ({result.pipeline.model})")
out.append("=" * 70)
out.append("")
primary = result.files[0]
out.append(f"document : {primary.filename} ({primary.media_type})")
out.append(f"pages : {primary.page_count}")
out.append(f"bytes : {primary.bytes:,}")
- out.append(f"latency_ms : {result.latency_ms:,}")
- out.append(f"request_id : {result.request_id}")
+ out.append(f"latency_ms : {result.pipeline.latency_ms:,}")
+ out.append(f"id : {result.id}")
out.append("")
for doc in result.documents:
out.append("-" * 70)
- out.append(f"document_type : {doc.document_type}")
+ out.append(f"document_type : {doc.type}")
out.append(f"pages : {doc.pages}")
out.append(f"confidence : {doc.confidence:.2f}")
out.append("")
out.append(f"{'FIELD':25s} {'VALUE':40s} {'CONF':>5s} {'PAGE':>4s} {'JUDGE':10s} {'VALID':6s}")
- for group in doc.fields:
- out.append(f" [group] {group.fieldGroupName}")
- for f in group.fieldGroupFields:
- value = f.fieldValueFound if f.fieldValueFound is not None else "—"
+ for group in doc.field_groups:
+ out.append(f" [group] {group.name}")
+ for f in group.fields:
+ value = f.value if f.value is not None else "—"
value_s = str(value)[:38]
- pages_s = ",".join(str(p) for p in f.pagesFound) or "—"
+ pages_s = ",".join(str(p) for p in f.pages) or "—"
judge_s = f.judge.status.value if f.judge else "—"
- fv = f.field_validation
+ fv = f.validation
valid_s = "OK" if fv.valid else "BAD"
out.append(
- f" {f.fieldName:25s} {value_s:40s} {f.confidence:.2f} "
+ f" {f.name:25s} {value_s:40s} {f.confidence:.2f} "
f"{pages_s:>4s} {judge_s:10s} {valid_s:6s}"
)
if fv.errors:
@@ -223,19 +223,20 @@ def _render(result, request) -> str:
out.append("VISUAL AUTHENTICITY")
for v in doc.authenticity.visual:
badge = "PASS" if v.passed else "FAIL"
- out.append(f" - {v.name:30s} {badge:5s} conf={v.confidence:.2f} -- {v.notes[:60]}")
+ out.append(f" - {v.name:30s} {badge:5s} conf={v.confidence:.2f} -- {(v.notes or '')[:60]}")
out.append("")
if result.rule_results:
out.append("-" * 70)
out.append("BUSINESS RULES")
for r in result.rule_results:
- out.append(f" - {r.rule_id:25s} -> {r.output:10s} {r.summary[:80]}")
+ summary = r.summary or ""
+ out.append(f" - {r.rule_id:25s} -> {r.output:10s} {summary[:80]}")
out.append("")
- if result.pipeline_errors:
+ if result.pipeline.errors:
out.append("PIPELINE ERRORS:")
- out.append(json.dumps(result.pipeline_errors, indent=2))
+ out.append(json.dumps([e.model_dump() for e in result.pipeline.errors], indent=2))
out.append("=" * 70)
return "\n".join(out)
@@ -259,20 +260,18 @@ def test_real_claude_extraction_with_rules() -> None:
pytest.skip(f"Sample PDF {PDF_PATH} not found locally.")
pdf_bytes = PDF_PATH.read_bytes()
- doc_spec = DocSpec(
- docType=DocType(
- documentType=_DOC_TYPE,
- description="Escritura notarial de poderes (Spanish notarial power of attorney)",
- country="ES",
- ),
- fieldGroups=[
+ doc_spec = DocumentTypeSpec(
+ id=_DOC_TYPE,
+ description="Escritura notarial de poderes (Spanish notarial power of attorney)",
+ country="ES",
+ field_groups=[
FieldGroup(
- fieldGroupName="otorgamiento",
- fieldGroupDesc="Datos del otorgamiento",
- fieldGroupFields=_FIELD_SPECS,
+ name="otorgamiento",
+ description="Datos del otorgamiento",
+ fields=_FIELDS,
)
],
- validators=ValidatorsSpec(visual=_VISUAL_VALIDATORS),
+ visual_checks=_VISUAL_CHECKS,
)
request = ExtractionRequest(
@@ -281,14 +280,14 @@ def test_real_claude_extraction_with_rules() -> None:
"Extract the canonical fields, verify the notary's signature is "
"present, and evaluate whether the document is complete and recent."
),
- documents=[
- DocumentInput(
+ files=[
+ FileInput(
filename=PDF_PATH.name,
content_base64=base64.b64encode(pdf_bytes).decode("ascii"),
content_type="application/pdf",
)
],
- docs=[doc_spec],
+ document_types=[doc_spec],
rules=_RULES,
options=ExtractionOptions(
model=MODEL,
@@ -314,39 +313,43 @@ def test_real_claude_extraction_with_rules() -> None:
assert result.files[0].page_count >= 1
assert len(result.documents) == 1
doc = result.documents[0]
- assert doc.document_type == _DOC_TYPE
+ assert doc.type == _DOC_TYPE
assert doc.missing is False
- fields = doc.fields[0].fieldGroupFields
- expected_names = {fs.fieldName for fs in _FIELD_SPECS}
- assert {f.fieldName for f in fields} == expected_names
+ fields = doc.field_groups[0].fields
+ expected_names = {f.name for f in _FIELDS}
+ assert {f.name for f in fields} == expected_names
- located = [f for f in fields if f.fieldValueFound is not None]
+ located = [f for f in fields if f.value is not None]
assert len(located) >= 4, "Expected most fields located"
# Every located field must carry a non-empty bbox AND a page.
for f in located:
- assert f.pagesFound, f"Located field {f.fieldName!r} has no pages"
- assert all(p >= 1 for p in f.pagesFound)
+ assert f.pages, f"Located field {f.name!r} has no pages"
+ assert all(p >= 1 for p in f.pages)
+ assert f.bbox is not None
assert f.bbox.xmax > f.bbox.xmin and f.bbox.ymax > f.bbox.ymin, (
- f"Located field {f.fieldName!r} has degenerate bbox"
+ f"Located field {f.name!r} has degenerate bbox"
)
# Judge must have stamped a verdict on every located field.
for f in located:
- assert f.judge.status.value in {"PASS", "FAIL", "UNCERTAIN"}
+ assert f.judge.status.value in {"pass", "fail", "uncertain"}
- # Visual authenticity must have one outcome per requested validator.
- assert len(doc.authenticity.visual) == len(_VISUAL_VALIDATORS)
+ # Visual authenticity must have one outcome per requested check.
+ assert len(doc.authenticity.visual) == len(_VISUAL_CHECKS)
by_name = {v.name: v for v in doc.authenticity.visual}
- assert set(by_name.keys()) == {v.name for v in _VISUAL_VALIDATORS}
+ assert set(by_name.keys()) == {v.name for v in _VISUAL_CHECKS}
# Rule engine must have evaluated every rule.
assert len(result.rule_results) == len(_RULES)
by_rule = {r.rule_id: r for r in result.rule_results}
assert set(by_rule.keys()) == {r.id for r in _RULES}
for r in result.rule_results:
- assert r.output in {"true", "false", "unknown", ""} or r.output.lower() in {"true", "false"}
+ assert r.output in {"true", "false", "unknown", ""} or r.output.lower() in {
+ "true",
+ "false",
+ }
async def _run_via_di(request: ExtractionRequest):
diff --git a/tests/unit/test_bbox_refine_worker.py b/tests/unit/test_bbox_refine_worker.py
index 4643e0b..7401a3f 100644
--- a/tests/unit/test_bbox_refine_worker.py
+++ b/tests/unit/test_bbox_refine_worker.py
@@ -28,9 +28,9 @@
from flydocs.core.services.binary.pdf_guard import PdfGuard
from flydocs.core.services.workers.bbox_refine_worker import BboxRefineWorker
from flydocs.interfaces.dtos.bbox import BboxSource, BoundingBox
-from flydocs.interfaces.dtos.extract import ExtractedDocument, ExtractionResult
+from flydocs.interfaces.dtos.extract import Document, ExtractionResult, PipelineMeta
from flydocs.interfaces.dtos.field import ExtractedField, ExtractedFieldGroup
-from flydocs.interfaces.enums.job_status import JobStatus
+from flydocs.interfaces.enums.extraction_status import ExtractionStatus
def _real_pdf() -> bytes:
@@ -44,23 +44,23 @@ def _real_pdf() -> bytes:
def _result_with_field(value: str) -> ExtractionResult:
field_ = ExtractedField(
- fieldName="customer_name",
- fieldValueFound=value,
- pagesFound=[1],
+ name="customer_name",
+ value=value,
+ pages=[1],
bbox=BoundingBox(xmin=0.05, ymin=0.05, xmax=0.95, ymax=0.95),
)
- group = ExtractedFieldGroup(fieldGroupName="customer", fieldGroupFields=[field_])
- doc = ExtractedDocument(
- document_type="invoice",
+ group = ExtractedFieldGroup(name="customer", fields=[field_])
+ doc = Document(
+ type="invoice",
pages=[1],
- fields=[group],
+ field_groups=[group],
source_file="invoice.pdf",
)
return ExtractionResult(
- request_id="00000000-0000-0000-0000-000000000001",
+ id="ext_RESULT0000000000000000000000",
+ files=[],
documents=[doc],
- model="anthropic:claude-sonnet-4-6",
- latency_ms=1000,
+ pipeline=PipelineMeta(model="anthropic:claude-sonnet-4-6", latency_ms=1000),
)
@@ -68,70 +68,74 @@ def _result_with_field(value: str) -> ExtractionResult:
@dataclass
-class _StubJob:
- id: str = "job-1"
- status: str = JobStatus.PARTIAL_SUCCEEDED.value
+class _StubExtraction:
+ id: str = "ext_TEST00000000000000000000001"
+ status: str = ExtractionStatus.SUCCEEDED.value
filename: str = "invoice.pdf"
schema_json: dict[str, Any] = field(default_factory=dict)
options_json: dict[str, Any] = field(default_factory=dict)
result_json: dict[str, Any] = field(default_factory=dict)
metadata_json: dict[str, Any] = field(default_factory=dict)
callback_url: str | None = None
- bbox_refine_status: str | None = "pending"
- bbox_refine_attempts: int = 0
+ post_processing_bbox_status: str | None = "pending"
+ post_processing_bbox_attempts: int = 0
+ post_processing_bbox_started_at: datetime | None = None
+ post_processing_bbox_finished_at: datetime | None = None
+ post_processing_bbox_error_code: str | None = None
+ post_processing_bbox_error_message: str | None = None
+ error_code: str | None = None
+ error_message: str | None = None
+ attempts: int = 0
started_at: datetime | None = None
finished_at: datetime | None = None
- created_at: datetime = field(default_factory=lambda: datetime.now(UTC))
+ submitted_at: datetime = field(default_factory=lambda: datetime.now(UTC))
class _StubRepo:
- def __init__(self, job: _StubJob) -> None:
- self.job = job
+ def __init__(self, ext: _StubExtraction) -> None:
+ self.ext = ext
self.calls: list[tuple[str, dict[str, Any]]] = []
- async def get(self, job_id: str) -> _StubJob | None:
- return self.job if self.job.id == job_id else None
+ async def get(self, ext_id: str) -> _StubExtraction | None:
+ return self.ext if self.ext.id == ext_id else None
- async def mark_bbox_refining(self, job_id: str, *, lease_seconds: int) -> _StubJob | None:
- # Mirror the production semantics: only claim PARTIAL_SUCCEEDED
- # (or stale REFINING_BBOXES); return None otherwise.
- if self.job.status not in (
- JobStatus.PARTIAL_SUCCEEDED.value,
- JobStatus.REFINING_BBOXES.value,
- ):
+ async def claim_bbox_refinement(self, ext_id: str, *, lease_seconds: int) -> _StubExtraction | None:
+ # Production semantics: claim only when main is succeeded AND the
+ # sub-status is pending (or stale running).
+ self.calls.append(("claim_bbox_refinement", {"ext_id": ext_id, "lease_seconds": lease_seconds}))
+ if self.ext.status != ExtractionStatus.SUCCEEDED.value:
+ return None
+ if self.ext.post_processing_bbox_status not in ("pending", "running"):
return None
- self.job.status = JobStatus.REFINING_BBOXES.value
- self.job.bbox_refine_status = "running"
- self.job.bbox_refine_attempts = (self.job.bbox_refine_attempts or 0) + 1
- self.calls.append(("mark_bbox_refining", {"job_id": job_id, "lease_seconds": lease_seconds}))
- return self.job
-
- async def mark_bbox_refined(self, job_id: str, *, result: dict[str, Any]) -> _StubJob | None:
- self.job.status = JobStatus.SUCCEEDED.value
- self.job.bbox_refine_status = "succeeded"
- self.job.result_json = result
- self.calls.append(("mark_bbox_refined", {"job_id": job_id}))
- return self.job
-
- async def mark_bbox_refine_failed(self, job_id: str, *, code: str, message: str) -> _StubJob | None:
- self.job.status = JobStatus.PARTIAL_SUCCEEDED.value
- self.job.bbox_refine_status = "failed"
- self.calls.append(("mark_bbox_refine_failed", {"code": code, "message": message}))
- return self.job
-
- async def update(self, job_id: str, **changes: Any) -> _StubJob | None:
+ self.ext.post_processing_bbox_status = "running"
+ self.ext.post_processing_bbox_attempts = (self.ext.post_processing_bbox_attempts or 0) + 1
+ return self.ext
+
+ async def complete_bbox_refinement(
+ self, ext_id: str, *, result: dict[str, Any]
+ ) -> _StubExtraction | None:
+ self.ext.post_processing_bbox_status = "succeeded"
+ self.ext.result_json = result
+ self.calls.append(("complete_bbox_refinement", {"ext_id": ext_id}))
+ return self.ext
+
+ async def fail_bbox_refinement(self, ext_id: str, *, code: str, message: str) -> _StubExtraction | None:
+ self.ext.post_processing_bbox_status = "failed"
+ self.calls.append(("fail_bbox_refinement", {"code": code, "message": message}))
+ return self.ext
+
+ async def update(self, ext_id: str, **changes: Any) -> _StubExtraction | None:
for k, v in changes.items():
- setattr(self.job, k, v)
+ setattr(self.ext, k, v)
self.calls.append(("update", changes))
- return self.job
+ return self.ext
- async def requeue_bbox_refine(self, job_id: str) -> _StubJob | None:
- if self.job.status != JobStatus.REFINING_BBOXES.value:
+ async def requeue_bbox_refinement(self, ext_id: str) -> _StubExtraction | None:
+ if self.ext.post_processing_bbox_status != "running":
return None
- self.job.status = JobStatus.PARTIAL_SUCCEEDED.value
- self.job.bbox_refine_status = "pending"
- self.calls.append(("requeue_bbox_refine", {"job_id": job_id}))
- return self.job
+ self.ext.post_processing_bbox_status = "pending"
+ self.calls.append(("requeue_bbox_refinement", {"ext_id": ext_id}))
+ return self.ext
class _StubPublisher:
@@ -185,11 +189,11 @@ def _make_worker(repo: _StubRepo, publisher: _StubPublisher, webhook: _StubWebho
@pytest.mark.asyncio
-async def test_grounds_partial_succeeded_job_and_transitions_to_succeeded() -> None:
+async def test_grounds_succeeded_extraction_and_marks_bbox_leg_succeeded() -> None:
pdf = _real_pdf()
- job = _StubJob(
+ ext = _StubExtraction(
schema_json={
- "documents": [
+ "files": [
{
"filename": "invoice.pdf", # matches result.documents[0].source_file
"content_base64": base64.b64encode(pdf).decode(),
@@ -199,45 +203,54 @@ async def test_grounds_partial_succeeded_job_and_transitions_to_succeeded() -> N
},
result_json=_result_with_field("Acme Corporation").model_dump(mode="json", by_alias=True),
)
- repo = _StubRepo(job)
+ repo = _StubRepo(ext)
publisher = _StubPublisher()
webhook = _StubWebhook()
worker = _make_worker(repo, publisher, webhook)
- await worker._process(job.id)
+ await worker._process(ext.id)
- assert job.status == JobStatus.SUCCEEDED.value
- assert job.bbox_refine_status == "succeeded"
- assert [name for name, _ in repo.calls] == ["mark_bbox_refining", "mark_bbox_refined"]
- # No webhook delivered because the stub job has no callback_url.
+ # Main status stays succeeded; only the bbox sub-status moves.
+ assert ext.status == ExtractionStatus.SUCCEEDED.value
+ assert ext.post_processing_bbox_status == "succeeded"
+ assert [name for name, _ in repo.calls] == [
+ "claim_bbox_refinement",
+ "complete_bbox_refinement",
+ ]
+ # No webhook delivered because the stub has no callback_url.
assert webhook.delivered == []
# No retry was scheduled.
assert publisher.published == []
# The refined result should now carry source=pdf_text on the field.
- refined = ExtractionResult.model_validate(job.result_json)
- field_ = refined.documents[0].fields[0].fieldGroupFields[0]
+ refined = ExtractionResult.model_validate(ext.result_json)
+ field_ = refined.documents[0].field_groups[0].fields[0]
assert field_.bbox.source == BboxSource.PDF_TEXT
@pytest.mark.asyncio
-async def test_skips_jobs_not_in_partial_succeeded() -> None:
- job = _StubJob(status=JobStatus.SUCCEEDED.value, result_json={})
- repo = _StubRepo(job)
+async def test_skips_extractions_whose_bbox_leg_is_not_claimable() -> None:
+ """Main status succeeded but bbox sub-status already terminal -> no-op."""
+ ext = _StubExtraction(
+ status=ExtractionStatus.SUCCEEDED.value,
+ post_processing_bbox_status="succeeded", # already done
+ result_json={},
+ )
+ repo = _StubRepo(ext)
publisher = _StubPublisher()
webhook = _StubWebhook()
worker = _make_worker(repo, publisher, webhook)
- await worker._process(job.id)
+ await worker._process(ext.id)
- # No state transitions, no retries, no webhooks.
- assert repo.calls == []
+ # claim_bbox_refinement got called but returned None -- no further work.
+ assert [name for name, _ in repo.calls] == ["claim_bbox_refinement"]
assert webhook.delivered == []
assert publisher.published == []
@pytest.mark.asyncio
-async def test_drops_unknown_job_id() -> None:
- repo = _StubRepo(_StubJob(id="other"))
+async def test_drops_unknown_extraction_id() -> None:
+ repo = _StubRepo(_StubExtraction(id="other"))
publisher = _StubPublisher()
webhook = _StubWebhook()
worker = _make_worker(repo, publisher, webhook)
@@ -251,20 +264,22 @@ async def test_drops_unknown_job_id() -> None:
@pytest.mark.asyncio
async def test_permanent_error_marks_failed_no_republish() -> None:
- # Empty schema_json triggers a permanent ValueError ("missing 'documents'").
- job = _StubJob(
+ # Empty schema_json triggers a permanent ValueError ("missing 'files'").
+ ext = _StubExtraction(
schema_json={},
result_json=_result_with_field("Acme").model_dump(mode="json", by_alias=True),
)
- repo = _StubRepo(job)
+ repo = _StubRepo(ext)
publisher = _StubPublisher()
webhook = _StubWebhook()
worker = _make_worker(repo, publisher, webhook)
- await worker._process(job.id)
+ await worker._process(ext.id)
- assert job.status == JobStatus.PARTIAL_SUCCEEDED.value
- assert job.bbox_refine_status == "failed"
+ # Main status untouched.
+ assert ext.status == ExtractionStatus.SUCCEEDED.value
+ # Bbox leg marked failed.
+ assert ext.post_processing_bbox_status == "failed"
names = [name for name, _ in repo.calls]
- assert "mark_bbox_refine_failed" in names
+ assert "fail_bbox_refinement" in names
assert publisher.published == [] # never republish on permanent
diff --git a/tests/unit/test_bbox_refiner.py b/tests/unit/test_bbox_refiner.py
index 29c05f3..9b4cad6 100644
--- a/tests/unit/test_bbox_refiner.py
+++ b/tests/unit/test_bbox_refiner.py
@@ -51,12 +51,12 @@ def refiner() -> BboxRefiner:
async def test_grounds_simple_field_against_pdf_text_layer(refiner: BboxRefiner) -> None:
pdf = _make_pdf(["Customer Name: Acme Corporation Madrid"])
field = ExtractedField(
- fieldName="customer_name",
- fieldValueFound="Acme Corporation",
- pagesFound=[1],
+ name="customer_name",
+ value="Acme Corporation",
+ pages=[1],
bbox=_llm_bbox(),
)
- group = ExtractedFieldGroup(fieldGroupName="customer", fieldGroupFields=[field])
+ group = ExtractedFieldGroup(name="customer", fields=[field])
counters = await refiner.refine(
document_bytes=pdf,
media_type="application/pdf",
@@ -65,6 +65,7 @@ async def test_grounds_simple_field_against_pdf_text_layer(refiner: BboxRefiner)
)
assert counters.fields_seen == 1
assert counters.grounded_pdf_text == 1
+ assert field.bbox is not None
assert field.bbox.source == BboxSource.PDF_TEXT
assert field.bbox.refinement_confidence is not None
assert field.bbox.refinement_confidence >= 0.85
@@ -76,12 +77,12 @@ async def test_grounds_simple_field_against_pdf_text_layer(refiner: BboxRefiner)
async def test_keeps_llm_bbox_for_unfindable_value(refiner: BboxRefiner) -> None:
pdf = _make_pdf(["totally different text"])
field = ExtractedField(
- fieldName="customer_name",
- fieldValueFound="Banco Santander S.A.",
- pagesFound=[1],
+ name="customer_name",
+ value="Banco Santander S.A.",
+ pages=[1],
bbox=_llm_bbox(),
)
- group = ExtractedFieldGroup(fieldGroupName="customer", fieldGroupFields=[field])
+ group = ExtractedFieldGroup(name="customer", fields=[field])
counters = await refiner.refine(
document_bytes=pdf,
media_type="application/pdf",
@@ -90,6 +91,7 @@ async def test_keeps_llm_bbox_for_unfindable_value(refiner: BboxRefiner) -> None
)
assert counters.kept_llm == 1
assert counters.grounded_pdf_text == 0
+ assert field.bbox is not None
assert field.bbox.source == BboxSource.LLM
assert field.bbox.refinement_confidence is None
# Original LLM coordinates are preserved.
@@ -99,14 +101,15 @@ async def test_keeps_llm_bbox_for_unfindable_value(refiner: BboxRefiner) -> None
@pytest.mark.asyncio
async def test_skips_empty_field_value(refiner: BboxRefiner) -> None:
+ """Fields with ``value=None`` are seen but skipped -- no grounding counters touched."""
pdf = _make_pdf(["any content"])
field = ExtractedField(
- fieldName="missing_field",
- fieldValueFound=None,
- pagesFound=[],
- bbox=BoundingBox.empty(),
+ name="missing_field",
+ value=None,
+ pages=[],
+ bbox=None,
)
- group = ExtractedFieldGroup(fieldGroupName="g", fieldGroupFields=[field])
+ group = ExtractedFieldGroup(name="g", fields=[field])
counters = await refiner.refine(
document_bytes=pdf,
media_type="application/pdf",
@@ -116,20 +119,18 @@ async def test_skips_empty_field_value(refiner: BboxRefiner) -> None:
assert counters.fields_seen == 1
assert counters.kept_llm == 0
assert counters.grounded_pdf_text == 0
- # Empty placeholder keeps source=NONE
- assert field.bbox.source == BboxSource.NONE
+ # Field bbox stays None -- v1 represents 'no bbox' as null.
+ assert field.bbox is None
@pytest.mark.asyncio
async def test_recurses_into_array_field_rows(refiner: BboxRefiner) -> None:
pdf = _make_pdf(["Items list", "Apple 100", "Banana 200"])
- apple_qty = ExtractedField(fieldName="qty", fieldValueFound=100, pagesFound=[1], bbox=_llm_bbox())
- apple_name = ExtractedField(fieldName="name", fieldValueFound="Apple", pagesFound=[1], bbox=_llm_bbox())
- apple_row = ExtractedField(
- fieldName="row", fieldValueFound=[apple_name, apple_qty], pagesFound=[1], bbox=_llm_bbox()
- )
- items = ExtractedField(fieldName="items", fieldValueFound=[apple_row], pagesFound=[1], bbox=_llm_bbox())
- group = ExtractedFieldGroup(fieldGroupName="invoice", fieldGroupFields=[items])
+ apple_qty = ExtractedField(name="qty", value=100, pages=[1], bbox=_llm_bbox())
+ apple_name = ExtractedField(name="name", value="Apple", pages=[1], bbox=_llm_bbox())
+ apple_row = ExtractedField(name="row", value=[apple_name, apple_qty], pages=[1], bbox=_llm_bbox())
+ items = ExtractedField(name="items", value=[apple_row], pages=[1], bbox=_llm_bbox())
+ group = ExtractedFieldGroup(name="invoice", fields=[items])
counters = await refiner.refine(
document_bytes=pdf,
media_type="application/pdf",
@@ -140,6 +141,7 @@ async def test_recurses_into_array_field_rows(refiner: BboxRefiner) -> None:
assert counters.fields_seen == 2
# Both leaves should ground.
assert counters.grounded_pdf_text >= 1
+ assert apple_name.bbox is not None
assert apple_name.bbox.source == BboxSource.PDF_TEXT
diff --git a/tests/unit/test_bbox_validator.py b/tests/unit/test_bbox_validator.py
index 4db7348..330c0b0 100644
--- a/tests/unit/test_bbox_validator.py
+++ b/tests/unit/test_bbox_validator.py
@@ -4,7 +4,7 @@
Verifies the validator stamps the correct ``BboxQuality`` verdict and a
score in ``[0, 1]`` for every recognised pattern:
-* ``empty`` -- ``None`` value or zero-area placeholder
+* ``None`` -- value-less or zero-area placeholder bboxes get cleared
* ``invalid`` -- corners outside ``[0, 1]`` (caught by pydantic) or
degenerate after construction
* ``suspicious``-- ~full-page boxes (LLM hallucinated a generic region)
@@ -24,7 +24,7 @@ def _bbox(xmin: float, ymin: float, xmax: float, ymax: float) -> BoundingBox:
def _group(field: ExtractedField) -> ExtractedFieldGroup:
- return ExtractedFieldGroup(fieldGroupName="g", fieldGroupFields=[field])
+ return ExtractedFieldGroup(name="g", fields=[field])
def _validate(field: ExtractedField) -> ExtractedField:
@@ -33,66 +33,62 @@ def _validate(field: ExtractedField) -> ExtractedField:
def test_good_bbox_for_plausible_text_line() -> None:
- field = ExtractedField(fieldName="name", fieldValueFound="John Doe", bbox=_bbox(0.10, 0.10, 0.30, 0.13))
+ field = ExtractedField(name="name", value="John Doe", bbox=_bbox(0.10, 0.10, 0.30, 0.13))
_validate(field)
+ assert field.bbox is not None
assert field.bbox.quality is BboxQuality.GOOD
assert field.bbox.quality_score > 0.8
def test_suspicious_when_bbox_covers_almost_full_page() -> None:
- field = ExtractedField(
- fieldName="title", fieldValueFound="Some title", bbox=_bbox(0.01, 0.01, 0.99, 0.99)
- )
+ field = ExtractedField(name="title", value="Some title", bbox=_bbox(0.01, 0.01, 0.99, 0.99))
_validate(field)
+ assert field.bbox is not None
assert field.bbox.quality is BboxQuality.SUSPICIOUS
assert field.bbox.quality_score <= 0.3
def test_poor_when_bbox_is_microscopic() -> None:
# Area = 1e-6 (well below _AREA_MIN = 5e-5).
- field = ExtractedField(fieldName="x", fieldValueFound="x", bbox=_bbox(0.5, 0.5, 0.501, 0.501))
+ field = ExtractedField(name="x", value="x", bbox=_bbox(0.5, 0.5, 0.501, 0.501))
_validate(field)
+ assert field.bbox is not None
assert field.bbox.quality is BboxQuality.POOR
def test_poor_when_aspect_ratio_extreme() -> None:
# 0.6 wide × 0.01 tall -> aspect 60, beyond _ASPECT_MAX = 30.
- field = ExtractedField(fieldName="bar", fieldValueFound="bar", bbox=_bbox(0.1, 0.5, 0.7, 0.51))
+ field = ExtractedField(name="bar", value="bar", bbox=_bbox(0.1, 0.5, 0.7, 0.51))
_validate(field)
+ assert field.bbox is not None
assert field.bbox.quality is BboxQuality.POOR
-def test_empty_when_field_value_is_none() -> None:
- field = ExtractedField(fieldName="missing", fieldValueFound=None, bbox=_bbox(0.1, 0.1, 0.2, 0.13))
- _validate(field)
- assert field.bbox.quality is BboxQuality.EMPTY
- assert field.bbox.quality_score == 0.0
-
-
-def test_empty_when_bbox_is_zero_placeholder() -> None:
- field = ExtractedField(fieldName="placeholder", fieldValueFound="x", bbox=BoundingBox.empty())
+def test_bbox_cleared_when_field_value_is_none() -> None:
+ """v1 represents 'no bbox' as ``bbox=None`` -- there's no EMPTY verdict."""
+ field = ExtractedField(name="missing", value=None, bbox=_bbox(0.1, 0.1, 0.2, 0.13))
_validate(field)
- assert field.bbox.quality is BboxQuality.EMPTY
- assert field.bbox.quality_score == 0.0
+ assert field.bbox is None
def test_score_penalises_boxes_hugging_all_edges() -> None:
"""A box that hugs every page edge gets a lower score than an interior one."""
- hugging = ExtractedField(fieldName="a", fieldValueFound="a", bbox=_bbox(0.0, 0.0, 0.4, 0.05))
- interior = ExtractedField(fieldName="b", fieldValueFound="b", bbox=_bbox(0.2, 0.2, 0.6, 0.25))
+ hugging = ExtractedField(name="a", value="a", bbox=_bbox(0.0, 0.0, 0.4, 0.05))
+ interior = ExtractedField(name="b", value="b", bbox=_bbox(0.2, 0.2, 0.6, 0.25))
_validate(hugging)
_validate(interior)
+ assert hugging.bbox is not None and interior.bbox is not None
assert interior.bbox.quality_score > hugging.bbox.quality_score
def test_recurses_into_array_field_rows() -> None:
"""Nested ExtractedFields inside an array should also get stamped."""
- child = ExtractedField(fieldName="line", fieldValueFound="42", bbox=_bbox(0.1, 0.2, 0.3, 0.22))
+ child = ExtractedField(name="line", value="42", bbox=_bbox(0.1, 0.2, 0.3, 0.22))
parent = ExtractedField(
- fieldName="items",
- fieldValueFound=[child],
+ name="items",
+ value=[child],
bbox=_bbox(0.1, 0.1, 0.9, 0.3),
)
_validate(parent)
- assert parent.bbox.quality is BboxQuality.GOOD
- assert child.bbox.quality is BboxQuality.GOOD
+ assert parent.bbox is not None and parent.bbox.quality is BboxQuality.GOOD
+ assert child.bbox is not None and child.bbox.quality is BboxQuality.GOOD
diff --git a/tests/unit/test_document_classifier.py b/tests/unit/test_document_classifier.py
index ab4653c..8657051 100644
--- a/tests/unit/test_document_classifier.py
+++ b/tests/unit/test_document_classifier.py
@@ -27,8 +27,8 @@
DocumentClassifier,
)
from flydocs.core.services.classification import classifier as classifier_module
-from flydocs.interfaces.dtos.doc import DocSpec, DocType, ValidatorsSpec
-from flydocs.interfaces.dtos.field import FieldGroup, FieldSpec
+from flydocs.interfaces.dtos.document_type import DocumentTypeSpec
+from flydocs.interfaces.dtos.field import Field, FieldGroup
from flydocs.interfaces.enums.field_type import FieldType
@@ -41,18 +41,19 @@ def _template() -> PromptTemplate:
)
-def _passport_spec() -> DocSpec:
- return DocSpec(
- docType=DocType(documentType="passport", description="x", country="ES"),
- fieldGroups=[
+def _passport_spec() -> DocumentTypeSpec:
+ return DocumentTypeSpec(
+ id="passport",
+ description="x",
+ country="ES",
+ field_groups=[
FieldGroup(
- fieldGroupName="g",
- fieldGroupFields=[
- FieldSpec(fieldName="a", fieldType=FieldType.STRING),
+ name="g",
+ fields=[
+ Field(name="a", type=FieldType.STRING),
],
)
],
- validators=ValidatorsSpec(),
)
diff --git a/tests/unit/test_document_splitter.py b/tests/unit/test_document_splitter.py
index c33d635..8bcfd7c 100644
--- a/tests/unit/test_document_splitter.py
+++ b/tests/unit/test_document_splitter.py
@@ -27,8 +27,8 @@
SplitResult,
)
from flydocs.core.services.splitting import splitter as splitter_module
-from flydocs.interfaces.dtos.doc import DocSpec, DocType, ValidatorsSpec
-from flydocs.interfaces.dtos.field import FieldGroup, FieldSpec
+from flydocs.interfaces.dtos.document_type import DocumentTypeSpec
+from flydocs.interfaces.dtos.field import Field, FieldGroup
from flydocs.interfaces.enums.field_type import FieldType
@@ -41,16 +41,17 @@ def _template() -> PromptTemplate:
)
-def _spec(doctype: str = "deed") -> DocSpec:
- return DocSpec(
- docType=DocType(documentType=doctype, description="x", country="ES"),
- fieldGroups=[
+def _spec(doctype: str = "deed") -> DocumentTypeSpec:
+ return DocumentTypeSpec(
+ id=doctype,
+ description="x",
+ country="ES",
+ field_groups=[
FieldGroup(
- fieldGroupName="g",
- fieldGroupFields=[FieldSpec(fieldName="a", fieldType=FieldType.STRING)],
+ name="g",
+ fields=[Field(name="a", type=FieldType.STRING)],
)
],
- validators=ValidatorsSpec(),
)
diff --git a/tests/unit/test_entity_resolution_transformer.py b/tests/unit/test_entity_resolution_transformer.py
index f46de40..fcd2372 100644
--- a/tests/unit/test_entity_resolution_transformer.py
+++ b/tests/unit/test_entity_resolution_transformer.py
@@ -25,28 +25,28 @@
def _row(values: dict[str, str]) -> ExtractedField:
"""Build one persona row from a flat dict of sub-field values."""
return ExtractedField(
- fieldName="row",
- fieldValueFound=[ExtractedField(fieldName=k, fieldValueFound=v) for k, v in values.items()],
+ name="row",
+ value=[ExtractedField(name=k, value=v) for k, v in values.items()],
)
def _personas_group(rows: list[ExtractedField]) -> ExtractedFieldGroup:
return ExtractedFieldGroup(
- fieldGroupName="personas",
- fieldGroupFields=[ExtractedField(fieldName="personas", fieldValueFound=rows)],
+ name="personas",
+ fields=[ExtractedField(name="personas", value=rows)],
)
def _row_names(group: ExtractedFieldGroup) -> list[str]:
"""Pull the ``nombre`` value out of each row for assertions."""
out: list[str] = []
- for f in group.fieldGroupFields:
- if not isinstance(f.fieldValueFound, list):
+ for f in group.fields:
+ if not isinstance(f.value, list):
continue
- for row in f.fieldValueFound:
- for sub in row.fieldValueFound or []:
- if sub.fieldName == "nombre":
- out.append(sub.fieldValueFound) # type: ignore[arg-type]
+ for row in f.value:
+ for sub in row.value or []:
+ if sub.name == "nombre":
+ out.append(sub.value) # type: ignore[arg-type]
break
return out
@@ -124,7 +124,7 @@ def test_output_group_preserves_original() -> None:
EntityResolutionTransformer().apply(t, groups)
assert len(groups) == 2
- assert {g.fieldGroupName for g in groups} == {"personas", "personas_normalized"}
+ assert {g.name for g in groups} == {"personas", "personas_normalized"}
assert len(_row_names(groups[0])) == 2 # original untouched
assert len(_row_names(groups[1])) == 1 # dedupe applied
diff --git a/tests/unit/test_event_envelopes.py b/tests/unit/test_event_envelopes.py
index f54edda..7e0fd29 100644
--- a/tests/unit/test_event_envelopes.py
+++ b/tests/unit/test_event_envelopes.py
@@ -1,17 +1,17 @@
# Copyright 2026 Firefly Software Solutions Inc
-"""Typed EDA event envelopes.
+"""Unified :class:`EventEnvelope` covering EDA + webhook deliveries.
Coverage:
-1. ``IDPJobSubmittedEvent`` defaults populate ``event_id`` (UUID),
- ``occurred_at`` (UTC datetime), ``version``, and ``event_type``.
-2. ``envelope_for_publish`` produces a JSON-friendly dict suitable
- for ``EventPublisher.publish(payload=...)`` — datetimes serialise
- to ISO strings, the event id round-trips, and the enum discriminator
- matches the constant pyfly will route on.
-3. The discriminated union (``IDPEvent``) round-trips through pydantic
- from raw dicts the EDA bus would deliver, so the consumer-side
- parse is loss-free.
+1. Defaults populate ``event_id`` (UUID4), ``occurred_at`` (UTC datetime) and
+ ``version``; the envelope accepts any of the four canonical event-type strings.
+2. ``envelope_for_publish`` produces a JSON-friendly dict suitable for
+ ``EventPublisher.publish(payload=...)``: datetimes serialise to ISO
+ strings, the event id round-trips, and enums become their string values.
+3. The envelope round-trips through pydantic from raw dicts the EDA bus
+ would deliver, so the consumer-side parse is loss-free.
+4. The four event-type constants carry the dotted snake_case form
+ (the deliberate exception to the flat-snake convention).
"""
from __future__ import annotations
@@ -19,89 +19,154 @@
import uuid
from datetime import UTC, datetime
-from pydantic import TypeAdapter
+import pytest
from flydocs.interfaces.dtos.event import (
- IDPBboxRefineCompletedEvent,
- IDPBboxRefineRequestedEvent,
- IDPEvent,
- IDPJobCompletedEvent,
- IDPJobSubmittedEvent,
+ ALL_EVENT_TYPES,
+ EVENT_TYPE_EXTRACTION_COMPLETED,
+ EVENT_TYPE_EXTRACTION_POST_PROCESSING_COMPLETED,
+ EVENT_TYPE_EXTRACTION_POST_PROCESSING_REQUESTED,
+ EVENT_TYPE_EXTRACTION_SUBMITTED,
+ EventEnvelope,
envelope_for_publish,
)
-from flydocs.interfaces.enums.job_status import JobStatus
+from flydocs.interfaces.dtos.extract import (
+ ExtractionResult,
+ PipelineMeta,
+)
+from flydocs.interfaces.dtos.extraction import Extraction
+from flydocs.interfaces.enums.extraction_status import ExtractionStatus
+
+
+def _extraction(status: ExtractionStatus = ExtractionStatus.QUEUED) -> Extraction:
+ return Extraction(
+ id="ext_TEST00000000000000000000000",
+ status=status,
+ submitted_at=datetime(2026, 5, 15, 12, 0, 0, tzinfo=UTC),
+ )
+
+
+# ---------------------------------------------------------------------------
+# Event-type constants
+# ---------------------------------------------------------------------------
-def test_submitted_event_defaults() -> None:
- """Constructor populates id + timestamp + version + discriminator."""
- ev = IDPJobSubmittedEvent(job_id="job-1")
+def test_event_type_constants_use_dotted_snake_case() -> None:
+ """The deliberate exception to flat snake_case enums."""
+ assert EVENT_TYPE_EXTRACTION_SUBMITTED == "extraction.submitted"
+ assert EVENT_TYPE_EXTRACTION_COMPLETED == "extraction.completed"
+ assert EVENT_TYPE_EXTRACTION_POST_PROCESSING_REQUESTED == "extraction.post_processing.requested"
+ assert EVENT_TYPE_EXTRACTION_POST_PROCESSING_COMPLETED == "extraction.post_processing.completed"
+ assert set(ALL_EVENT_TYPES) == {
+ EVENT_TYPE_EXTRACTION_SUBMITTED,
+ EVENT_TYPE_EXTRACTION_COMPLETED,
+ EVENT_TYPE_EXTRACTION_POST_PROCESSING_REQUESTED,
+ EVENT_TYPE_EXTRACTION_POST_PROCESSING_COMPLETED,
+ }
+
+# ---------------------------------------------------------------------------
+# Envelope defaults
+# ---------------------------------------------------------------------------
+
+
+def test_envelope_defaults_populate_id_timestamp_version() -> None:
+ """Constructor populates id + timestamp + version."""
+ env = EventEnvelope(
+ event_type=EVENT_TYPE_EXTRACTION_SUBMITTED,
+ extraction=_extraction(),
+ )
# event_id is a valid UUID4 string.
- parsed = uuid.UUID(ev.event_id)
+ parsed = uuid.UUID(env.event_id)
assert parsed.version == 4
-
# occurred_at is timezone-aware UTC.
- assert isinstance(ev.occurred_at, datetime)
- assert ev.occurred_at.tzinfo == UTC
+ assert isinstance(env.occurred_at, datetime)
+ assert env.occurred_at.tzinfo is not None
+ # Discriminator carries the dotted snake form.
+ assert env.event_type == "extraction.submitted"
+ assert env.version == "1.0.0"
- # Type + version + attempt defaults.
- assert ev.event_type == "IDPJobSubmitted"
- assert ev.version == "1.0.0"
- assert ev.attempt == 1
+
+# ---------------------------------------------------------------------------
+# envelope_for_publish serialisation
+# ---------------------------------------------------------------------------
def test_envelope_for_publish_is_json_friendly() -> None:
- """The serialiser used by publishers produces a primitive dict."""
+ """The serialiser produces a primitive dict suitable for EventPublisher."""
occurred = datetime(2026, 5, 15, 12, 0, 0, tzinfo=UTC)
- ev = IDPJobSubmittedEvent(
- job_id="job-1",
+ env = EventEnvelope(
+ event_type=EVENT_TYPE_EXTRACTION_SUBMITTED,
occurred_at=occurred,
correlation_id="cor-42",
+ tenant_id="acme",
+ extraction=_extraction(),
)
- payload = envelope_for_publish(ev)
-
- # Discriminator is preserved verbatim — pyfly routes on this.
- assert payload["event_type"] == "IDPJobSubmitted"
- assert payload["job_id"] == "job-1"
+ payload = envelope_for_publish(env)
+ # Discriminator preserved verbatim.
+ assert payload["event_type"] == "extraction.submitted"
assert payload["correlation_id"] == "cor-42"
+ assert payload["tenant_id"] == "acme"
# Datetimes become ISO strings (mode='json').
assert payload["occurred_at"].startswith("2026-05-15T12:00:00")
# event_id is preserved.
- assert payload["event_id"] == ev.event_id
+ assert payload["event_id"] == env.event_id
+ # Nested extraction is serialised as a dict with the enum coerced to its string value.
+ assert payload["extraction"]["id"] == "ext_TEST00000000000000000000000"
+ assert payload["extraction"]["status"] == "queued"
+
+# ---------------------------------------------------------------------------
+# Round-trip across the bus
+# ---------------------------------------------------------------------------
-def test_discriminated_union_round_trips_every_type() -> None:
+
+@pytest.mark.parametrize(
+ "event_type",
+ list(ALL_EVENT_TYPES),
+)
+def test_envelope_round_trips_through_serialise_parse(event_type: str) -> None:
"""Every event type re-parses correctly from its serialised dict."""
- adapter: TypeAdapter[IDPEvent] = TypeAdapter(IDPEvent)
- events: list[IDPEvent] = [
- IDPJobSubmittedEvent(job_id="job-1"),
- IDPJobCompletedEvent(
- job_id="job-2",
- status=JobStatus.SUCCEEDED,
- started_at=datetime(2026, 5, 15, 10, 0, 0, tzinfo=UTC),
- finished_at=datetime(2026, 5, 15, 10, 5, 0, tzinfo=UTC),
- attempts=2,
- ),
- IDPBboxRefineRequestedEvent(job_id="job-3", attempt=1),
- IDPBboxRefineCompletedEvent(
- job_id="job-4",
- status="succeeded",
- attempts=1,
- ),
- ]
- for ev in events:
- raw = ev.model_dump(mode="json")
- parsed = adapter.validate_python(raw)
- assert parsed.event_type == ev.event_type
- # event_id is stable across the round-trip.
- assert parsed.event_id == ev.event_id
-
-
-def test_completed_event_serialises_status_enum() -> None:
- """JobStatus enum serialises to its string value in the payload."""
- ev = IDPJobCompletedEvent(
- job_id="job-9",
- status=JobStatus.PARTIAL_SUCCEEDED,
+ env = EventEnvelope(
+ event_type=event_type,
+ extraction=_extraction(ExtractionStatus.SUCCEEDED),
+ )
+ raw = env.model_dump(mode="json")
+ parsed = EventEnvelope.model_validate(raw)
+ assert parsed.event_type == event_type
+ assert parsed.event_id == env.event_id
+ assert parsed.extraction.id == env.extraction.id
+ assert parsed.extraction.status == ExtractionStatus.SUCCEEDED
+
+
+# ---------------------------------------------------------------------------
+# Result is populated only on success
+# ---------------------------------------------------------------------------
+
+
+def test_envelope_can_carry_full_result() -> None:
+ """``result`` is null by default; completed-success events fill it."""
+ result = ExtractionResult(
+ id="ext_TEST00000000000000000000000",
+ files=[],
+ documents=[],
+ pipeline=PipelineMeta(model="m", latency_ms=1),
+ )
+ env = EventEnvelope(
+ event_type=EVENT_TYPE_EXTRACTION_COMPLETED,
+ extraction=_extraction(ExtractionStatus.SUCCEEDED),
+ result=result,
+ )
+ payload = envelope_for_publish(env)
+ assert payload["result"]["id"] == "ext_TEST00000000000000000000000"
+ assert payload["result"]["pipeline"]["model"] == "m"
+
+
+def test_envelope_result_defaults_to_null() -> None:
+ env = EventEnvelope(
+ event_type=EVENT_TYPE_EXTRACTION_SUBMITTED,
+ extraction=_extraction(),
)
- payload = envelope_for_publish(ev)
- assert payload["status"] == "PARTIAL_SUCCEEDED"
+ assert env.result is None
+ payload = envelope_for_publish(env)
+ assert payload["result"] is None
diff --git a/tests/unit/test_extraction_job_repository.py b/tests/unit/test_extraction_repository.py
similarity index 56%
rename from tests/unit/test_extraction_job_repository.py
rename to tests/unit/test_extraction_repository.py
index f473e8e..4f7a999 100644
--- a/tests/unit/test_extraction_job_repository.py
+++ b/tests/unit/test_extraction_repository.py
@@ -1,5 +1,5 @@
# Copyright 2026 Firefly Software Solutions Inc
-""":class:`ExtractionJobRepository` -- concurrency-safety contract.
+""":class:`ExtractionRepository` -- concurrency-safety contract.
These tests exercise the atomic state-transition methods against a
real SQLite-backed engine. SQLite serialises writers at the database
@@ -18,22 +18,22 @@
import pytest
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
-from flydocs.models.entities.extraction_job import Base, ExtractionJob
-from flydocs.models.repositories import ExtractionJobRepository
+from flydocs.models.entities.extraction import Base, Extraction
+from flydocs.models.repositories import ExtractionRepository
-async def _fresh_repo() -> ExtractionJobRepository:
+async def _fresh_repo() -> ExtractionRepository:
engine = create_async_engine("sqlite+aiosqlite:///:memory:", future=True)
async with engine.begin() as conn:
await conn.run_sync(Base.metadata.create_all)
factory = async_sessionmaker(engine, expire_on_commit=False)
- return ExtractionJobRepository(factory, engine=engine)
+ return ExtractionRepository(factory, engine=engine)
-async def _seed(repo: ExtractionJobRepository, **overrides) -> ExtractionJob:
- job = ExtractionJob(
+async def _seed(repo: ExtractionRepository, **overrides) -> Extraction:
+ ext = Extraction(
idempotency_key=overrides.get("idempotency_key"),
- status=overrides.get("status", "QUEUED"),
+ status=overrides.get("status", "queued"),
filename=overrides.get("filename", "test.pdf"),
content_sha256=overrides.get("content_sha256", "0" * 64),
content_bytes=overrides.get("content_bytes", 1),
@@ -42,31 +42,31 @@ async def _seed(repo: ExtractionJobRepository, **overrides) -> ExtractionJob:
metadata_json=overrides.get("metadata_json", {}),
attempts=overrides.get("attempts", 0),
started_at=overrides.get("started_at"),
- bbox_refine_status=overrides.get("bbox_refine_status"),
- bbox_refine_started_at=overrides.get("bbox_refine_started_at"),
+ post_processing_bbox_status=overrides.get("post_processing_bbox_status"),
+ post_processing_bbox_started_at=overrides.get("post_processing_bbox_started_at"),
)
- return await repo.add(job)
+ return await repo.add(ext)
# --------------------------------------------------------------------- mark_running
@pytest.mark.asyncio
-async def test_mark_running_claims_queued_job() -> None:
+async def test_mark_running_claims_queued_extraction() -> None:
repo = await _fresh_repo()
seeded = await _seed(repo)
claimed = await repo.mark_running(seeded.id, lease_seconds=60)
assert claimed is not None
- assert claimed.status == "RUNNING"
+ assert claimed.status == "running"
assert claimed.attempts == 1
assert claimed.started_at is not None
@pytest.mark.asyncio
async def test_mark_running_rejects_already_running_with_fresh_lease() -> None:
- """Concurrent re-claim of a job whose lease hasn't expired returns None."""
+ """Concurrent re-claim of an extraction whose lease hasn't expired returns None."""
repo = await _fresh_repo()
seeded = await _seed(repo)
first = await repo.mark_running(seeded.id, lease_seconds=300)
@@ -79,30 +79,30 @@ async def test_mark_running_rejects_already_running_with_fresh_lease() -> None:
@pytest.mark.asyncio
async def test_mark_running_reclaims_stale_running_for_crash_recovery() -> None:
- """A RUNNING job with started_at past the lease window is re-claimable."""
+ """A running extraction with started_at past the lease window is re-claimable."""
repo = await _fresh_repo()
# Simulate a worker that claimed long ago and crashed.
started_ago = datetime.now(UTC) - timedelta(seconds=600)
- seeded = await _seed(repo, status="RUNNING", started_at=started_ago, attempts=1)
+ seeded = await _seed(repo, status="running", started_at=started_ago, attempts=1)
reclaimed = await repo.mark_running(seeded.id, lease_seconds=60)
assert reclaimed is not None
- assert reclaimed.status == "RUNNING"
+ assert reclaimed.status == "running"
assert reclaimed.attempts == 2 # crash-recovery bumps attempts
@pytest.mark.asyncio
async def test_mark_running_skips_cancelled_succeeded_failed() -> None:
- for terminal in ("CANCELLED", "SUCCEEDED", "FAILED"):
+ for terminal in ("cancelled", "succeeded", "failed"):
repo = await _fresh_repo()
seeded = await _seed(repo, status=terminal)
result = await repo.mark_running(seeded.id, lease_seconds=60)
- assert result is None, f"unexpectedly claimed a {terminal} job"
+ assert result is None, f"unexpectedly claimed a {terminal} extraction"
@pytest.mark.asyncio
-async def test_mark_running_returns_none_for_missing_job() -> None:
+async def test_mark_running_returns_none_for_missing_extraction() -> None:
repo = await _fresh_repo()
result = await repo.mark_running("does-not-exist", lease_seconds=60)
assert result is None
@@ -136,14 +136,14 @@ async def test_mark_cancelled_succeeds_only_for_queued() -> None:
cancelled = await repo.mark_cancelled(seeded.id)
assert cancelled is not None
- assert cancelled.status == "CANCELLED"
+ assert cancelled.status == "cancelled"
@pytest.mark.asyncio
-async def test_mark_cancelled_rejects_running_job() -> None:
+async def test_mark_cancelled_rejects_running_extraction() -> None:
"""The cancel/run race is settled by a single atomic UPDATE."""
repo = await _fresh_repo()
- seeded = await _seed(repo, status="RUNNING", started_at=datetime.now(UTC))
+ seeded = await _seed(repo, status="running", started_at=datetime.now(UTC))
result = await repo.mark_cancelled(seeded.id)
assert result is None
@@ -160,7 +160,7 @@ async def test_concurrent_cancel_vs_claim_exactly_one_wins() -> None:
repo.mark_running(seeded.id, lease_seconds=300),
)
# The two transitions have disjoint preconditions -- both target
- # QUEUED but flip the row to mutually exclusive successors. Whichever
+ # queued but flip the row to mutually exclusive successors. Whichever
# commits first locks the row in a state the other's WHERE no longer
# matches.
winners = [r for r in (cancel_result, claim_result) if r is not None]
@@ -173,13 +173,13 @@ async def test_concurrent_cancel_vs_claim_exactly_one_wins() -> None:
@pytest.mark.asyncio
-async def test_mark_succeeded_only_finalises_running_or_refining() -> None:
+async def test_mark_succeeded_only_finalises_running() -> None:
repo = await _fresh_repo()
- seeded = await _seed(repo, status="RUNNING", started_at=datetime.now(UTC))
+ seeded = await _seed(repo, status="running", started_at=datetime.now(UTC))
finalised = await repo.mark_succeeded(seeded.id, result={"ok": True})
assert finalised is not None
- assert finalised.status == "SUCCEEDED"
+ assert finalised.status == "succeeded"
assert finalised.result_json == {"ok": True}
# Second mark_succeeded is a no-op.
@@ -187,14 +187,40 @@ async def test_mark_succeeded_only_finalises_running_or_refining() -> None:
assert again is None
+@pytest.mark.asyncio
+async def test_mark_succeeded_with_request_bbox_refinement_sets_pending() -> None:
+ """mark_succeeded(request_bbox_refinement=True) flips the bbox leg to pending atomically."""
+ repo = await _fresh_repo()
+ seeded = await _seed(repo, status="running", started_at=datetime.now(UTC))
+
+ finalised = await repo.mark_succeeded(
+ seeded.id,
+ result={"ok": True},
+ request_bbox_refinement=True,
+ )
+ assert finalised is not None
+ assert finalised.status == "succeeded"
+ assert finalised.post_processing_bbox_status == "pending"
+
+
+@pytest.mark.asyncio
+async def test_mark_succeeded_without_bbox_request_leaves_bbox_status_null() -> None:
+ repo = await _fresh_repo()
+ seeded = await _seed(repo, status="running", started_at=datetime.now(UTC))
+
+ finalised = await repo.mark_succeeded(seeded.id, result={"ok": True})
+ assert finalised is not None
+ assert finalised.post_processing_bbox_status is None
+
+
@pytest.mark.asyncio
async def test_mark_failed_only_finalises_running() -> None:
repo = await _fresh_repo()
- seeded = await _seed(repo, status="RUNNING", started_at=datetime.now(UTC))
+ seeded = await _seed(repo, status="running", started_at=datetime.now(UTC))
failed = await repo.mark_failed(seeded.id, code="X", message="boom")
assert failed is not None
- assert failed.status == "FAILED"
+ assert failed.status == "failed"
# Idempotent: second call returns None instead of clobbering.
again = await repo.mark_failed(seeded.id, code="Y", message="other")
assert again is None
@@ -204,49 +230,63 @@ async def test_mark_failed_only_finalises_running() -> None:
@pytest.mark.asyncio
-async def test_mark_bbox_refining_claims_partial_succeeded() -> None:
+async def test_claim_bbox_refinement_starts_from_pending() -> None:
+ """Main status stays succeeded; the claim moves the bbox sub-status pending -> running."""
repo = await _fresh_repo()
- seeded = await _seed(repo, status="PARTIAL_SUCCEEDED", bbox_refine_status="pending")
+ seeded = await _seed(
+ repo,
+ status="succeeded",
+ post_processing_bbox_status="pending",
+ )
- claimed = await repo.mark_bbox_refining(seeded.id, lease_seconds=60)
+ claimed = await repo.claim_bbox_refinement(seeded.id, lease_seconds=60)
assert claimed is not None
- assert claimed.status == "REFINING_BBOXES"
- assert claimed.bbox_refine_status == "running"
- assert claimed.bbox_refine_attempts == 1
+ # Main status stays succeeded; only the sub-status moves.
+ assert claimed.status == "succeeded"
+ assert claimed.post_processing_bbox_status == "running"
+ assert claimed.post_processing_bbox_attempts == 1
@pytest.mark.asyncio
-async def test_mark_bbox_refining_rejects_fresh_refining_lease() -> None:
+async def test_claim_bbox_refinement_rejects_fresh_running_lease() -> None:
repo = await _fresh_repo()
- seeded = await _seed(repo, status="PARTIAL_SUCCEEDED", bbox_refine_status="pending")
- first = await repo.mark_bbox_refining(seeded.id, lease_seconds=300)
+ seeded = await _seed(
+ repo,
+ status="succeeded",
+ post_processing_bbox_status="pending",
+ )
+ first = await repo.claim_bbox_refinement(seeded.id, lease_seconds=300)
assert first is not None
- second = await repo.mark_bbox_refining(seeded.id, lease_seconds=300)
+ second = await repo.claim_bbox_refinement(seeded.id, lease_seconds=300)
assert second is None
@pytest.mark.asyncio
-async def test_mark_bbox_refining_reclaims_stale_lease() -> None:
+async def test_claim_bbox_refinement_reclaims_stale_lease() -> None:
repo = await _fresh_repo()
long_ago = datetime.now(UTC) - timedelta(seconds=600)
seeded = await _seed(
repo,
- status="REFINING_BBOXES",
- bbox_refine_status="running",
- bbox_refine_started_at=long_ago,
+ status="succeeded",
+ post_processing_bbox_status="running",
+ post_processing_bbox_started_at=long_ago,
)
- reclaimed = await repo.mark_bbox_refining(seeded.id, lease_seconds=60)
+ reclaimed = await repo.claim_bbox_refinement(seeded.id, lease_seconds=60)
assert reclaimed is not None
- assert reclaimed.bbox_refine_attempts == 1
+ assert reclaimed.post_processing_bbox_attempts == 1
@pytest.mark.asyncio
-async def test_concurrent_bbox_refining_one_winner() -> None:
+async def test_concurrent_claim_bbox_refinement_one_winner() -> None:
repo = await _fresh_repo()
- seeded = await _seed(repo, status="PARTIAL_SUCCEEDED", bbox_refine_status="pending")
+ seeded = await _seed(
+ repo,
+ status="succeeded",
+ post_processing_bbox_status="pending",
+ )
results = await asyncio.gather(
- repo.mark_bbox_refining(seeded.id, lease_seconds=300),
- repo.mark_bbox_refining(seeded.id, lease_seconds=300),
+ repo.claim_bbox_refinement(seeded.id, lease_seconds=300),
+ repo.claim_bbox_refinement(seeded.id, lease_seconds=300),
)
winners = [r for r in results if r is not None]
losers = [r for r in results if r is None]
@@ -255,50 +295,69 @@ async def test_concurrent_bbox_refining_one_winner() -> None:
@pytest.mark.asyncio
-async def test_mark_bbox_refined_only_from_refining() -> None:
+async def test_complete_bbox_refinement_only_from_running() -> None:
repo = await _fresh_repo()
now = datetime.now(UTC)
seeded = await _seed(
repo,
- status="REFINING_BBOXES",
- bbox_refine_status="running",
- bbox_refine_started_at=now,
+ status="succeeded",
+ post_processing_bbox_status="running",
+ post_processing_bbox_started_at=now,
)
- finalised = await repo.mark_bbox_refined(seeded.id, result={"grounded": True})
+ finalised = await repo.complete_bbox_refinement(seeded.id, result={"grounded": True})
assert finalised is not None
- assert finalised.status == "SUCCEEDED"
- assert finalised.bbox_refine_status == "succeeded"
+ # Main status was already succeeded; sub-status flips to succeeded.
+ assert finalised.status == "succeeded"
+ assert finalised.post_processing_bbox_status == "succeeded"
+ assert finalised.result_json == {"grounded": True}
@pytest.mark.asyncio
-async def test_requeue_bbox_refine_only_from_refining() -> None:
+async def test_requeue_bbox_refinement_only_from_running() -> None:
repo = await _fresh_repo()
now = datetime.now(UTC)
seeded = await _seed(
repo,
- status="REFINING_BBOXES",
- bbox_refine_status="running",
- bbox_refine_started_at=now,
+ status="succeeded",
+ post_processing_bbox_status="running",
+ post_processing_bbox_started_at=now,
)
- requeued = await repo.requeue_bbox_refine(seeded.id)
+ requeued = await repo.requeue_bbox_refinement(seeded.id)
assert requeued is not None
- assert requeued.status == "PARTIAL_SUCCEEDED"
- assert requeued.bbox_refine_status == "pending"
+ assert requeued.status == "succeeded"
+ assert requeued.post_processing_bbox_status == "pending"
- again = await repo.requeue_bbox_refine(seeded.id)
+ again = await repo.requeue_bbox_refinement(seeded.id)
assert again is None
+@pytest.mark.asyncio
+async def test_fail_bbox_refinement_only_from_running() -> None:
+ repo = await _fresh_repo()
+ now = datetime.now(UTC)
+ seeded = await _seed(
+ repo,
+ status="succeeded",
+ post_processing_bbox_status="running",
+ post_processing_bbox_started_at=now,
+ )
+ failed = await repo.fail_bbox_refinement(seeded.id, code="X", message="boom")
+ assert failed is not None
+ # Main status stays succeeded; only the sub-status fails.
+ assert failed.status == "succeeded"
+ assert failed.post_processing_bbox_status == "failed"
+
+
# --------------------------------------------------------------------- requeue_for_retry
@pytest.mark.asyncio
async def test_requeue_for_retry_only_from_running() -> None:
repo = await _fresh_repo()
- seeded = await _seed(repo, status="RUNNING", started_at=datetime.now(UTC))
+ seeded = await _seed(repo, status="running", started_at=datetime.now(UTC))
requeued = await repo.requeue_for_retry(seeded.id)
assert requeued is not None
- assert requeued.status == "QUEUED"
+ assert requeued.status == "queued"
# A cancel that arrived while we were running can no longer be
# racing -- the retry's next ``mark_running`` will atomically claim
# again. Verify a redundant requeue is a no-op.
diff --git a/tests/unit/test_extraction_status_enum.py b/tests/unit/test_extraction_status_enum.py
new file mode 100644
index 0000000..5d23fe6
--- /dev/null
+++ b/tests/unit/test_extraction_status_enum.py
@@ -0,0 +1,73 @@
+# Copyright 2026 Firefly Software Solutions Inc
+"""``ExtractionStatus`` / ``PostProcessingStatus`` semantic predicates."""
+
+from __future__ import annotations
+
+import pytest
+
+from flydocs.interfaces.enums.extraction_status import ExtractionStatus, PostProcessingStatus
+
+
+@pytest.mark.parametrize(
+ "status",
+ [ExtractionStatus.SUCCEEDED, ExtractionStatus.FAILED, ExtractionStatus.CANCELLED],
+)
+def test_terminal_statuses(status: ExtractionStatus) -> None:
+ assert status.is_terminal
+
+
+@pytest.mark.parametrize(
+ "status",
+ [
+ ExtractionStatus.QUEUED,
+ ExtractionStatus.RUNNING,
+ ],
+)
+def test_non_terminal_statuses(status: ExtractionStatus) -> None:
+ assert not status.is_terminal
+
+
+def test_only_succeeded_has_result() -> None:
+ """In v1 only ``succeeded`` carries a readable result.
+
+ Partial / refining states are gone; bbox refinement is purely
+ additive post-processing on a fully-succeeded result.
+ """
+ assert ExtractionStatus.SUCCEEDED.has_result is True
+
+
+@pytest.mark.parametrize(
+ "status",
+ [
+ ExtractionStatus.QUEUED,
+ ExtractionStatus.RUNNING,
+ ExtractionStatus.FAILED,
+ ExtractionStatus.CANCELLED,
+ ],
+)
+def test_statuses_without_readable_result(status: ExtractionStatus) -> None:
+ assert not status.has_result
+
+
+def test_extraction_status_string_values() -> None:
+ """Lowercase wire values: the migration + repository depend on these strings."""
+ assert ExtractionStatus.QUEUED.value == "queued"
+ assert ExtractionStatus.RUNNING.value == "running"
+ assert ExtractionStatus.SUCCEEDED.value == "succeeded"
+ assert ExtractionStatus.FAILED.value == "failed"
+ assert ExtractionStatus.CANCELLED.value == "cancelled"
+
+
+def test_post_processing_status_values() -> None:
+ # Stable wire values -- the migration + repository depend on these strings.
+ assert PostProcessingStatus.PENDING.value == "pending"
+ assert PostProcessingStatus.RUNNING.value == "running"
+ assert PostProcessingStatus.SUCCEEDED.value == "succeeded"
+ assert PostProcessingStatus.FAILED.value == "failed"
+
+
+def test_post_processing_status_terminal_predicate() -> None:
+ assert PostProcessingStatus.SUCCEEDED.is_terminal
+ assert PostProcessingStatus.FAILED.is_terminal
+ assert not PostProcessingStatus.PENDING.is_terminal
+ assert not PostProcessingStatus.RUNNING.is_terminal
diff --git a/tests/unit/test_job_worker_retry.py b/tests/unit/test_extraction_worker_retry.py
similarity index 91%
rename from tests/unit/test_job_worker_retry.py
rename to tests/unit/test_extraction_worker_retry.py
index 4dcacbf..32235b9 100644
--- a/tests/unit/test_job_worker_retry.py
+++ b/tests/unit/test_extraction_worker_retry.py
@@ -1,5 +1,5 @@
# Copyright 2026 Firefly Software Solutions Inc
-"""Unit tests for :class:`JobWorker` retry hardening.
+"""Unit tests for :class:`ExtractionWorker` retry hardening.
We hit the private classification helper and the backoff math directly
rather than spinning the worker against a real queue -- the orchestration
@@ -10,7 +10,7 @@
from unittest.mock import MagicMock
-from flydocs.core.services.workers.job_worker import JobWorker, _is_permanent
+from flydocs.core.services.workers.job_worker import ExtractionWorker, _is_permanent
# -- classification --------------------------------------------------------
@@ -50,13 +50,13 @@ def test_generic_runtime_error_is_retryable() -> None:
# -- backoff math ----------------------------------------------------------
-def _worker_with(base: float, ceiling: float) -> JobWorker:
+def _worker_with(base: float, ceiling: float) -> ExtractionWorker:
settings = MagicMock()
settings.eda_adapter = "memory"
settings.retry_base_delay_s = base
settings.retry_max_delay_s = ceiling
settings.job_max_attempts = 3
- return JobWorker(
+ return ExtractionWorker(
orchestrator=MagicMock(),
repository=MagicMock(),
event_publisher=MagicMock(),
diff --git a/tests/unit/test_field_validator.py b/tests/unit/test_field_validator.py
index f5264fb..28ee7f5 100644
--- a/tests/unit/test_field_validator.py
+++ b/tests/unit/test_field_validator.py
@@ -2,8 +2,8 @@
"""Unit tests for :class:`FieldValidator` -- the pure-Python validation node.
Verifies the post-extraction validator correctly:
-- decorates each extracted field with a ``field_validation`` object
-- rejects values that fail a regex / enum / standard validator
+- decorates each extracted field with a ``validation`` object
+- rejects values that fail a regex / enum / built-in validator
- treats ``severity=warning`` validators as soft (``valid=true`` but error recorded)
"""
@@ -13,68 +13,68 @@
from flydocs.interfaces.dtos.field import (
ExtractedField,
ExtractedFieldGroup,
+ Field,
FieldGroup,
- FieldSpec,
)
-from flydocs.interfaces.dtos.standard_validator import StandardValidatorSpec
+from flydocs.interfaces.dtos.validator import ValidatorSpec
from flydocs.interfaces.enums.field_type import FieldType
-from flydocs.interfaces.enums.standard_validator import StandardValidatorType
+from flydocs.interfaces.enums.validator import ValidatorType
-def _group(spec: FieldSpec, extracted: ExtractedField) -> tuple[FieldGroup, ExtractedFieldGroup]:
+def _group(spec: Field, extracted: ExtractedField) -> tuple[FieldGroup, ExtractedFieldGroup]:
return (
- FieldGroup(fieldGroupName="g", fieldGroupDesc="", fieldGroupFields=[spec]),
- ExtractedFieldGroup(fieldGroupName="g", fieldGroupFields=[extracted]),
+ FieldGroup(name="g", description="", fields=[spec]),
+ ExtractedFieldGroup(name="g", fields=[extracted]),
)
def test_enum_rejects_unknown_value() -> None:
- spec = FieldSpec(fieldName="currency", fieldType=FieldType.STRING, enum=["EUR", "USD"])
- extracted = ExtractedField(fieldName="currency", fieldValueFound="GBP")
+ spec = Field(name="currency", type=FieldType.STRING, enum=["EUR", "USD"])
+ extracted = ExtractedField(name="currency", value="GBP")
sg, eg = _group(spec, extracted)
FieldValidator().validate([sg], [eg])
- assert eg.fieldGroupFields[0].field_validation.valid is False
- assert eg.fieldGroupFields[0].field_validation.errors[0].rule.value == "enum"
+ assert eg.fields[0].validation.valid is False
+ assert eg.fields[0].validation.errors[0].rule.value == "enum"
-def test_standard_validator_marks_invalid_email() -> None:
- spec = FieldSpec(
- fieldName="contact",
- fieldType=FieldType.STRING,
- standard_validators=[StandardValidatorSpec(type=StandardValidatorType.EMAIL)],
+def test_validator_marks_invalid_email() -> None:
+ spec = Field(
+ name="contact",
+ type=FieldType.STRING,
+ validators=[ValidatorSpec(name=ValidatorType.EMAIL)],
)
- extracted = ExtractedField(fieldName="contact", fieldValueFound="not-an-email")
+ extracted = ExtractedField(name="contact", value="not-an-email")
sg, eg = _group(spec, extracted)
FieldValidator().validate([sg], [eg])
- assert eg.fieldGroupFields[0].field_validation.valid is False
- assert any(e.rule.value == "standard" for e in eg.fieldGroupFields[0].field_validation.errors)
+ assert eg.fields[0].validation.valid is False
+ assert any(e.rule.value == "validator" for e in eg.fields[0].validation.errors)
def test_warning_severity_keeps_field_valid() -> None:
- spec = FieldSpec(
- fieldName="iban",
- fieldType=FieldType.STRING,
- standard_validators=[
- StandardValidatorSpec(type=StandardValidatorType.IBAN, severity="warning"),
+ spec = Field(
+ name="iban",
+ type=FieldType.STRING,
+ validators=[
+ ValidatorSpec(name=ValidatorType.IBAN, severity="warning"),
],
)
- extracted = ExtractedField(fieldName="iban", fieldValueFound="NOT-AN-IBAN")
+ extracted = ExtractedField(name="iban", value="NOT-AN-IBAN")
sg, eg = _group(spec, extracted)
FieldValidator().validate([sg], [eg])
- fv = eg.fieldGroupFields[0].field_validation
+ fv = eg.fields[0].validation
assert fv.valid is True
assert len(fv.errors) == 1
assert fv.errors[0].message.endswith("[warning]")
def test_none_value_is_skipped() -> None:
- spec = FieldSpec(
- fieldName="iban",
- fieldType=FieldType.STRING,
- standard_validators=[StandardValidatorSpec(type=StandardValidatorType.IBAN)],
+ spec = Field(
+ name="iban",
+ type=FieldType.STRING,
+ validators=[ValidatorSpec(name=ValidatorType.IBAN)],
)
- extracted = ExtractedField(fieldName="iban", fieldValueFound=None)
+ extracted = ExtractedField(name="iban", value=None)
sg, eg = _group(spec, extracted)
FieldValidator().validate([sg], [eg])
- assert eg.fieldGroupFields[0].field_validation.valid is True
- assert eg.fieldGroupFields[0].field_validation.errors == []
+ assert eg.fields[0].validation.valid is True
+ assert eg.fields[0].validation.errors == []
diff --git a/tests/unit/test_get_extraction_result_handler.py b/tests/unit/test_get_extraction_result_handler.py
new file mode 100644
index 0000000..459289b
--- /dev/null
+++ b/tests/unit/test_get_extraction_result_handler.py
@@ -0,0 +1,164 @@
+# Copyright 2026 Firefly Software Solutions Inc
+"""``GetExtractionResultHandler`` -- final result reads + post-processing long-poll.
+
+In v1 only ``succeeded`` carries a readable result. Bbox refinement is
+purely additive post-processing on an already-succeeded result: the
+result is already returnable; the optional ``wait_for_post_processing``
+flag lets callers block until the bbox leg lands in a terminal state.
+"""
+
+from __future__ import annotations
+
+import asyncio
+from dataclasses import dataclass, field
+from datetime import UTC, datetime
+from typing import Any
+
+import pytest
+
+from flydocs.core.services.extractions.get_extraction_result_handler import (
+ ExtractionNotReady,
+ GetExtractionResultHandler,
+ GetExtractionResultQuery,
+)
+from flydocs.interfaces.dtos.extract import ExtractionResult, PipelineMeta
+from flydocs.interfaces.enums.extraction_status import (
+ ExtractionStatus,
+ PostProcessingStatus,
+)
+
+
+def _result_payload() -> dict[str, Any]:
+ return ExtractionResult(
+ id="ext_RESULT0000000000000000000000",
+ files=[],
+ documents=[],
+ pipeline=PipelineMeta(model="m", latency_ms=10),
+ ).model_dump(mode="json", by_alias=True)
+
+
+@dataclass
+class _StubExtraction:
+ id: str = "ext_TEST0000000000000000000000A"
+ status: str = ExtractionStatus.SUCCEEDED.value
+ result_json: dict[str, Any] | None = field(default_factory=_result_payload)
+ submitted_at: datetime = field(default_factory=lambda: datetime.now(UTC))
+ post_processing_bbox_status: str | None = None
+
+
+class _StubRepo:
+ """Repository stub that can flip the bbox sub-status mid-poll to simulate the refiner."""
+
+ def __init__(
+ self,
+ ext: _StubExtraction,
+ *,
+ flip_bbox_to: str | None = None,
+ after_calls: int = 0,
+ ) -> None:
+ self.ext = ext
+ self.flip_bbox_to = flip_bbox_to
+ self.after_calls = after_calls
+ self.calls = 0
+
+ async def get(self, ext_id: str) -> _StubExtraction | None:
+ self.calls += 1
+ if self.flip_bbox_to is not None and self.calls > self.after_calls:
+ self.ext.post_processing_bbox_status = self.flip_bbox_to
+ return self.ext if self.ext.id == ext_id else None
+
+
+@pytest.mark.asyncio
+async def test_returns_result_for_succeeded() -> None:
+ handler = GetExtractionResultHandler(repository=_StubRepo(_StubExtraction())) # type: ignore[arg-type]
+ out = await handler.do_handle(GetExtractionResultQuery(extraction_id="ext_TEST0000000000000000000000A"))
+ assert out is not None
+ assert out.id == "ext_TEST0000000000000000000000A"
+
+
+@pytest.mark.asyncio
+async def test_returns_result_when_bbox_leg_pending() -> None:
+ """The main pipeline is succeeded; the bbox leg is additive post-processing."""
+ ext = _StubExtraction(post_processing_bbox_status=PostProcessingStatus.PENDING.value)
+ handler = GetExtractionResultHandler(repository=_StubRepo(ext)) # type: ignore[arg-type]
+ out = await handler.do_handle(GetExtractionResultQuery(extraction_id="ext_TEST0000000000000000000000A"))
+ assert out is not None
+ assert out.id == "ext_TEST0000000000000000000000A"
+
+
+@pytest.mark.asyncio
+async def test_returns_result_when_bbox_leg_running() -> None:
+ ext = _StubExtraction(post_processing_bbox_status=PostProcessingStatus.RUNNING.value)
+ handler = GetExtractionResultHandler(repository=_StubRepo(ext)) # type: ignore[arg-type]
+ out = await handler.do_handle(GetExtractionResultQuery(extraction_id="ext_TEST0000000000000000000000A"))
+ assert out is not None
+
+
+@pytest.mark.asyncio
+async def test_raises_not_ready_for_queued() -> None:
+ ext = _StubExtraction(status=ExtractionStatus.QUEUED.value, result_json=None)
+ handler = GetExtractionResultHandler(repository=_StubRepo(ext)) # type: ignore[arg-type]
+ with pytest.raises(ExtractionNotReady) as ei:
+ await handler.do_handle(GetExtractionResultQuery(extraction_id="ext_TEST0000000000000000000000A"))
+ assert ei.value.status == ExtractionStatus.QUEUED
+
+
+@pytest.mark.asyncio
+async def test_raises_not_ready_for_failed() -> None:
+ ext = _StubExtraction(status=ExtractionStatus.FAILED.value, result_json=None)
+ handler = GetExtractionResultHandler(repository=_StubRepo(ext)) # type: ignore[arg-type]
+ with pytest.raises(ExtractionNotReady) as ei:
+ await handler.do_handle(GetExtractionResultQuery(extraction_id="ext_TEST0000000000000000000000A"))
+ assert ei.value.status == ExtractionStatus.FAILED
+
+
+@pytest.mark.asyncio
+async def test_returns_none_for_unknown_extraction() -> None:
+ handler = GetExtractionResultHandler(
+ repository=_StubRepo(_StubExtraction(id="other")) # type: ignore[arg-type]
+ )
+ out = await handler.do_handle(GetExtractionResultQuery(extraction_id="missing"))
+ assert out is None
+
+
+@pytest.mark.asyncio
+async def test_wait_for_post_processing_returns_at_timeout() -> None:
+ """Bbox leg stays in pending; poll should return at timeout."""
+ ext = _StubExtraction(post_processing_bbox_status=PostProcessingStatus.PENDING.value)
+ handler = GetExtractionResultHandler(repository=_StubRepo(ext)) # type: ignore[arg-type]
+ started = asyncio.get_running_loop().time()
+ out = await handler.do_handle(
+ GetExtractionResultQuery(
+ extraction_id="ext_TEST0000000000000000000000A",
+ wait_for_post_processing=True,
+ timeout_s=0.3,
+ poll_interval_s=0.1,
+ )
+ )
+ elapsed = asyncio.get_running_loop().time() - started
+ assert out is not None
+ assert 0.2 < elapsed < 1.0 # respected the timeout
+
+
+@pytest.mark.asyncio
+async def test_wait_for_post_processing_returns_early_when_bbox_succeeds() -> None:
+ """Bbox leg flips from pending to succeeded mid-poll -> exit early."""
+ ext = _StubExtraction(post_processing_bbox_status=PostProcessingStatus.PENDING.value)
+ handler = GetExtractionResultHandler(
+ repository=_StubRepo( # type: ignore[arg-type]
+ ext, flip_bbox_to=PostProcessingStatus.SUCCEEDED.value, after_calls=2
+ )
+ )
+ started = asyncio.get_running_loop().time()
+ out = await handler.do_handle(
+ GetExtractionResultQuery(
+ extraction_id="ext_TEST0000000000000000000000A",
+ wait_for_post_processing=True,
+ timeout_s=10.0,
+ poll_interval_s=0.1,
+ )
+ )
+ elapsed = asyncio.get_running_loop().time() - started
+ assert out is not None
+ assert ext.post_processing_bbox_status == PostProcessingStatus.SUCCEEDED.value
+ assert elapsed < 5.0 # well under the 10s timeout
diff --git a/tests/unit/test_get_job_result_handler.py b/tests/unit/test_get_job_result_handler.py
deleted file mode 100644
index 70719e7..0000000
--- a/tests/unit/test_get_job_result_handler.py
+++ /dev/null
@@ -1,145 +0,0 @@
-# Copyright 2026 Firefly Software Solutions Inc
-"""``GetJobResultHandler`` -- partial result reads + wait_for_bboxes long-poll."""
-
-from __future__ import annotations
-
-import asyncio
-from dataclasses import dataclass, field
-from datetime import UTC, datetime
-from typing import Any
-
-import pytest
-
-from flydocs.core.services.jobs.get_job_result_handler import (
- GetJobResultHandler,
- GetJobResultQuery,
- JobNotReady,
-)
-from flydocs.interfaces.dtos.extract import ExtractionResult
-from flydocs.interfaces.enums.job_status import JobStatus
-
-
-def _result_payload() -> dict[str, Any]:
- return ExtractionResult(
- request_id="00000000-0000-0000-0000-000000000001",
- documents=[],
- model="m",
- latency_ms=10,
- ).model_dump(mode="json", by_alias=True)
-
-
-@dataclass
-class _StubJob:
- id: str = "job-1"
- status: str = JobStatus.SUCCEEDED.value
- result_json: dict[str, Any] | None = field(default_factory=_result_payload)
- created_at: datetime = field(default_factory=lambda: datetime.now(UTC))
-
-
-class _StubRepo:
- """Repository stub that can flip status mid-poll to simulate the refiner."""
-
- def __init__(self, job: _StubJob, *, flip_to: str | None = None, after_calls: int = 0) -> None:
- self.job = job
- self.flip_to = flip_to
- self.after_calls = after_calls
- self.calls = 0
-
- async def get(self, job_id: str) -> _StubJob | None:
- self.calls += 1
- if self.flip_to is not None and self.calls > self.after_calls:
- self.job.status = self.flip_to
- return self.job if self.job.id == job_id else None
-
-
-@pytest.mark.asyncio
-async def test_returns_result_for_succeeded() -> None:
- handler = GetJobResultHandler(repository=_StubRepo(_StubJob())) # type: ignore[arg-type]
- out = await handler.do_handle(GetJobResultQuery(job_id="job-1"))
- assert out is not None
- assert out.job_id == "job-1"
-
-
-@pytest.mark.asyncio
-async def test_returns_partial_result_when_status_partial_succeeded() -> None:
- job = _StubJob(status=JobStatus.PARTIAL_SUCCEEDED.value)
- handler = GetJobResultHandler(repository=_StubRepo(job)) # type: ignore[arg-type]
- out = await handler.do_handle(GetJobResultQuery(job_id="job-1"))
- assert out is not None
- assert out.job_id == "job-1"
-
-
-@pytest.mark.asyncio
-async def test_returns_partial_result_when_status_refining_bboxes() -> None:
- job = _StubJob(status=JobStatus.REFINING_BBOXES.value)
- handler = GetJobResultHandler(repository=_StubRepo(job)) # type: ignore[arg-type]
- out = await handler.do_handle(GetJobResultQuery(job_id="job-1"))
- assert out is not None
-
-
-@pytest.mark.asyncio
-async def test_raises_job_not_ready_for_queued() -> None:
- job = _StubJob(status=JobStatus.QUEUED.value, result_json=None)
- handler = GetJobResultHandler(repository=_StubRepo(job)) # type: ignore[arg-type]
- with pytest.raises(JobNotReady) as ei:
- await handler.do_handle(GetJobResultQuery(job_id="job-1"))
- assert ei.value.status == JobStatus.QUEUED
-
-
-@pytest.mark.asyncio
-async def test_raises_job_not_ready_for_failed() -> None:
- job = _StubJob(status=JobStatus.FAILED.value, result_json=None)
- handler = GetJobResultHandler(repository=_StubRepo(job)) # type: ignore[arg-type]
- with pytest.raises(JobNotReady) as ei:
- await handler.do_handle(GetJobResultQuery(job_id="job-1"))
- assert ei.value.status == JobStatus.FAILED
-
-
-@pytest.mark.asyncio
-async def test_returns_none_for_unknown_job() -> None:
- handler = GetJobResultHandler(repository=_StubRepo(_StubJob(id="other"))) # type: ignore[arg-type]
- out = await handler.do_handle(GetJobResultQuery(job_id="missing"))
- assert out is None
-
-
-@pytest.mark.asyncio
-async def test_wait_for_bboxes_returns_partial_at_timeout() -> None:
- # Job stays in PARTIAL_SUCCEEDED throughout; poll should return that
- # state (with its result) once the deadline elapses.
- job = _StubJob(status=JobStatus.PARTIAL_SUCCEEDED.value)
- handler = GetJobResultHandler(repository=_StubRepo(job)) # type: ignore[arg-type]
- started = asyncio.get_running_loop().time()
- out = await handler.do_handle(
- GetJobResultQuery(
- job_id="job-1",
- wait_for_bboxes=True,
- timeout_s=0.3,
- poll_interval_s=0.1,
- )
- )
- elapsed = asyncio.get_running_loop().time() - started
- assert out is not None
- assert 0.2 < elapsed < 1.0 # respected the timeout, didn't return instantly
-
-
-@pytest.mark.asyncio
-async def test_wait_for_bboxes_returns_early_when_status_flips_to_succeeded() -> None:
- # Job starts in PARTIAL_SUCCEEDED; after 2 polls the stub flips it to
- # SUCCEEDED -- handler should return before the timeout fires.
- job = _StubJob(status=JobStatus.PARTIAL_SUCCEEDED.value)
- handler = GetJobResultHandler(
- repository=_StubRepo(job, flip_to=JobStatus.SUCCEEDED.value, after_calls=2) # type: ignore[arg-type]
- )
- started = asyncio.get_running_loop().time()
- out = await handler.do_handle(
- GetJobResultQuery(
- job_id="job-1",
- wait_for_bboxes=True,
- timeout_s=10.0,
- poll_interval_s=0.1,
- )
- )
- elapsed = asyncio.get_running_loop().time() - started
- assert out is not None
- assert job.status == JobStatus.SUCCEEDED.value
- assert elapsed < 5.0 # well under the 10s timeout
diff --git a/tests/unit/test_job_status_enum.py b/tests/unit/test_job_status_enum.py
deleted file mode 100644
index b76918e..0000000
--- a/tests/unit/test_job_status_enum.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Copyright 2026 Firefly Software Solutions Inc
-"""``JobStatus`` semantic predicates -- terminal + has_result invariants."""
-
-from __future__ import annotations
-
-import pytest
-
-from flydocs.interfaces.enums.job_status import BboxRefineStatus, JobStatus
-
-
-@pytest.mark.parametrize(
- "status",
- [JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.CANCELLED],
-)
-def test_terminal_statuses(status: JobStatus) -> None:
- assert status.is_terminal
-
-
-@pytest.mark.parametrize(
- "status",
- [
- JobStatus.QUEUED,
- JobStatus.RUNNING,
- JobStatus.PARTIAL_SUCCEEDED,
- JobStatus.REFINING_BBOXES,
- ],
-)
-def test_non_terminal_statuses(status: JobStatus) -> None:
- assert not status.is_terminal
-
-
-@pytest.mark.parametrize(
- "status",
- [JobStatus.SUCCEEDED, JobStatus.PARTIAL_SUCCEEDED, JobStatus.REFINING_BBOXES],
-)
-def test_statuses_with_readable_result(status: JobStatus) -> None:
- assert status.has_result
-
-
-@pytest.mark.parametrize(
- "status",
- [JobStatus.QUEUED, JobStatus.RUNNING, JobStatus.FAILED, JobStatus.CANCELLED],
-)
-def test_statuses_without_readable_result(status: JobStatus) -> None:
- assert not status.has_result
-
-
-def test_bbox_refine_status_values() -> None:
- # Stable wire values -- the migration + repository depend on these strings.
- assert BboxRefineStatus.PENDING.value == "pending"
- assert BboxRefineStatus.RUNNING.value == "running"
- assert BboxRefineStatus.SUCCEEDED.value == "succeeded"
- assert BboxRefineStatus.FAILED.value == "failed"
diff --git a/tests/unit/test_judge_escalator.py b/tests/unit/test_judge_escalator.py
index 2c4fbba..baa384f 100644
--- a/tests/unit/test_judge_escalator.py
+++ b/tests/unit/test_judge_escalator.py
@@ -12,18 +12,19 @@
from flydocs.core.services.escalation import JudgeEscalator
from flydocs.core.services.splitting import DiscoveredSegment
from flydocs.interfaces.dtos.bbox import BoundingBox
-from flydocs.interfaces.dtos.doc import DocSpec, DocType, ValidatorsSpec
+from flydocs.interfaces.dtos.document_type import DocumentTypeSpec
from flydocs.interfaces.dtos.extract import (
- DocumentInput,
+ EscalationConfig,
ExtractionOptions,
ExtractionRequest,
+ FileInput,
StageToggles,
)
from flydocs.interfaces.dtos.field import (
ExtractedField,
ExtractedFieldGroup,
+ Field,
FieldGroup,
- FieldSpec,
FieldValidation,
JudgeOutcome,
)
@@ -33,51 +34,68 @@
_DUMMY = base64.b64encode(b"%PDF-1.4").decode("ascii")
-def _doc_spec() -> DocSpec:
- return DocSpec(
- docType=DocType(documentType="passport", description="x", country="ES"),
- fieldGroups=[
+def _doc_spec() -> DocumentTypeSpec:
+ return DocumentTypeSpec(
+ id="passport",
+ description="x",
+ country="ES",
+ field_groups=[
FieldGroup(
- fieldGroupName="g",
- fieldGroupFields=[
- FieldSpec(fieldName="a", fieldDescription="x", fieldType=FieldType.STRING),
- FieldSpec(fieldName="b", fieldDescription="x", fieldType=FieldType.STRING),
+ name="g",
+ fields=[
+ Field(name="a", description="x", type=FieldType.STRING),
+ Field(name="b", description="x", type=FieldType.STRING),
],
)
],
- validators=ValidatorsSpec(),
)
-def _request(*, escalation_threshold=None, escalation_model=None) -> ExtractionRequest:
- opts = ExtractionOptions(stages=StageToggles(judge=True, judge_escalation=True))
- if escalation_threshold is not None:
- opts.escalation_threshold = escalation_threshold
- if escalation_model is not None:
- opts.escalation_model = escalation_model
+def _request(
+ *,
+ escalation_threshold: float | None = None,
+ escalation_model: str | None = None,
+) -> ExtractionRequest:
+ stages = StageToggles(judge=True, judge_escalation=True)
+ escalation = None
+ if escalation_threshold is not None or escalation_model is not None:
+ escalation = EscalationConfig(
+ threshold=escalation_threshold if escalation_threshold is not None else 0.5,
+ model=escalation_model or "anthropic:claude-opus-4-7",
+ )
+ opts = ExtractionOptions(stages=stages, escalation=escalation)
return ExtractionRequest(
- documents=[DocumentInput(filename="x.pdf", content_base64=_DUMMY, content_type="application/pdf")],
- docs=[_doc_spec()],
+ files=[FileInput(filename="x.pdf", content_base64=_DUMMY, content_type="application/pdf")],
+ document_types=[_doc_spec()],
rules=[],
options=opts,
)
-def _field(name: str, value: str | None, judge_status: JudgeStatus, flag: bool = False) -> ExtractedField:
+def _field(
+ name: str,
+ value: str | None,
+ judge_status: JudgeStatus,
+ flag: bool = False,
+) -> ExtractedField:
return ExtractedField(
- fieldName=name,
- fieldValueFound=value,
+ name=name,
+ value=value,
confidence=0.9,
- pagesFound=[1],
+ pages=[1],
bbox=BoundingBox(xmin=0.0, ymin=0.0, xmax=1.0, ymax=1.0),
- field_validation=FieldValidation(valid=True),
- judge=JudgeOutcome(status=judge_status, confidence=0.9, evidence="e", notes="", flag_for_review=flag),
+ validation=FieldValidation(valid=True),
+ judge=JudgeOutcome(
+ status=judge_status,
+ confidence=0.9,
+ evidence="e",
+ notes=None,
+ flag_for_review=flag,
+ ),
)
def _ctx(
- extractor_mock: AsyncMock,
- judge_mock: AsyncMock,
*,
per_doc_extracted: dict[str, list[ExtractedFieldGroup]],
primary_model: str = "anthropic:claude-haiku-4-5",
@@ -117,15 +135,15 @@ async def test_no_escalation_when_threshold_zero() -> None:
per_doc = {
"passport": [
ExtractedFieldGroup(
- fieldGroupName="g",
- fieldGroupFields=[
+ name="g",
+ fields=[
_field("a", "x", JudgeStatus.FAIL),
_field("b", "y", JudgeStatus.FAIL),
],
)
]
}
- ctx = _ctx(extractor, judge, per_doc_extracted=per_doc)
+ ctx = _ctx(per_doc_extracted=per_doc)
info = await escalator.maybe_escalate(ctx, _request(escalation_threshold=0.0))
assert info is None
extractor.extract.assert_not_called()
@@ -144,12 +162,15 @@ async def test_no_escalation_when_model_not_set() -> None:
per_doc = {
"passport": [
ExtractedFieldGroup(
- fieldGroupName="g",
- fieldGroupFields=[_field("a", "x", JudgeStatus.FAIL), _field("b", "y", JudgeStatus.FAIL)],
+ name="g",
+ fields=[
+ _field("a", "x", JudgeStatus.FAIL),
+ _field("b", "y", JudgeStatus.FAIL),
+ ],
)
]
}
- ctx = _ctx(extractor, judge, per_doc_extracted=per_doc)
+ ctx = _ctx(per_doc_extracted=per_doc)
info = await escalator.maybe_escalate(ctx, _request())
assert info is None
@@ -167,15 +188,15 @@ async def test_no_escalation_when_failure_rate_below_threshold() -> None:
per_doc = {
"passport": [
ExtractedFieldGroup(
- fieldGroupName="g",
- fieldGroupFields=[
+ name="g",
+ fields=[
_field("a", "x", JudgeStatus.PASS),
_field("b", "y", JudgeStatus.FAIL), # 1/2 = 0.5 < 0.6
],
)
]
}
- ctx = _ctx(extractor, judge, per_doc_extracted=per_doc)
+ ctx = _ctx(per_doc_extracted=per_doc)
info = await escalator.maybe_escalate(ctx, _request())
assert info is None
@@ -188,8 +209,8 @@ async def test_escalation_triggered_and_accepted() -> None:
"""Threshold crossed AND escalation improves the result -> accepted=True."""
new_groups = [
ExtractedFieldGroup(
- fieldGroupName="g",
- fieldGroupFields=[
+ name="g",
+ fields=[
_field("a", "x", JudgeStatus.PASS),
_field("b", "y", JudgeStatus.PASS),
],
@@ -209,15 +230,15 @@ async def test_escalation_triggered_and_accepted() -> None:
per_doc = {
"passport": [
ExtractedFieldGroup(
- fieldGroupName="g",
- fieldGroupFields=[
+ name="g",
+ fields=[
_field("a", "x", JudgeStatus.FAIL),
_field("b", "y", JudgeStatus.FAIL), # 2/2 = 1.0 >= 0.5
],
)
]
}
- ctx = _ctx(extractor, judge, per_doc_extracted=per_doc)
+ ctx = _ctx(per_doc_extracted=per_doc)
info = await escalator.maybe_escalate(ctx, _request())
@@ -242,8 +263,8 @@ async def test_escalation_triggered_but_rejected() -> None:
"""Threshold crossed but escalation result is no better -> accepted=False."""
new_groups = [
ExtractedFieldGroup(
- fieldGroupName="g",
- fieldGroupFields=[
+ name="g",
+ fields=[
_field("a", "x", JudgeStatus.FAIL),
_field("b", "y", JudgeStatus.FAIL),
],
@@ -263,15 +284,15 @@ async def test_escalation_triggered_but_rejected() -> None:
per_doc = {
"passport": [
ExtractedFieldGroup(
- fieldGroupName="g",
- fieldGroupFields=[
+ name="g",
+ fields=[
_field("a", "x", JudgeStatus.FAIL),
_field("b", "y", JudgeStatus.FAIL),
],
)
]
}
- ctx = _ctx(extractor, judge, per_doc_extracted=per_doc)
+ ctx = _ctx(per_doc_extracted=per_doc)
info = await escalator.maybe_escalate(ctx, _request())
@@ -298,11 +319,14 @@ async def test_no_escalation_when_same_model_as_primary() -> None:
per_doc = {
"passport": [
ExtractedFieldGroup(
- fieldGroupName="g",
- fieldGroupFields=[_field("a", "x", JudgeStatus.FAIL), _field("b", "y", JudgeStatus.FAIL)],
+ name="g",
+ fields=[
+ _field("a", "x", JudgeStatus.FAIL),
+ _field("b", "y", JudgeStatus.FAIL),
+ ],
)
]
}
- ctx = _ctx(extractor, judge, per_doc_extracted=per_doc, primary_model="anthropic:claude-opus-4-7")
+ ctx = _ctx(per_doc_extracted=per_doc, primary_model="anthropic:claude-opus-4-7")
info = await escalator.maybe_escalate(ctx, _request())
assert info is None
diff --git a/tests/unit/test_list_extractions_handler.py b/tests/unit/test_list_extractions_handler.py
new file mode 100644
index 0000000..64405ca
--- /dev/null
+++ b/tests/unit/test_list_extractions_handler.py
@@ -0,0 +1,122 @@
+# Copyright 2026 Firefly Software Solutions Inc
+""":class:`ListExtractionsHandler` -- pagination + filter contract.
+
+The handler delegates to ``ExtractionRepository.list_extractions``; here
+we mock the repository and assert (a) the right filter args travel
+through, (b) the row mapping into :class:`Extraction` is faithful, and
+(c) ``total`` reflects the filtered set independent of ``limit``.
+"""
+
+from __future__ import annotations
+
+from datetime import UTC, datetime
+from types import SimpleNamespace
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+from flydocs.core.services.extractions.list_extractions_handler import (
+ ListExtractionsHandler,
+ ListExtractionsQuery,
+)
+from flydocs.interfaces.enums.extraction_status import (
+ ExtractionStatus,
+ PostProcessingStatus,
+)
+
+
+def _row(**overrides):
+ base = {
+ "id": "ext_TEST00000000000000000000001",
+ "status": "succeeded",
+ "submitted_at": datetime(2026, 5, 15, 10, 0, tzinfo=UTC),
+ "started_at": datetime(2026, 5, 15, 10, 0, 1, tzinfo=UTC),
+ "finished_at": datetime(2026, 5, 15, 10, 1, tzinfo=UTC),
+ "attempts": 1,
+ "error_code": None,
+ "error_message": None,
+ "post_processing_bbox_status": None,
+ "post_processing_bbox_attempts": 0,
+ "post_processing_bbox_started_at": None,
+ "post_processing_bbox_finished_at": None,
+ "post_processing_bbox_error_code": None,
+ "post_processing_bbox_error_message": None,
+ }
+ base.update(overrides)
+ return SimpleNamespace(**base)
+
+
+@pytest.mark.asyncio
+async def test_passes_filters_through_and_maps_rows() -> None:
+ repository = MagicMock()
+ repository.list_extractions = AsyncMock(
+ return_value=(
+ [
+ _row(id="ext_AAA00000000000000000000001", status="succeeded"),
+ _row(
+ id="ext_BBB00000000000000000000002",
+ status="succeeded",
+ post_processing_bbox_status="pending",
+ ),
+ ],
+ 42, # total across the filter
+ )
+ )
+ handler = ListExtractionsHandler(repository=repository)
+
+ response = await handler.do_handle(
+ ListExtractionsQuery(
+ statuses=(ExtractionStatus.SUCCEEDED,),
+ post_processing_statuses=(PostProcessingStatus.PENDING,),
+ created_after=datetime(2026, 5, 15, tzinfo=UTC),
+ limit=2,
+ offset=0,
+ )
+ )
+
+ repository.list_extractions.assert_awaited_once()
+ kwargs = repository.list_extractions.await_args.kwargs
+ assert kwargs["statuses"] == ["succeeded"]
+ assert kwargs["post_processing_bbox_statuses"] == ["pending"]
+ assert kwargs["limit"] == 2
+ assert kwargs["offset"] == 0
+
+ assert response.total == 42 # filtered total, not limited
+ assert response.limit == 2
+ assert response.offset == 0
+ assert [i.id for i in response.items] == [
+ "ext_AAA00000000000000000000001",
+ "ext_BBB00000000000000000000002",
+ ]
+ assert response.items[0].status is ExtractionStatus.SUCCEEDED
+ assert response.items[1].status is ExtractionStatus.SUCCEEDED
+ # The second row carries a post_processing block with pending status.
+ assert response.items[1].post_processing is not None
+ assert response.items[1].post_processing.bbox_refinement.status is PostProcessingStatus.PENDING
+
+
+@pytest.mark.asyncio
+async def test_empty_filter_lists_passes_none_to_repository() -> None:
+ """Empty tuples should become ``None`` so the repository builds no SQL clause."""
+ repository = MagicMock()
+ repository.list_extractions = AsyncMock(return_value=([], 0))
+ handler = ListExtractionsHandler(repository=repository)
+
+ await handler.do_handle(ListExtractionsQuery())
+
+ kwargs = repository.list_extractions.await_args.kwargs
+ assert kwargs["statuses"] is None
+ assert kwargs["post_processing_bbox_statuses"] is None
+ assert kwargs["idempotency_key"] is None
+
+
+@pytest.mark.asyncio
+async def test_pagination_defaults() -> None:
+ repository = MagicMock()
+ repository.list_extractions = AsyncMock(return_value=([], 0))
+ handler = ListExtractionsHandler(repository=repository)
+
+ response = await handler.do_handle(ListExtractionsQuery())
+
+ assert response.limit == 50
+ assert response.offset == 0
diff --git a/tests/unit/test_list_jobs_handler.py b/tests/unit/test_list_jobs_handler.py
deleted file mode 100644
index 4cd623a..0000000
--- a/tests/unit/test_list_jobs_handler.py
+++ /dev/null
@@ -1,115 +0,0 @@
-# Copyright 2026 Firefly Software Solutions Inc
-""":class:`ListJobsHandler` -- pagination + filter contract.
-
-The handler delegates to ``ExtractionJobRepository.list_jobs``; here
-we mock the repository and assert (a) the right filter args travel
-through, (b) the row mapping into :class:`JobStatusResponse` is
-faithful, and (c) ``total`` reflects the filtered set independent of
-``limit``.
-"""
-
-from __future__ import annotations
-
-from datetime import UTC, datetime
-from types import SimpleNamespace
-from unittest.mock import AsyncMock, MagicMock
-
-import pytest
-
-from flydocs.core.services.jobs.list_jobs_handler import (
- ListJobsHandler,
- ListJobsQuery,
-)
-from flydocs.interfaces.enums.job_status import JobStatus
-
-
-def _row(**overrides):
- base = {
- "id": "job-1",
- "status": "SUCCEEDED",
- "created_at": datetime(2026, 5, 15, 10, 0, tzinfo=UTC),
- "started_at": datetime(2026, 5, 15, 10, 0, 1, tzinfo=UTC),
- "finished_at": datetime(2026, 5, 15, 10, 1, tzinfo=UTC),
- "attempts": 1,
- "error_code": None,
- "error_message": None,
- "bbox_refine_status": None,
- "bbox_refine_attempts": 0,
- "bbox_refine_started_at": None,
- "bbox_refine_finished_at": None,
- "bbox_refine_error_code": None,
- "bbox_refine_error_message": None,
- }
- base.update(overrides)
- return SimpleNamespace(**base)
-
-
-@pytest.mark.asyncio
-async def test_passes_filters_through_and_maps_rows() -> None:
- repository = MagicMock()
- repository.list_jobs = AsyncMock(
- return_value=(
- [
- _row(id="job-1", status="SUCCEEDED"),
- _row(
- id="job-2",
- status="PARTIAL_SUCCEEDED",
- bbox_refine_status="pending",
- ),
- ],
- 42, # total across the filter
- )
- )
- handler = ListJobsHandler(repository=repository)
-
- response = await handler.do_handle(
- ListJobsQuery(
- statuses=(JobStatus.SUCCEEDED, JobStatus.PARTIAL_SUCCEEDED),
- bbox_refine_statuses=("pending",),
- created_after=datetime(2026, 5, 15, tzinfo=UTC),
- limit=2,
- offset=0,
- )
- )
-
- repository.list_jobs.assert_awaited_once()
- kwargs = repository.list_jobs.await_args.kwargs
- assert kwargs["statuses"] == ["SUCCEEDED", "PARTIAL_SUCCEEDED"]
- assert kwargs["bbox_refine_statuses"] == ["pending"]
- assert kwargs["limit"] == 2
- assert kwargs["offset"] == 0
-
- assert response.total == 42 # filtered total, not limited
- assert response.limit == 2
- assert response.offset == 0
- assert [i.job_id for i in response.items] == ["job-1", "job-2"]
- assert response.items[0].status is JobStatus.SUCCEEDED
- assert response.items[1].status is JobStatus.PARTIAL_SUCCEEDED
- assert response.items[1].bbox_refine_status == "pending"
-
-
-@pytest.mark.asyncio
-async def test_empty_filter_lists_passes_none_to_repository() -> None:
- """Empty tuples should become ``None`` so the repository builds no SQL clause."""
- repository = MagicMock()
- repository.list_jobs = AsyncMock(return_value=([], 0))
- handler = ListJobsHandler(repository=repository)
-
- await handler.do_handle(ListJobsQuery())
-
- kwargs = repository.list_jobs.await_args.kwargs
- assert kwargs["statuses"] is None
- assert kwargs["bbox_refine_statuses"] is None
- assert kwargs["idempotency_key"] is None
-
-
-@pytest.mark.asyncio
-async def test_pagination_defaults() -> None:
- repository = MagicMock()
- repository.list_jobs = AsyncMock(return_value=([], 0))
- handler = ListJobsHandler(repository=repository)
-
- response = await handler.do_handle(ListJobsQuery())
-
- assert response.limit == 50
- assert response.offset == 0
diff --git a/tests/unit/test_multi_transformation.py b/tests/unit/test_multi_transformation.py
index 53be638..72879fd 100644
--- a/tests/unit/test_multi_transformation.py
+++ b/tests/unit/test_multi_transformation.py
@@ -44,16 +44,19 @@ class _FakeLlmTransformer:
calls: list[list[str]] = field(default_factory=list)
async def apply(self, t, groups):
- target = next((g for g in groups if g.fieldGroupName == t.target_group), None)
+ target = next((g for g in groups if g.name == t.target_group), None)
names: list[str] = []
if target is not None:
- arr = next((f for f in target.fieldGroupFields if isinstance(f.fieldValueFound, list)), None)
+ arr = next(
+ (f for f in target.fields if isinstance(f.value, list)),
+ None,
+ )
if arr is not None:
- for row in arr.fieldValueFound or []:
+ for row in arr.value or []:
if isinstance(row, ExtractedField):
- for sub in row.fieldValueFound or []:
- if isinstance(sub, ExtractedField) and sub.fieldName == "nombre":
- names.append(str(sub.fieldValueFound))
+ for sub in row.value or []:
+ if isinstance(sub, ExtractedField) and sub.name == "nombre":
+ names.append(str(sub.value))
break
self.calls.append(names)
# Echo the input shape back unchanged; mutate the target group.
@@ -69,20 +72,20 @@ def _row(**values: str) -> ExtractedField:
def _personas(rows: list[ExtractedField]) -> ExtractedFieldGroup:
return ExtractedFieldGroup(
- fieldGroupName="personas",
- fieldGroupFields=[ExtractedField(name="personas", value=rows)],
+ name="personas",
+ fields=[ExtractedField(name="personas", value=rows)],
)
def _row_names(group: ExtractedFieldGroup) -> list[str]:
out: list[str] = []
- for f in group.fieldGroupFields:
- if not isinstance(f.fieldValueFound, list):
+ for f in group.fields:
+ if not isinstance(f.value, list):
continue
- for row in f.fieldValueFound:
- for sub in row.fieldValueFound or []:
- if sub.fieldName == "nombre":
- out.append(str(sub.fieldValueFound))
+ for row in f.value:
+ for sub in row.value or []:
+ if sub.name == "nombre":
+ out.append(str(sub.value))
break
return out
diff --git a/tests/unit/test_reapers.py b/tests/unit/test_reapers.py
index 8198fe6..2a7b484 100644
--- a/tests/unit/test_reapers.py
+++ b/tests/unit/test_reapers.py
@@ -1,11 +1,11 @@
# Copyright 2026 Firefly Software Solutions Inc
-""":class:`JobReaper` / :class:`BboxReaper` -- orphan-revival sweep.
+""":class:`ExtractionReaper` / :class:`BboxReaper` -- orphan-revival sweep.
These tests verify the reaper finds rows stuck in non-terminal states
and republishes the right EDA event for them. Concurrency safety
(duplicate publishes from multiple replicas being deduped at claim
-time) is covered separately in ``test_extraction_job_repository.py``
-and ``test_worker_concurrency.py``.
+time) is covered separately in ``test_extraction_repository.py`` and
+``test_worker_concurrency.py``.
"""
from __future__ import annotations
@@ -20,22 +20,22 @@
from flydocs.config import IDPSettings
from flydocs.core.services.workers.bbox_reaper import BboxReaper
-from flydocs.core.services.workers.job_reaper import JobReaper
-from flydocs.models.entities.extraction_job import Base, ExtractionJob
-from flydocs.models.repositories import ExtractionJobRepository
+from flydocs.core.services.workers.job_reaper import ExtractionReaper
+from flydocs.models.entities.extraction import Base, Extraction
+from flydocs.models.repositories import ExtractionRepository
-async def _fresh_repo() -> ExtractionJobRepository:
+async def _fresh_repo() -> ExtractionRepository:
engine = create_async_engine("sqlite+aiosqlite:///:memory:", future=True)
async with engine.begin() as conn:
await conn.run_sync(Base.metadata.create_all)
factory = async_sessionmaker(engine, expire_on_commit=False)
- return ExtractionJobRepository(factory, engine=engine)
+ return ExtractionRepository(factory, engine=engine)
-async def _seed(repo: ExtractionJobRepository, **overrides) -> ExtractionJob:
- job = ExtractionJob(
- status=overrides.get("status", "QUEUED"),
+async def _seed(repo: ExtractionRepository, **overrides) -> Extraction:
+ ext = Extraction(
+ status=overrides.get("status", "queued"),
filename=overrides.get("filename", "test.pdf"),
content_sha256=overrides.get("content_sha256", "0" * 64),
content_bytes=overrides.get("content_bytes", 1),
@@ -44,23 +44,24 @@ async def _seed(repo: ExtractionJobRepository, **overrides) -> ExtractionJob:
metadata_json=overrides.get("metadata_json", {}),
attempts=overrides.get("attempts", 0),
started_at=overrides.get("started_at"),
- bbox_refine_status=overrides.get("bbox_refine_status"),
- bbox_refine_started_at=overrides.get("bbox_refine_started_at"),
+ finished_at=overrides.get("finished_at"),
+ post_processing_bbox_status=overrides.get("post_processing_bbox_status"),
+ post_processing_bbox_started_at=overrides.get("post_processing_bbox_started_at"),
)
- job = await repo.add(job)
- if "created_at" in overrides:
- # Backdate created_at so the orphan-threshold tests don't have
+ ext = await repo.add(ext)
+ if "submitted_at" in overrides:
+ # Backdate submitted_at so the orphan-threshold tests don't have
# to wait wall-clock time. SQLAlchemy server_default fires on
# INSERT; we override here after the row exists.
async with repo._session_factory() as session: # type: ignore[attr-defined]
await session.execute(
- update(ExtractionJob)
- .where(ExtractionJob.id == job.id)
- .values(created_at=overrides["created_at"])
+ update(Extraction)
+ .where(Extraction.id == ext.id)
+ .values(submitted_at=overrides["submitted_at"])
)
await session.commit()
- job = await repo.get(job.id) # type: ignore[assignment]
- return job
+ ext = await repo.get(ext.id) # type: ignore[assignment]
+ return ext
# ---------------------------------------------------------- find_stale_* tests
@@ -70,8 +71,8 @@ async def _seed(repo: ExtractionJobRepository, **overrides) -> ExtractionJob:
async def test_find_stale_running_only_returns_rows_past_lease() -> None:
repo = await _fresh_repo()
now = datetime.now(UTC)
- fresh = await _seed(repo, status="RUNNING", started_at=now)
- stale = await _seed(repo, status="RUNNING", started_at=now - timedelta(seconds=600))
+ fresh = await _seed(repo, status="running", started_at=now)
+ stale = await _seed(repo, status="running", started_at=now - timedelta(seconds=600))
result = await repo.find_stale_running(lease_seconds=300)
assert fresh.id not in result
@@ -82,7 +83,7 @@ async def test_find_stale_running_only_returns_rows_past_lease() -> None:
async def test_find_stale_running_excludes_succeeded_failed_cancelled() -> None:
repo = await _fresh_repo()
long_ago = datetime.now(UTC) - timedelta(seconds=3600)
- for terminal in ("SUCCEEDED", "FAILED", "CANCELLED"):
+ for terminal in ("succeeded", "failed", "cancelled"):
await _seed(repo, status=terminal, started_at=long_ago)
result = await repo.find_stale_running(lease_seconds=60)
@@ -91,13 +92,13 @@ async def test_find_stale_running_excludes_succeeded_failed_cancelled() -> None:
@pytest.mark.asyncio
async def test_find_stale_queued_picks_up_brand_new_submit_orphans() -> None:
- """started_at IS NULL → fall back to created_at via COALESCE."""
+ """started_at IS NULL -> fall back to submitted_at via COALESCE."""
repo = await _fresh_repo()
- fresh = await _seed(repo, status="QUEUED") # created_at = now
+ fresh = await _seed(repo, status="queued") # submitted_at = now
old = await _seed(
repo,
- status="QUEUED",
- created_at=datetime.now(UTC) - timedelta(seconds=1200),
+ status="queued",
+ submitted_at=datetime.now(UTC) - timedelta(seconds=1200),
)
result = await repo.find_stale_queued(older_than_seconds=600)
@@ -107,32 +108,32 @@ async def test_find_stale_queued_picks_up_brand_new_submit_orphans() -> None:
@pytest.mark.asyncio
async def test_find_stale_queued_picks_up_retry_orphans() -> None:
- """status=QUEUED with started_at IS NOT NULL (post-requeue)."""
+ """status=queued with started_at IS NOT NULL (post-requeue)."""
repo = await _fresh_repo()
long_ago = datetime.now(UTC) - timedelta(seconds=1200)
- requeued = await _seed(repo, status="QUEUED", started_at=long_ago)
+ requeued = await _seed(repo, status="queued", started_at=long_ago)
result = await repo.find_stale_queued(older_than_seconds=600)
assert requeued.id in result
@pytest.mark.asyncio
-async def test_find_stale_refining_bboxes_lease_based() -> None:
+async def test_find_stale_bbox_refining_lease_based() -> None:
repo = await _fresh_repo()
fresh = await _seed(
repo,
- status="REFINING_BBOXES",
- bbox_refine_status="running",
- bbox_refine_started_at=datetime.now(UTC),
+ status="succeeded",
+ post_processing_bbox_status="running",
+ post_processing_bbox_started_at=datetime.now(UTC),
)
stale = await _seed(
repo,
- status="REFINING_BBOXES",
- bbox_refine_status="running",
- bbox_refine_started_at=datetime.now(UTC) - timedelta(seconds=1200),
+ status="succeeded",
+ post_processing_bbox_status="running",
+ post_processing_bbox_started_at=datetime.now(UTC) - timedelta(seconds=1200),
)
- result = await repo.find_stale_refining_bboxes(lease_seconds=300)
+ result = await repo.find_stale_bbox_refining(lease_seconds=300)
assert fresh.id not in result
assert stale.id in result
@@ -142,39 +143,41 @@ async def test_find_pending_bbox_revive_covers_both_subcases() -> None:
repo = await _fresh_repo()
now = datetime.now(UTC)
- # Case A: bbox_refine_started_at IS NULL, started_at is old.
+ # Case A: post_processing_bbox_started_at IS NULL, finished_at is old.
+ # The repository uses ``finished_at`` (the main pipeline's terminal
+ # timestamp) as the clock for case A.
case_a = await _seed(
repo,
- status="PARTIAL_SUCCEEDED",
- bbox_refine_status="pending",
- started_at=now - timedelta(seconds=2000),
+ status="succeeded",
+ post_processing_bbox_status="pending",
+ finished_at=now - timedelta(seconds=2000),
)
- # Case B: bbox_refine_started_at IS NOT NULL but stale.
+ # Case B: post_processing_bbox_started_at IS NOT NULL but stale.
case_b = await _seed(
repo,
- status="PARTIAL_SUCCEEDED",
- bbox_refine_status="pending",
- started_at=now - timedelta(seconds=120),
- bbox_refine_started_at=now - timedelta(seconds=800),
+ status="succeeded",
+ post_processing_bbox_status="pending",
+ finished_at=now - timedelta(seconds=120),
+ post_processing_bbox_started_at=now - timedelta(seconds=800),
)
- # Fresh case A: bbox_refine_started_at IS NULL but started_at is recent.
+ # Fresh case A: post_processing_bbox_started_at IS NULL but finished_at is recent.
fresh_a = await _seed(
repo,
- status="PARTIAL_SUCCEEDED",
- bbox_refine_status="pending",
- started_at=now,
+ status="succeeded",
+ post_processing_bbox_status="pending",
+ finished_at=now,
)
- # Fresh case B: bbox_refine_started_at IS NOT NULL and recent.
+ # Fresh case B: post_processing_bbox_started_at IS NOT NULL and recent.
fresh_b = await _seed(
repo,
- status="PARTIAL_SUCCEEDED",
- bbox_refine_status="pending",
- bbox_refine_started_at=now - timedelta(seconds=60),
- started_at=now - timedelta(seconds=120),
+ status="succeeded",
+ post_processing_bbox_status="pending",
+ post_processing_bbox_started_at=now - timedelta(seconds=60),
+ finished_at=now - timedelta(seconds=120),
)
result = await repo.find_pending_bbox_revive(
- partial_threshold_seconds=1200,
+ pending_threshold_seconds=1200,
bbox_lease_seconds=600,
)
assert case_a.id in result
@@ -197,25 +200,25 @@ def _settings(**overrides) -> IDPSettings:
@pytest.mark.asyncio
-async def test_job_reaper_republishes_stale_running_and_queued() -> None:
+async def test_extraction_reaper_republishes_stale_running_and_queued() -> None:
repo = await _fresh_repo()
now = datetime.now(UTC)
- stale_running = await _seed(repo, status="RUNNING", started_at=now - timedelta(seconds=2000))
+ stale_running = await _seed(repo, status="running", started_at=now - timedelta(seconds=2000))
orphan_queued = await _seed(
repo,
- status="QUEUED",
- created_at=now - timedelta(seconds=1200),
+ status="queued",
+ submitted_at=now - timedelta(seconds=1200),
)
- fresh_running = await _seed(repo, status="RUNNING", started_at=now)
- succeeded = await _seed(repo, status="SUCCEEDED")
+ fresh_running = await _seed(repo, status="running", started_at=now)
+ succeeded = await _seed(repo, status="succeeded")
publisher = _make_publisher()
settings = _settings(
job_run_lease_s=1260,
queued_orphan_threshold_s=600,
)
- reaper = JobReaper(
+ reaper = ExtractionReaper(
repository=repo,
event_publisher=publisher,
settings=settings,
@@ -223,7 +226,7 @@ async def test_job_reaper_republishes_stale_running_and_queued() -> None:
await reaper._sweep()
- published_ids = [call.kwargs["payload"]["job_id"] for call in publisher.publish.await_args_list]
+ published_ids = [call.kwargs["payload"]["extraction"]["id"] for call in publisher.publish.await_args_list]
assert stale_running.id in published_ids
assert orphan_queued.id in published_ids
assert fresh_running.id not in published_ids
@@ -237,23 +240,23 @@ async def test_bbox_reaper_republishes_stale_refining_and_pending() -> None:
stale_refining = await _seed(
repo,
- status="REFINING_BBOXES",
- bbox_refine_status="running",
- bbox_refine_started_at=now - timedelta(seconds=2000),
+ status="succeeded",
+ post_processing_bbox_status="running",
+ post_processing_bbox_started_at=now - timedelta(seconds=2000),
)
pending_orphan = await _seed(
repo,
- status="PARTIAL_SUCCEEDED",
- bbox_refine_status="pending",
- started_at=now - timedelta(seconds=2000),
+ status="succeeded",
+ post_processing_bbox_status="pending",
+ finished_at=now - timedelta(seconds=2000),
)
fresh_refining = await _seed(
repo,
- status="REFINING_BBOXES",
- bbox_refine_status="running",
- bbox_refine_started_at=now,
+ status="succeeded",
+ post_processing_bbox_status="running",
+ post_processing_bbox_started_at=now,
)
- already_done = await _seed(repo, status="SUCCEEDED")
+ already_done = await _seed(repo, status="succeeded")
publisher = _make_publisher()
settings = _settings(
@@ -268,7 +271,7 @@ async def test_bbox_reaper_republishes_stale_refining_and_pending() -> None:
await reaper._sweep()
- published_ids = [call.kwargs["payload"]["job_id"] for call in publisher.publish.await_args_list]
+ published_ids = [call.kwargs["payload"]["extraction"]["id"] for call in publisher.publish.await_args_list]
assert stale_refining.id in published_ids
assert pending_orphan.id in published_ids
assert fresh_refining.id not in published_ids
@@ -281,7 +284,7 @@ async def test_reaper_sweep_loop_stops_when_signalled() -> None:
repo = await _fresh_repo()
publisher = _make_publisher()
settings = _settings(reaper_sweep_interval_s=1)
- reaper = JobReaper(repository=repo, event_publisher=publisher, settings=settings)
+ reaper = ExtractionReaper(repository=repo, event_publisher=publisher, settings=settings)
task = asyncio.create_task(reaper.run_forever())
# Let at least one sweep run.
@@ -296,12 +299,12 @@ async def test_reaper_sweep_survives_publisher_failure() -> None:
"""A publish error during one sweep doesn't kill the loop."""
repo = await _fresh_repo()
now = datetime.now(UTC)
- await _seed(repo, status="RUNNING", started_at=now - timedelta(seconds=2000))
+ await _seed(repo, status="running", started_at=now - timedelta(seconds=2000))
publisher = _make_publisher()
publisher.publish = AsyncMock(side_effect=RuntimeError("broker down"))
settings = _settings(reaper_sweep_interval_s=1, job_run_lease_s=300)
- reaper = JobReaper(repository=repo, event_publisher=publisher, settings=settings)
+ reaper = ExtractionReaper(repository=repo, event_publisher=publisher, settings=settings)
task = asyncio.create_task(reaper.run_forever())
await asyncio.sleep(0.05)
diff --git a/tests/unit/test_request_validator.py b/tests/unit/test_request_validator.py
index 8d570ae..415063d 100644
--- a/tests/unit/test_request_validator.py
+++ b/tests/unit/test_request_validator.py
@@ -8,14 +8,14 @@
import pytest
from flydocs.core.services.validation import RequestValidator
-from flydocs.interfaces.dtos.doc import DocSpec, DocType, ValidatorsSpec, VisualValidatorSpec
+from flydocs.interfaces.dtos.document_type import DocumentTypeSpec, VisualCheck
from flydocs.interfaces.dtos.extract import (
- DocumentInput,
ExtractionOptions,
ExtractionRequest,
+ FileInput,
StageToggles,
)
-from flydocs.interfaces.dtos.field import FieldGroup, FieldSpec
+from flydocs.interfaces.dtos.field import Field, FieldGroup
from flydocs.interfaces.dtos.rule import (
RuleFieldParent,
RuleOutputSpec,
@@ -28,31 +28,29 @@
_DUMMY_B64 = base64.b64encode(b"%PDF-1.4 dummy").decode("ascii")
-def _doc(doc_type: str = "passport", *, with_visual: bool = False) -> DocSpec:
- return DocSpec(
- docType=DocType(documentType=doc_type, description="x", country="ES"),
- fieldGroups=[
+def _doc(doc_type: str = "passport", *, with_visual: bool = False) -> DocumentTypeSpec:
+ return DocumentTypeSpec(
+ id=doc_type,
+ description="x",
+ country="ES",
+ field_groups=[
FieldGroup(
- fieldGroupName="g",
- fieldGroupDesc="",
- fieldGroupFields=[
- FieldSpec(fieldName="full_name", fieldDescription="x", fieldType=FieldType.STRING),
- FieldSpec(fieldName="nif", fieldDescription="x", fieldType=FieldType.STRING),
+ name="g",
+ description="",
+ fields=[
+ Field(name="full_name", description="x", type=FieldType.STRING),
+ Field(name="nif", description="x", type=FieldType.STRING),
],
)
],
- validators=ValidatorsSpec(
- visual=[VisualValidatorSpec(name="photo_present", description="x")] if with_visual else []
- ),
+ visual_checks=[VisualCheck(name="photo_present", description="x")] if with_visual else [],
)
-def _request(*, docs=None, rules=None, options=None) -> ExtractionRequest:
+def _request(*, document_types=None, rules=None, options=None) -> ExtractionRequest:
return ExtractionRequest(
- documents=[
- DocumentInput(filename="x.pdf", content_base64=_DUMMY_B64, content_type="application/pdf")
- ],
- docs=docs or [_doc()],
+ files=[FileInput(filename="x.pdf", content_base64=_DUMMY_B64, content_type="application/pdf")],
+ document_types=document_types or [_doc()],
rules=rules or [],
options=options or ExtractionOptions(),
)
@@ -77,7 +75,7 @@ def test_valid_rule_with_field_parent(validator: RequestValidator) -> None:
RuleSpec(
id="r1",
predicate="full_name is set.",
- parents=[RuleFieldParent(parentType="field", documentType="passport", fieldNames=["full_name"])],
+ parents=[RuleFieldParent(kind="field", document_type="passport", fields=["full_name"])],
output=RuleOutputSpec(type="boolean"),
)
]
@@ -85,7 +83,7 @@ def test_valid_rule_with_field_parent(validator: RequestValidator) -> None:
assert not report.has_errors
-# -- error: rule references unknown docType ----------------------------------
+# -- error: rule references unknown document type ----------------------------
def test_rule_unknown_doctype(validator: RequestValidator) -> None:
@@ -93,7 +91,7 @@ def test_rule_unknown_doctype(validator: RequestValidator) -> None:
RuleSpec(
id="r1",
predicate="x",
- parents=[RuleFieldParent(parentType="field", documentType="invoice", fieldNames=["foo"])],
+ parents=[RuleFieldParent(kind="field", document_type="invoice", fields=["foo"])],
)
]
report = validator.validate(_request(rules=rules))
@@ -110,7 +108,7 @@ def test_rule_unknown_field(validator: RequestValidator) -> None:
RuleSpec(
id="r1",
predicate="x",
- parents=[RuleFieldParent(parentType="field", documentType="passport", fieldNames=["nope"])],
+ parents=[RuleFieldParent(kind="field", document_type="passport", fields=["nope"])],
)
]
report = validator.validate(_request(rules=rules))
@@ -126,12 +124,10 @@ def test_rule_unknown_validator(validator: RequestValidator) -> None:
RuleSpec(
id="r1",
predicate="x",
- parents=[
- RuleValidatorParent(parentType="validator", documentType="passport", validatorName="missing")
- ],
+ parents=[RuleValidatorParent(kind="validator", document_type="passport", validator="missing")],
)
]
- report = validator.validate(_request(docs=[_doc()], rules=rules))
+ report = validator.validate(_request(document_types=[_doc()], rules=rules))
codes = [i.code for i in report.errors]
assert "rule_unknown_validator" in codes
@@ -142,13 +138,11 @@ def test_rule_validator_parent_ok_when_declared(validator: RequestValidator) ->
id="r1",
predicate="x",
parents=[
- RuleValidatorParent(
- parentType="validator", documentType="passport", validatorName="photo_present"
- )
+ RuleValidatorParent(kind="validator", document_type="passport", validator="photo_present")
],
)
]
- report = validator.validate(_request(docs=[_doc(with_visual=True)], rules=rules))
+ report = validator.validate(_request(document_types=[_doc(with_visual=True)], rules=rules))
assert not report.has_errors
@@ -160,7 +154,7 @@ def test_rule_unknown_parent_rule(validator: RequestValidator) -> None:
RuleSpec(
id="r1",
predicate="x",
- parents=[RuleRuleParent(parentType="rule", ruleId="ghost")],
+ parents=[RuleRuleParent(kind="rule", rule="ghost")],
)
]
report = validator.validate(_request(rules=rules))
@@ -176,7 +170,7 @@ def test_rule_self_reference(validator: RequestValidator) -> None:
RuleSpec(
id="r1",
predicate="x",
- parents=[RuleRuleParent(parentType="rule", ruleId="r1")],
+ parents=[RuleRuleParent(kind="rule", rule="r1")],
)
]
report = validator.validate(_request(rules=rules))
@@ -189,8 +183,8 @@ def test_rule_self_reference(validator: RequestValidator) -> None:
def test_rule_cycle(validator: RequestValidator) -> None:
rules = [
- RuleSpec(id="a", predicate="x", parents=[RuleRuleParent(parentType="rule", ruleId="b")]),
- RuleSpec(id="b", predicate="x", parents=[RuleRuleParent(parentType="rule", ruleId="a")]),
+ RuleSpec(id="a", predicate="x", parents=[RuleRuleParent(kind="rule", rule="b")]),
+ RuleSpec(id="b", predicate="x", parents=[RuleRuleParent(kind="rule", rule="a")]),
]
report = validator.validate(_request(rules=rules))
codes = [i.code for i in report.errors]
@@ -210,12 +204,12 @@ def test_duplicate_rule_id(validator: RequestValidator) -> None:
assert "duplicate_rule_id" in codes
-# -- error: duplicate docType across docs[] ----------------------------------
+# -- error: duplicate document type across document_types[] ------------------
def test_duplicate_document_type(validator: RequestValidator) -> None:
- docs = [_doc("passport"), _doc("passport")]
- report = validator.validate(_request(docs=docs))
+ document_types = [_doc("passport"), _doc("passport")]
+ report = validator.validate(_request(document_types=document_types))
codes = [i.code for i in report.errors]
assert "duplicate_document_type" in codes
@@ -242,12 +236,12 @@ def test_splitter_single_doc_is_warning_only(validator: RequestValidator) -> Non
assert "splitter_single_doc" in codes
-# -- warning: visual_authenticity on but no visual validators ----------------
+# -- warning: visual_authenticity on but no visual checks --------------------
def test_visual_auth_without_validators_is_warning_only(validator: RequestValidator) -> None:
options = ExtractionOptions(stages=StageToggles(visual_authenticity=True))
- report = validator.validate(_request(docs=[_doc()], options=options))
+ report = validator.validate(_request(document_types=[_doc()], options=options))
assert not report.has_errors
codes = [i.code for i in report.warnings]
assert "visual_authenticity_no_validators" in codes
diff --git a/tests/unit/test_standard_validators.py b/tests/unit/test_standard_validators.py
deleted file mode 100644
index 1c64194..0000000
--- a/tests/unit/test_standard_validators.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# Copyright 2026 Firefly Software Solutions Inc
-"""Unit tests for :class:`StandardValidator` checkers.
-
-Covers the cases real callers rely on: email, IBAN checksum, NIF
-checksum, Luhn credit card, phone E.164, lat/long bounds. Each test
-has a tight reason; there is no shotgun coverage."""
-
-from __future__ import annotations
-
-import pytest
-
-from flydocs.core.services.validation.standard_validator_registry import run_standard_validator
-from flydocs.interfaces.enums.standard_validator import StandardValidatorType
-
-
-@pytest.mark.parametrize(
- "value",
- ["jane.doe@example.com", "User+tag@example.co.uk"],
-)
-def test_email_accepts_valid(value: str) -> None:
- assert run_standard_validator(StandardValidatorType.EMAIL, value, {}) is None
-
-
-@pytest.mark.parametrize("value", ["", "not-an-email", "@example.com", "x@y"])
-def test_email_rejects_invalid(value: str) -> None:
- assert run_standard_validator(StandardValidatorType.EMAIL, value, {}) is not None
-
-
-def test_iban_valid_checksum() -> None:
- # A canonical valid IBAN
- assert run_standard_validator(StandardValidatorType.IBAN, "GB82 WEST 1234 5698 7654 32", {}) is None
-
-
-def test_iban_invalid_checksum() -> None:
- assert run_standard_validator(StandardValidatorType.IBAN, "GB82WEST12345698765499", {}) is not None
-
-
-@pytest.mark.parametrize(
- "card,ok",
- [
- ("4242 4242 4242 4242", True), # Stripe-test
- ("4111-1111-1111-1111", True), # Visa-test
- ("4242 4242 4242 4243", False), # Luhn fails
- ],
-)
-def test_credit_card_luhn(card: str, ok: bool) -> None:
- result = run_standard_validator(StandardValidatorType.CREDIT_CARD, card, {})
- if ok:
- assert result is None
- else:
- assert result is not None
-
-
-def test_phone_e164() -> None:
- assert run_standard_validator(StandardValidatorType.PHONE_E164, "+34612345678", {}) is None
- assert run_standard_validator(StandardValidatorType.PHONE_E164, "abc", {}) is not None
-
-
-def test_latitude_bounds() -> None:
- assert run_standard_validator(StandardValidatorType.LATITUDE, 12.34, {}) is None
- assert run_standard_validator(StandardValidatorType.LATITUDE, 91.0, {}) is not None
-
-
-def test_nif_checksum_spanish() -> None:
- # Known-valid NIF (8 digits + control letter)
- assert run_standard_validator(StandardValidatorType.NIF, "12345678Z", {}) is None
- assert run_standard_validator(StandardValidatorType.NIF, "12345678A", {}) is not None
-
-
-def test_nie_checksum_spanish() -> None:
- # Known-valid NIE
- assert run_standard_validator(StandardValidatorType.NIE, "X1234567L", {}) is None
- assert run_standard_validator(StandardValidatorType.NIE, "X1234567Z", {}) is not None
-
-
-def test_postal_code_country_aware() -> None:
- es_ok = run_standard_validator(StandardValidatorType.POSTAL_CODE, "28013", {"country": "ES"})
- es_bad = run_standard_validator(StandardValidatorType.POSTAL_CODE, "ABC", {"country": "ES"})
- gb_ok = run_standard_validator(StandardValidatorType.POSTAL_CODE, "SW1A 1AA", {"country": "GB"})
- assert es_ok is None and es_bad is not None and gb_ok is None
diff --git a/tests/unit/test_submit_extraction_handler.py b/tests/unit/test_submit_extraction_handler.py
new file mode 100644
index 0000000..802e0c2
--- /dev/null
+++ b/tests/unit/test_submit_extraction_handler.py
@@ -0,0 +1,267 @@
+# Copyright 2026 Firefly Software Solutions Inc
+""":class:`SubmitExtractionHandler` -- persistence shape.
+
+These tests pin the contract between the REST DTO and what the worker
+later finds in ``Extraction.schema_json``. Every submission writes a
+``files`` list (single-file submits are just a 1-element list).
+The DB row's ``filename`` column gets a summary ("first.pdf (+N more)")
+for multi-file submits and ``content_sha256`` hashes the concatenation
+of every file's bytes so idempotency still collapses identical retries.
+"""
+
+from __future__ import annotations
+
+import base64
+from typing import Any
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+from flydocs.core.services.extractions.submit_extraction_handler import (
+ SubmitExtractionCommand,
+ SubmitExtractionHandler,
+)
+from flydocs.core.services.validation import ValidationReport
+from flydocs.interfaces.dtos.document_type import DocumentTypeSpec
+from flydocs.interfaces.dtos.extract import FileInput
+from flydocs.interfaces.dtos.extraction import SubmitExtractionRequest
+from flydocs.interfaces.dtos.field import Field, FieldGroup
+from flydocs.interfaces.enums.extraction_status import ExtractionStatus
+from flydocs.interfaces.enums.field_type import FieldType
+
+
+def _pdf_b64(marker: bytes) -> str:
+ return base64.b64encode(b"%PDF-1.4\n" + marker + b"\n%%EOF\n").decode()
+
+
+def _doc_spec() -> DocumentTypeSpec:
+ return DocumentTypeSpec(
+ id="invoice",
+ description="test",
+ field_groups=[
+ FieldGroup(
+ name="g",
+ fields=[
+ Field(name="f", description="x", type=FieldType.STRING),
+ ],
+ )
+ ],
+ )
+
+
+def _handler() -> tuple[SubmitExtractionHandler, MagicMock, dict[str, Any]]:
+ repository = MagicMock()
+ repository.get_by_idempotency_key = AsyncMock(return_value=None)
+
+ captured: dict[str, Any] = {}
+
+ async def _add(ext: Any) -> Any:
+ captured["ext"] = ext
+ ext.id = "ext_TEST00000000000000000000000"
+ from datetime import UTC, datetime
+
+ ext.submitted_at = datetime.now(UTC)
+ # Defaults the projector reads from a real row.
+ for attr, default in (
+ ("started_at", None),
+ ("finished_at", None),
+ ("attempts", 0),
+ ("error_code", None),
+ ("error_message", None),
+ ("post_processing_bbox_status", None),
+ ("post_processing_bbox_attempts", 0),
+ ("post_processing_bbox_started_at", None),
+ ("post_processing_bbox_finished_at", None),
+ ("post_processing_bbox_error_code", None),
+ ("post_processing_bbox_error_message", None),
+ ):
+ if not hasattr(ext, attr) or getattr(ext, attr) is None:
+ setattr(ext, attr, default)
+ return ext
+
+ repository.add = AsyncMock(side_effect=_add)
+
+ publisher = MagicMock()
+ publisher.publish = AsyncMock()
+
+ validator = MagicMock()
+ validator.validate = MagicMock(return_value=ValidationReport(issues=[]))
+
+ settings = MagicMock()
+ settings.jobs_topic = "extractions.queue"
+
+ handler = SubmitExtractionHandler(
+ repository=repository,
+ event_publisher=publisher,
+ validator=validator,
+ settings=settings,
+ )
+ return handler, repository, captured
+
+
+@pytest.mark.asyncio
+async def test_single_file_submit_persists_files_list() -> None:
+ """A 1-element ``files`` list is the only shape we accept."""
+ handler, _, captured = _handler()
+ request = SubmitExtractionRequest(
+ files=[
+ FileInput(
+ filename="invoice.pdf",
+ content_base64=_pdf_b64(b"alpha"),
+ content_type="application/pdf",
+ )
+ ],
+ document_types=[_doc_spec()],
+ )
+ response = await handler.do_handle(SubmitExtractionCommand(request=request))
+
+ assert response.status is ExtractionStatus.QUEUED
+ ext = captured["ext"]
+ assert ext.filename == "invoice.pdf"
+ assert "files" in ext.schema_json
+ assert len(ext.schema_json["files"]) == 1
+ assert ext.schema_json["files"][0]["filename"] == "invoice.pdf"
+ assert ext.content_bytes > 0
+
+
+@pytest.mark.asyncio
+async def test_multi_file_submit_persists_files_list() -> None:
+ handler, _, captured = _handler()
+ request = SubmitExtractionRequest(
+ files=[
+ FileInput(
+ filename=f"deed_{i}.pdf",
+ content_base64=_pdf_b64(bytes([0x30 + i])),
+ content_type="application/pdf",
+ )
+ for i in range(3)
+ ],
+ document_types=[_doc_spec()],
+ )
+ await handler.do_handle(SubmitExtractionCommand(request=request))
+
+ ext = captured["ext"]
+ assert ext.filename.startswith("deed_0.pdf")
+ assert "(+2 more)" in ext.filename
+ assert "files" in ext.schema_json
+ assert len(ext.schema_json["files"]) == 3
+ for entry in ext.schema_json["files"]:
+ assert entry["content_type"] == "application/pdf"
+ assert entry["filename"].startswith("deed_")
+ assert entry["content_base64"]
+
+
+@pytest.mark.asyncio
+async def test_multi_file_idempotency_hash_includes_every_file() -> None:
+ """Same files in the same order produce the same content_sha256."""
+ handler_a, _, captured_a = _handler()
+ handler_b, _, captured_b = _handler()
+ files = [
+ FileInput(
+ filename=f"d_{i}.pdf",
+ content_base64=_pdf_b64(bytes([0x40 + i])),
+ content_type="application/pdf",
+ )
+ for i in range(2)
+ ]
+ await handler_a.do_handle(
+ SubmitExtractionCommand(request=SubmitExtractionRequest(files=files, document_types=[_doc_spec()]))
+ )
+ await handler_b.do_handle(
+ SubmitExtractionCommand(request=SubmitExtractionRequest(files=files, document_types=[_doc_spec()]))
+ )
+ assert captured_a["ext"].content_sha256 == captured_b["ext"].content_sha256
+
+
+def test_request_rejects_empty_files() -> None:
+ """``files`` is required and must have at least one entry."""
+ with pytest.raises(ValueError):
+ SubmitExtractionRequest(files=[], document_types=[_doc_spec()])
+
+
+@pytest.mark.asyncio
+async def test_concurrent_idempotent_submit_resolves_winning_row() -> None:
+ """Two submits with the same key racing past the SELECT must not 500.
+
+ Concrete scenario: the SELECT-by-key inside ``do_handle`` returns
+ ``None`` (no existing row), so the handler proceeds to INSERT. The
+ partial unique index on the DB raises ``IntegrityError`` for the
+ loser. The handler must catch it, re-resolve the winning row by
+ key, and return its identifier with the idempotent shape.
+ """
+ from datetime import UTC, datetime
+
+ from sqlalchemy.exc import IntegrityError
+
+ handler, repository, _ = _handler()
+
+ # First call: SELECT returns None, INSERT raises IntegrityError, then
+ # the recovery path SELECTs again and finds the winning row.
+ winning_row = MagicMock()
+ winning_row.id = "ext_WIN00000000000000000000000"
+ winning_row.status = ExtractionStatus.QUEUED.value
+ winning_row.submitted_at = datetime.now(UTC)
+ winning_row.started_at = None
+ winning_row.finished_at = None
+ winning_row.attempts = 0
+ winning_row.error_code = None
+ winning_row.error_message = None
+ winning_row.post_processing_bbox_status = None
+ winning_row.post_processing_bbox_attempts = 0
+ winning_row.post_processing_bbox_started_at = None
+ winning_row.post_processing_bbox_finished_at = None
+ winning_row.post_processing_bbox_error_code = None
+ winning_row.post_processing_bbox_error_message = None
+
+ select_calls = {"n": 0}
+
+ async def _get_by_key(key: str):
+ select_calls["n"] += 1
+ # First call (before INSERT) returns None; second call (after
+ # IntegrityError) returns the winner.
+ return None if select_calls["n"] == 1 else winning_row
+
+ repository.get_by_idempotency_key = AsyncMock(side_effect=_get_by_key)
+ repository.IntegrityError = IntegrityError
+ repository.add = AsyncMock(side_effect=IntegrityError("", None, None))
+
+ request = SubmitExtractionRequest(
+ files=[
+ FileInput(
+ filename="invoice.pdf",
+ content_base64=_pdf_b64(b"x"),
+ content_type="application/pdf",
+ )
+ ],
+ document_types=[_doc_spec()],
+ )
+ response = await handler.do_handle(SubmitExtractionCommand(request=request, idempotency_key="dupe-key"))
+
+ assert response.id == "ext_WIN00000000000000000000000"
+ assert response.status is ExtractionStatus.QUEUED
+ # Two SELECTs: pre-INSERT probe + post-IntegrityError recovery probe.
+ assert select_calls["n"] == 2
+
+
+@pytest.mark.asyncio
+async def test_idempotent_submit_reraises_if_no_winner_resolved() -> None:
+ """IntegrityError with no recoverable row re-raises (vanishingly rare)."""
+ from sqlalchemy.exc import IntegrityError
+
+ handler, repository, _ = _handler()
+ repository.get_by_idempotency_key = AsyncMock(return_value=None)
+ repository.IntegrityError = IntegrityError
+ repository.add = AsyncMock(side_effect=IntegrityError("", None, None))
+
+ request = SubmitExtractionRequest(
+ files=[
+ FileInput(
+ filename="invoice.pdf",
+ content_base64=_pdf_b64(b"x"),
+ content_type="application/pdf",
+ )
+ ],
+ document_types=[_doc_spec()],
+ )
+ with pytest.raises(IntegrityError):
+ await handler.do_handle(SubmitExtractionCommand(request=request, idempotency_key="k"))
diff --git a/tests/unit/test_submit_job_handler.py b/tests/unit/test_submit_job_handler.py
deleted file mode 100644
index 1b9cd06..0000000
--- a/tests/unit/test_submit_job_handler.py
+++ /dev/null
@@ -1,234 +0,0 @@
-# Copyright 2026 Firefly Software Solutions Inc
-""":class:`SubmitJobHandler` -- persistence shape.
-
-These tests pin the contract between the REST DTO and what the worker
-later finds in ``ExtractionJob.schema_json``. Every submission writes
-a ``documents`` list (single-file submits are just a 1-element list).
-The DB row's ``filename`` column gets a summary ("first.pdf (+N more)")
-for multi-file submits and ``content_sha256`` hashes the concatenation
-of every file's bytes so idempotency still collapses identical retries.
-"""
-
-from __future__ import annotations
-
-import base64
-from typing import Any
-from unittest.mock import AsyncMock, MagicMock
-
-import pytest
-
-from flydocs.core.services.jobs.submit_job_handler import (
- SubmitJobCommand,
- SubmitJobHandler,
-)
-from flydocs.core.services.validation import ValidationReport
-from flydocs.interfaces.dtos.doc import DocSpec, DocType
-from flydocs.interfaces.dtos.extract import DocumentInput
-from flydocs.interfaces.dtos.job import SubmitJobRequest
-from flydocs.interfaces.enums.job_status import JobStatus
-
-
-def _pdf_b64(marker: bytes) -> str:
- return base64.b64encode(b"%PDF-1.4\n" + marker + b"\n%%EOF\n").decode()
-
-
-def _doc_spec() -> DocSpec:
- return DocSpec(
- docType=DocType(documentType="invoice", description="test"),
- fieldGroups=[
- {
- "fieldGroupName": "g",
- "fieldGroupFields": [{"fieldName": "f", "fieldDescription": "x", "fieldType": "string"}],
- }
- ],
- )
-
-
-def _handler() -> tuple[SubmitJobHandler, MagicMock, MagicMock]:
- repository = MagicMock()
- repository.get_by_idempotency_key = AsyncMock(return_value=None)
-
- captured: dict[str, Any] = {}
-
- async def _add(job: Any) -> Any:
- captured["job"] = job
- job.id = "test-job-id"
- from datetime import UTC, datetime
-
- job.created_at = datetime.now(UTC)
- return job
-
- repository.add = AsyncMock(side_effect=_add)
-
- publisher = MagicMock()
- publisher.publish = AsyncMock()
-
- validator = MagicMock()
- validator.validate = MagicMock(return_value=ValidationReport(issues=[]))
-
- settings = MagicMock()
- settings.jobs_topic = "jobs.extract"
- settings.jobs_event_type = "job.submitted"
-
- handler = SubmitJobHandler(
- repository=repository,
- event_publisher=publisher,
- validator=validator,
- settings=settings,
- )
- return handler, repository, captured # type: ignore[return-value]
-
-
-@pytest.mark.asyncio
-async def test_single_file_submit_persists_documents_list() -> None:
- """A 1-element ``documents`` list is the only shape we accept."""
- handler, _, captured = _handler()
- request = SubmitJobRequest(
- documents=[
- DocumentInput(
- filename="invoice.pdf",
- content_base64=_pdf_b64(b"alpha"),
- content_type="application/pdf",
- )
- ],
- docs=[_doc_spec()],
- )
- response = await handler.do_handle(SubmitJobCommand(request=request))
-
- assert response.status is JobStatus.QUEUED
- job = captured["job"]
- assert job.filename == "invoice.pdf"
- assert "documents" in job.schema_json
- assert len(job.schema_json["documents"]) == 1
- assert job.schema_json["documents"][0]["filename"] == "invoice.pdf"
- assert job.content_bytes > 0
-
-
-@pytest.mark.asyncio
-async def test_multi_file_submit_persists_documents_list() -> None:
- handler, _, captured = _handler()
- request = SubmitJobRequest(
- documents=[
- DocumentInput(
- filename=f"deed_{i}.pdf",
- content_base64=_pdf_b64(bytes([0x30 + i])),
- content_type="application/pdf",
- )
- for i in range(3)
- ],
- docs=[_doc_spec()],
- )
- await handler.do_handle(SubmitJobCommand(request=request))
-
- job = captured["job"]
- assert job.filename.startswith("deed_0.pdf")
- assert "(+2 more)" in job.filename
- assert "documents" in job.schema_json
- assert len(job.schema_json["documents"]) == 3
- for entry in job.schema_json["documents"]:
- assert entry["content_type"] == "application/pdf"
- assert entry["filename"].startswith("deed_")
- assert entry["content_base64"]
-
-
-@pytest.mark.asyncio
-async def test_multi_file_idempotency_hash_includes_every_file() -> None:
- """Same files in the same order produce the same content_sha256."""
- handler_a, _, captured_a = _handler()
- handler_b, _, captured_b = _handler()
- files = [
- DocumentInput(
- filename=f"d_{i}.pdf",
- content_base64=_pdf_b64(bytes([0x40 + i])),
- content_type="application/pdf",
- )
- for i in range(2)
- ]
- await handler_a.do_handle(SubmitJobCommand(request=SubmitJobRequest(documents=files, docs=[_doc_spec()])))
- await handler_b.do_handle(SubmitJobCommand(request=SubmitJobRequest(documents=files, docs=[_doc_spec()])))
- assert captured_a["job"].content_sha256 == captured_b["job"].content_sha256
-
-
-def test_request_rejects_empty_documents() -> None:
- """``documents`` is required and must have at least one entry."""
- with pytest.raises(ValueError):
- SubmitJobRequest(documents=[], docs=[_doc_spec()])
-
-
-@pytest.mark.asyncio
-async def test_concurrent_idempotent_submit_resolves_winning_row() -> None:
- """Two submits with the same key racing past the SELECT must not 500.
-
- Concrete scenario: the SELECT-by-key inside ``do_handle`` returns
- ``None`` (no existing row), so the handler proceeds to INSERT. The
- partial unique index on the DB raises ``IntegrityError`` for the
- loser. The handler must catch it, re-resolve the winning row by
- key, and return its identifier with the idempotent shape.
- """
- from datetime import UTC, datetime
-
- from sqlalchemy.exc import IntegrityError
-
- from flydocs.interfaces.dtos.job import SubmitJobResponse # noqa: F401
-
- handler, repository, captured = _handler()
-
- # First call: SELECT returns None, INSERT raises IntegrityError, then
- # the recovery path SELECTs again and finds the winning row.
- winning_row = MagicMock()
- winning_row.id = "winner-job-id"
- winning_row.status = JobStatus.QUEUED.value
- winning_row.created_at = datetime.now(UTC)
-
- select_calls = {"n": 0}
-
- async def _get_by_key(key: str):
- select_calls["n"] += 1
- # First call (before INSERT) returns None; second call (after
- # IntegrityError) returns the winner.
- return None if select_calls["n"] == 1 else winning_row
-
- repository.get_by_idempotency_key = AsyncMock(side_effect=_get_by_key)
- repository.IntegrityError = IntegrityError
- repository.add = AsyncMock(side_effect=IntegrityError("", None, None))
-
- request = SubmitJobRequest(
- documents=[
- DocumentInput(
- filename="invoice.pdf",
- content_base64=_pdf_b64(b"x"),
- content_type="application/pdf",
- )
- ],
- docs=[_doc_spec()],
- )
- response = await handler.do_handle(SubmitJobCommand(request=request, idempotency_key="dupe-key"))
-
- assert response.job_id == "winner-job-id"
- assert response.status is JobStatus.QUEUED
- # Two SELECTs: pre-INSERT probe + post-IntegrityError recovery probe.
- assert select_calls["n"] == 2
-
-
-@pytest.mark.asyncio
-async def test_idempotent_submit_reraises_if_no_winner_resolved() -> None:
- """IntegrityError with no recoverable row re-raises (vanishingly rare)."""
- from sqlalchemy.exc import IntegrityError
-
- handler, repository, captured = _handler()
- repository.get_by_idempotency_key = AsyncMock(return_value=None)
- repository.IntegrityError = IntegrityError
- repository.add = AsyncMock(side_effect=IntegrityError("", None, None))
-
- request = SubmitJobRequest(
- documents=[
- DocumentInput(
- filename="invoice.pdf",
- content_base64=_pdf_b64(b"x"),
- content_type="application/pdf",
- )
- ],
- docs=[_doc_spec()],
- )
- with pytest.raises(IntegrityError):
- await handler.do_handle(SubmitJobCommand(request=request, idempotency_key="k"))
diff --git a/tests/unit/test_transformation_engine.py b/tests/unit/test_transformation_engine.py
index b8b7783..ba46ad7 100644
--- a/tests/unit/test_transformation_engine.py
+++ b/tests/unit/test_transformation_engine.py
@@ -42,8 +42,8 @@ def __post_init__(self) -> None:
async def apply(self, t, groups):
self.calls.append((t, groups))
produced = ExtractedFieldGroup(
- fieldGroupName=t.output_group or t.target_group,
- fieldGroupFields=[ExtractedField(fieldName="rows", fieldValueFound=[])],
+ name=t.output_group or t.target_group,
+ fields=[ExtractedField(name="rows", value=[])],
)
groups.append(produced)
return produced
@@ -51,15 +51,15 @@ async def apply(self, t, groups):
def _row(values: dict[str, str]) -> ExtractedField:
return ExtractedField(
- fieldName="row",
- fieldValueFound=[ExtractedField(fieldName=k, fieldValueFound=v) for k, v in values.items()],
+ name="row",
+ value=[ExtractedField(name=k, value=v) for k, v in values.items()],
)
def _personas_group(rows: list[ExtractedField]) -> ExtractedFieldGroup:
return ExtractedFieldGroup(
- fieldGroupName="personas",
- fieldGroupFields=[ExtractedField(fieldName="personas", fieldValueFound=rows)],
+ name="personas",
+ fields=[ExtractedField(name="personas", value=rows)],
)
@@ -90,7 +90,7 @@ async def test_dispatch_entity_resolution() -> None:
assert result is not None
assert fake_llm.calls == []
# Dedup happened.
- inner = result.fieldGroupFields[0].fieldValueFound # type: ignore[index]
+ inner = result.fields[0].value # type: ignore[index]
assert isinstance(inner, list) and len(inner) == 1
@@ -130,9 +130,9 @@ async def test_request_scope_consolidates_across_tasks() -> None:
task_b = [_personas_group([_row({"nombre": "Andres Contreras Guillen", "dni": ""})])]
produced = await engine.apply_request_scope(t, [task_a, task_b])
assert produced is not None
- # The synth consolidated group reduces 2 → 1.
- inner = produced.fieldGroupFields[0].fieldValueFound
+ # The synth consolidated group reduces 2 -> 1.
+ inner = produced.fields[0].value
assert isinstance(inner, list) and len(inner) == 1
# Task-scope groups stay untouched.
- assert len(task_a[0].fieldGroupFields[0].fieldValueFound) == 1 # type: ignore[arg-type]
- assert len(task_b[0].fieldGroupFields[0].fieldValueFound) == 1 # type: ignore[arg-type]
+ assert len(task_a[0].fields[0].value) == 1 # type: ignore[arg-type]
+ assert len(task_b[0].fields[0].value) == 1 # type: ignore[arg-type]
diff --git a/tests/unit/test_validators.py b/tests/unit/test_validators.py
new file mode 100644
index 0000000..e0f757a
--- /dev/null
+++ b/tests/unit/test_validators.py
@@ -0,0 +1,80 @@
+# Copyright 2026 Firefly Software Solutions Inc
+"""Unit tests for the built-in validator catalogue.
+
+Covers the cases real callers rely on: email, IBAN checksum, Luhn credit
+card, phone E.164, latitude bounds, NIF/NIE checksums, country-aware postal
+codes. Each test has a tight reason; there is no shotgun coverage."""
+
+from __future__ import annotations
+
+import pytest
+
+from flydocs.core.services.validation.validator_registry import run_validator
+from flydocs.interfaces.enums.validator import ValidatorType
+
+
+@pytest.mark.parametrize(
+ "value",
+ ["jane.doe@example.com", "User+tag@example.co.uk"],
+)
+def test_email_accepts_valid(value: str) -> None:
+ assert run_validator(ValidatorType.EMAIL, value, {}) is None
+
+
+@pytest.mark.parametrize("value", ["", "not-an-email", "@example.com", "x@y"])
+def test_email_rejects_invalid(value: str) -> None:
+ assert run_validator(ValidatorType.EMAIL, value, {}) is not None
+
+
+def test_iban_valid_checksum() -> None:
+ # A canonical valid IBAN
+ assert run_validator(ValidatorType.IBAN, "GB82 WEST 1234 5698 7654 32", {}) is None
+
+
+def test_iban_invalid_checksum() -> None:
+ assert run_validator(ValidatorType.IBAN, "GB82WEST12345698765499", {}) is not None
+
+
+@pytest.mark.parametrize(
+ "card,ok",
+ [
+ ("4242 4242 4242 4242", True), # Stripe-test
+ ("4111-1111-1111-1111", True), # Visa-test
+ ("4242 4242 4242 4243", False), # Luhn fails
+ ],
+)
+def test_credit_card_luhn(card: str, ok: bool) -> None:
+ result = run_validator(ValidatorType.CREDIT_CARD, card, {})
+ if ok:
+ assert result is None
+ else:
+ assert result is not None
+
+
+def test_phone_e164() -> None:
+ assert run_validator(ValidatorType.PHONE_E164, "+34612345678", {}) is None
+ assert run_validator(ValidatorType.PHONE_E164, "abc", {}) is not None
+
+
+def test_latitude_bounds() -> None:
+ assert run_validator(ValidatorType.LATITUDE, 12.34, {}) is None
+ assert run_validator(ValidatorType.LATITUDE, 91.0, {}) is not None
+
+
+def test_nif_checksum_spanish() -> None:
+ # Known-valid NIF (8 digits + control letter)
+ assert run_validator(ValidatorType.NIF, "12345678Z", {}) is None
+ assert run_validator(ValidatorType.NIF, "12345678A", {}) is not None
+
+
+def test_nie_checksum_spanish() -> None:
+ # Known-valid NIE
+ assert run_validator(ValidatorType.NIE, "X1234567L", {}) is None
+ assert run_validator(ValidatorType.NIE, "X1234567Z", {}) is not None
+
+
+def test_postal_code_country_aware() -> None:
+ es_ok = run_validator(ValidatorType.POSTAL_CODE, "28013", {"country": "ES"})
+ es_bad = run_validator(ValidatorType.POSTAL_CODE, "ABC", {"country": "ES"})
+ gb_ok = run_validator(ValidatorType.POSTAL_CODE, "SW1A 1AA", {"country": "GB"})
+ assert es_ok is None and es_bad is not None and gb_ok is None
diff --git a/tests/unit/test_webhook_publisher.py b/tests/unit/test_webhook_publisher.py
index aa88f6d..e18eba8 100644
--- a/tests/unit/test_webhook_publisher.py
+++ b/tests/unit/test_webhook_publisher.py
@@ -19,8 +19,12 @@
import pytest
from flydocs.core.services.webhook import WebhookPublisher
-from flydocs.interfaces.dtos.webhook import JobWebhookPayload
-from flydocs.interfaces.enums.job_status import JobStatus
+from flydocs.interfaces.dtos.event import (
+ EVENT_TYPE_EXTRACTION_COMPLETED,
+ EventEnvelope,
+)
+from flydocs.interfaces.dtos.extraction import Extraction
+from flydocs.interfaces.enums.extraction_status import ExtractionStatus
# ---------------------------------------------------------------------------
# In-process webhook receiver
@@ -89,11 +93,15 @@ def receiver():
server.server_close()
-def _payload() -> JobWebhookPayload:
- return JobWebhookPayload(
- job_id="01HEM2ZZ7M0Q800000000",
- status=JobStatus.SUCCEEDED,
+def _payload() -> EventEnvelope:
+ return EventEnvelope(
+ event_type=EVENT_TYPE_EXTRACTION_COMPLETED,
occurred_at=datetime(2026, 5, 14, 12, 0, 0, tzinfo=UTC),
+ extraction=Extraction(
+ id="ext_TEST00000000000000000000000",
+ status=ExtractionStatus.SUCCEEDED,
+ submitted_at=datetime(2026, 5, 14, 11, 59, 0, tzinfo=UTC),
+ ),
metadata={"tenant_id": "acme"},
result=None,
)
@@ -117,8 +125,9 @@ async def test_webhook_delivers_and_signs(receiver) -> None:
assert capture.path == "/webhook"
# The body is the JSON-encoded payload.
body = json.loads(capture.body.decode("utf-8"))
- assert body["job_id"] == "01HEM2ZZ7M0Q800000000"
- assert body["status"] == "SUCCEEDED"
+ assert body["event_type"] == "extraction.completed"
+ assert body["extraction"]["id"] == "ext_TEST00000000000000000000000"
+ assert body["extraction"]["status"] == "succeeded"
# The signature header carries an HMAC-SHA256 of the body.
sig = capture.headers.get("X-Flydocs-Signature", "")
assert sig.startswith("sha256=")
diff --git a/tests/unit/test_worker_concurrency.py b/tests/unit/test_worker_concurrency.py
index 939516d..852358e 100644
--- a/tests/unit/test_worker_concurrency.py
+++ b/tests/unit/test_worker_concurrency.py
@@ -1,8 +1,8 @@
# Copyright 2026 Firefly Software Solutions Inc
-""":class:`JobWorker` + :class:`BboxRefineWorker` -- race-loser behaviour.
+""":class:`ExtractionWorker` + :class:`BboxRefineWorker` -- race-loser behaviour.
The repository-level concurrency contract is exercised in
-``test_extraction_job_repository.py``. These tests verify the workers
+``test_extraction_repository.py``. These tests verify the workers
*react* correctly when their atomic claim returns ``None``:
* No duplicate orchestrator call.
@@ -23,16 +23,16 @@
from flydocs.config import IDPSettings
from flydocs.core.services.workers.bbox_refine_worker import BboxRefineWorker
-from flydocs.core.services.workers.job_worker import JobWorker
-from flydocs.interfaces.enums.job_status import JobStatus
+from flydocs.core.services.workers.job_worker import ExtractionWorker
+from flydocs.interfaces.enums.extraction_status import ExtractionStatus
# --------------------------------------------------------------- shared fixtures
@dataclass
-class _Job:
- id: str = "job-1"
- status: str = JobStatus.QUEUED.value
+class _Ext:
+ id: str = "ext_TEST00000000000000000000001"
+ status: str = ExtractionStatus.QUEUED.value
filename: str = "test.pdf"
schema_json: dict[str, Any] = field(default_factory=dict)
options_json: dict[str, Any] = field(default_factory=dict)
@@ -40,11 +40,17 @@ class _Job:
result_json: dict[str, Any] = field(default_factory=dict)
callback_url: str | None = None
attempts: int = 0
- bbox_refine_status: str | None = None
- bbox_refine_attempts: int = 0
+ post_processing_bbox_status: str | None = None
+ post_processing_bbox_attempts: int = 0
+ post_processing_bbox_started_at: datetime | None = None
+ post_processing_bbox_finished_at: datetime | None = None
+ post_processing_bbox_error_code: str | None = None
+ post_processing_bbox_error_message: str | None = None
+ error_code: str | None = None
+ error_message: str | None = None
started_at: datetime | None = None
finished_at: datetime | None = None
- created_at: datetime = field(default_factory=lambda: datetime.now(UTC))
+ submitted_at: datetime = field(default_factory=lambda: datetime.now(UTC))
class _Repo:
@@ -52,98 +58,129 @@ class _Repo:
def __init__(
self,
- job: _Job,
+ ext: _Ext,
*,
claim_returns_none: bool = False,
finalise_returns_none: bool = False,
) -> None:
- self.job = job
+ self.ext = ext
self.claim_returns_none = claim_returns_none
self.finalise_returns_none = finalise_returns_none
self.calls: list[str] = []
- async def get(self, job_id: str) -> _Job | None:
- return self.job if self.job.id == job_id else None
+ async def get(self, ext_id: str) -> _Ext | None:
+ return self.ext if self.ext.id == ext_id else None
- async def mark_running(self, job_id: str, *, lease_seconds: int) -> _Job | None:
+ async def mark_running(self, ext_id: str, *, lease_seconds: int) -> _Ext | None:
self.calls.append("mark_running")
if self.claim_returns_none:
return None
- self.job.status = JobStatus.RUNNING.value
- self.job.attempts += 1
- return self.job
+ self.ext.status = ExtractionStatus.RUNNING.value
+ self.ext.attempts += 1
+ return self.ext
- async def mark_succeeded(self, job_id: str, *, result: dict[str, Any]) -> _Job | None:
+ async def mark_succeeded(
+ self,
+ ext_id: str,
+ *,
+ result: dict[str, Any],
+ request_bbox_refinement: bool = False,
+ ) -> _Ext | None:
self.calls.append("mark_succeeded")
if self.finalise_returns_none:
return None
- self.job.status = JobStatus.SUCCEEDED.value
- self.job.result_json = result
- return self.job
-
- async def mark_partial_succeeded(self, job_id: str, *, result: dict[str, Any]) -> _Job | None:
- self.calls.append("mark_partial_succeeded")
- if self.finalise_returns_none:
- return None
- self.job.status = JobStatus.PARTIAL_SUCCEEDED.value
- self.job.result_json = result
- return self.job
+ self.ext.status = ExtractionStatus.SUCCEEDED.value
+ self.ext.result_json = result
+ if request_bbox_refinement:
+ self.ext.post_processing_bbox_status = "pending"
+ return self.ext
- async def mark_failed(self, job_id: str, *, code: str, message: str) -> _Job | None:
+ async def mark_failed(self, ext_id: str, *, code: str, message: str) -> _Ext | None:
self.calls.append("mark_failed")
if self.finalise_returns_none:
return None
- self.job.status = JobStatus.FAILED.value
- return self.job
+ self.ext.status = ExtractionStatus.FAILED.value
+ return self.ext
- async def requeue_for_retry(self, job_id: str) -> _Job | None:
+ async def requeue_for_retry(self, ext_id: str) -> _Ext | None:
self.calls.append("requeue_for_retry")
if self.finalise_returns_none:
return None
- self.job.status = JobStatus.QUEUED.value
- return self.job
+ self.ext.status = ExtractionStatus.QUEUED.value
+ return self.ext
- async def update(self, job_id: str, **kwargs: Any) -> _Job | None:
+ async def update(self, ext_id: str, **kwargs: Any) -> _Ext | None:
self.calls.append(f"update:{','.join(sorted(kwargs))}")
for k, v in kwargs.items():
- setattr(self.job, k, v)
- return self.job
+ setattr(self.ext, k, v)
+ return self.ext
# bbox leg
- async def mark_bbox_refining(self, job_id: str, *, lease_seconds: int) -> _Job | None:
- self.calls.append("mark_bbox_refining")
+ async def claim_bbox_refinement(self, ext_id: str, *, lease_seconds: int) -> _Ext | None:
+ self.calls.append("claim_bbox_refinement")
if self.claim_returns_none:
return None
- self.job.status = JobStatus.REFINING_BBOXES.value
- self.job.bbox_refine_attempts += 1
- return self.job
+ self.ext.post_processing_bbox_status = "running"
+ self.ext.post_processing_bbox_attempts += 1
+ return self.ext
- async def mark_bbox_refined(self, job_id: str, *, result: dict[str, Any]) -> _Job | None:
- self.calls.append("mark_bbox_refined")
+ async def complete_bbox_refinement(self, ext_id: str, *, result: dict[str, Any]) -> _Ext | None:
+ self.calls.append("complete_bbox_refinement")
if self.finalise_returns_none:
return None
- self.job.status = JobStatus.SUCCEEDED.value
- return self.job
+ self.ext.post_processing_bbox_status = "succeeded"
+ return self.ext
- async def mark_bbox_refine_failed(self, job_id: str, *, code: str, message: str) -> _Job | None:
- self.calls.append("mark_bbox_refine_failed")
+ async def fail_bbox_refinement(self, ext_id: str, *, code: str, message: str) -> _Ext | None:
+ self.calls.append("fail_bbox_refinement")
if self.finalise_returns_none:
return None
- return self.job
+ self.ext.post_processing_bbox_status = "failed"
+ return self.ext
- async def requeue_bbox_refine(self, job_id: str) -> _Job | None:
- self.calls.append("requeue_bbox_refine")
+ async def requeue_bbox_refinement(self, ext_id: str) -> _Ext | None:
+ self.calls.append("requeue_bbox_refinement")
if self.finalise_returns_none:
return None
- return self.job
-
-
-def _make_job_worker(
+ self.ext.post_processing_bbox_status = "pending"
+ return self.ext
+
+
+def _v1_schema() -> dict[str, Any]:
+ """A minimal v1 schema_json shape the worker reads via _build_request."""
+ return {
+ "intention": "test",
+ "document_types": [
+ {
+ "id": "invoice",
+ "description": "x",
+ "field_groups": [
+ {
+ "name": "g",
+ "fields": [
+ {"name": "f", "description": "y", "type": "string"},
+ ],
+ }
+ ],
+ }
+ ],
+ "files": [
+ {
+ "filename": "a.pdf",
+ "content_base64": "Zm9v",
+ "content_type": "application/pdf",
+ "expected_type": None,
+ }
+ ],
+ }
+
+
+def _make_extraction_worker(
repo: _Repo,
*,
orchestrator_result: Any = None,
orchestrator_raises: Exception | None = None,
-) -> tuple[JobWorker, MagicMock, MagicMock]:
+) -> tuple[ExtractionWorker, MagicMock, MagicMock]:
orchestrator = MagicMock()
if orchestrator_raises is not None:
orchestrator.execute = AsyncMock(side_effect=orchestrator_raises)
@@ -154,7 +191,7 @@ def _make_job_worker(
webhook = MagicMock()
webhook.deliver = AsyncMock()
settings = IDPSettings(job_max_attempts=3)
- worker = JobWorker(
+ worker = ExtractionWorker(
orchestrator=orchestrator,
repository=repo, # type: ignore[arg-type]
event_publisher=publisher,
@@ -164,17 +201,17 @@ def _make_job_worker(
return worker, orchestrator, webhook
-# --------------------------------------------------------------- JobWorker tests
+# --------------------------------------------------------------- ExtractionWorker tests
@pytest.mark.asyncio
-async def test_job_worker_bails_silently_when_claim_returns_none() -> None:
+async def test_extraction_worker_bails_silently_when_claim_returns_none() -> None:
"""A worker that loses the claim race must not run the orchestrator."""
- job = _Job()
- repo = _Repo(job, claim_returns_none=True)
- worker, orchestrator, webhook = _make_job_worker(repo)
+ ext = _Ext()
+ repo = _Repo(ext, claim_returns_none=True)
+ worker, orchestrator, webhook = _make_extraction_worker(repo)
- await worker._process(job.id)
+ await worker._process(ext.id)
assert repo.calls == ["mark_running"]
orchestrator.execute.assert_not_called()
@@ -182,13 +219,17 @@ async def test_job_worker_bails_silently_when_claim_returns_none() -> None:
@pytest.mark.asyncio
-async def test_job_worker_skips_terminal_status_without_calling_claim() -> None:
- """Re-delivered events for SUCCEEDED / CANCELLED / FAILED jobs short-circuit."""
- for terminal in (JobStatus.SUCCEEDED, JobStatus.CANCELLED, JobStatus.FAILED):
- job = _Job(status=terminal.value)
- repo = _Repo(job)
- worker, orchestrator, webhook = _make_job_worker(repo)
- await worker._process(job.id)
+async def test_extraction_worker_skips_terminal_status_without_calling_claim() -> None:
+ """Re-delivered events for succeeded / cancelled / failed extractions short-circuit."""
+ for terminal in (
+ ExtractionStatus.SUCCEEDED,
+ ExtractionStatus.CANCELLED,
+ ExtractionStatus.FAILED,
+ ):
+ ext = _Ext(status=terminal.value)
+ repo = _Repo(ext)
+ worker, orchestrator, webhook = _make_extraction_worker(repo)
+ await worker._process(ext.id)
# No mark_running call, no orchestrator run, no webhook fire.
assert repo.calls == []
orchestrator.execute.assert_not_called()
@@ -196,77 +237,45 @@ async def test_job_worker_skips_terminal_status_without_calling_claim() -> None:
@pytest.mark.asyncio
-async def test_job_worker_skips_webhook_when_finalise_returns_none() -> None:
+async def test_extraction_worker_skips_webhook_when_finalise_returns_none() -> None:
"""If mark_succeeded races and loses, no duplicate webhook fires."""
- from flydocs.interfaces.dtos.extract import ExtractionResult
+ from flydocs.interfaces.dtos.extract import ExtractionResult, PipelineMeta
result = ExtractionResult(
- request_id="00000000-0000-0000-0000-000000000001",
+ id="ext_RESULT0000000000000000000000",
+ files=[],
documents=[],
- model="test",
- latency_ms=1,
+ pipeline=PipelineMeta(model="test", latency_ms=1),
)
- job = _Job(
+ ext = _Ext(
callback_url="http://sink/hook",
- schema_json={
- "intention": "test",
- "docs": [
- {
- "docType": {"documentType": "invoice", "description": "x"},
- "fieldGroups": [
- {
- "fieldGroupName": "g",
- "fieldGroupFields": [
- {"fieldName": "f", "fieldDescription": "y", "fieldType": "string"}
- ],
- }
- ],
- }
- ],
- "documents": [{"filename": "a.pdf", "content_base64": "Zm9v", "content_type": "application/pdf"}],
- },
+ schema_json=_v1_schema(),
)
- repo = _Repo(job, finalise_returns_none=True)
- worker, orchestrator, webhook = _make_job_worker(repo, orchestrator_result=result)
+ repo = _Repo(ext, finalise_returns_none=True)
+ worker, orchestrator, webhook = _make_extraction_worker(repo, orchestrator_result=result)
- await worker._process(job.id)
+ await worker._process(ext.id)
assert "mark_succeeded" in repo.calls
webhook.deliver.assert_not_called()
@pytest.mark.asyncio
-async def test_job_worker_retry_path_uses_atomic_requeue() -> None:
+async def test_extraction_worker_retry_path_uses_atomic_requeue() -> None:
"""A retryable failure goes through requeue_for_retry, not raw update()."""
- job = _Job(
+ ext = _Ext(
callback_url=None,
- schema_json={
- "intention": "test",
- "docs": [
- {
- "docType": {"documentType": "invoice", "description": "x"},
- "fieldGroups": [
- {
- "fieldGroupName": "g",
- "fieldGroupFields": [
- {"fieldName": "f", "fieldDescription": "y", "fieldType": "string"}
- ],
- }
- ],
- }
- ],
- "documents": [{"filename": "a.pdf", "content_base64": "Zm9v", "content_type": "application/pdf"}],
- },
+ schema_json=_v1_schema(),
)
- repo = _Repo(job)
- worker, orchestrator, webhook = _make_job_worker(
+ repo = _Repo(ext)
+ worker, orchestrator, webhook = _make_extraction_worker(
repo, orchestrator_raises=RuntimeError("transient network glitch")
)
- await worker._process(job.id)
+ await worker._process(ext.id)
assert "requeue_for_retry" in repo.calls
- # The legacy 'update(status=QUEUED)' path is gone.
+ # The legacy 'update(status=...)' path is gone.
assert not any(call.startswith("update:") and "status" in call for call in repo.calls)
@@ -295,12 +304,15 @@ def _make_bbox_worker(repo: _Repo) -> tuple[BboxRefineWorker, MagicMock, MagicMo
@pytest.mark.asyncio
async def test_bbox_worker_bails_silently_when_claim_returns_none() -> None:
"""Bbox worker that loses the claim must do absolutely nothing else."""
- job = _Job(status=JobStatus.PARTIAL_SUCCEEDED.value)
- repo = _Repo(job, claim_returns_none=True)
+ ext = _Ext(
+ status=ExtractionStatus.SUCCEEDED.value,
+ post_processing_bbox_status="pending",
+ )
+ repo = _Repo(ext, claim_returns_none=True)
worker, publisher, webhook = _make_bbox_worker(repo)
- await worker._process(job.id)
+ await worker._process(ext.id)
- assert repo.calls == ["mark_bbox_refining"]
+ assert repo.calls == ["claim_bbox_refinement"]
publisher.publish.assert_not_called()
webhook.deliver.assert_not_called()
diff --git a/uv.lock b/uv.lock
index 33b3fc5..09a4ac1 100644
--- a/uv.lock
+++ b/uv.lock
@@ -10,6 +10,9 @@ resolution-markers = [
"python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'",
]
+[manifest]
+overrides = [{ name = "mistralai", specifier = ">=1.0.0,<2.0.0" }]
+
[[package]]
name = "accelerate"
version = "1.13.0"
@@ -1405,7 +1408,7 @@ wheels = [
[[package]]
name = "fireflyframework-agentic"
-version = "26.5.11"
+version = "26.5.21"
source = { editable = "../../fireflyframework/fireflyframework-agentic" }
dependencies = [
{ name = "genai-prices" },
@@ -1439,8 +1442,10 @@ requires-dist = [
{ name = "aiokafka", marker = "extra == 'kafka'", specifier = ">=0.12.0" },
{ name = "asyncpg", marker = "extra == 'postgres'", specifier = ">=0.30.0" },
{ name = "azure-identity", marker = "extra == 'azure'", specifier = ">=1.19" },
+ { name = "azure-identity", marker = "extra == 'corpus-search'", specifier = ">=1.19" },
{ name = "azure-keyvault-secrets", marker = "extra == 'azure'", specifier = ">=4.9" },
{ name = "azure-monitor-opentelemetry-exporter", marker = "extra == 'azure'", specifier = ">=1.0.0b30" },
+ { name = "azure-storage-blob", marker = "extra == 'corpus-search'", specifier = ">=12.20.0" },
{ name = "boto3", marker = "extra == 'bedrock-embeddings'", specifier = ">=1.35.0" },
{ name = "chromadb", marker = "extra == 'vectorstores-chroma'", specifier = ">=0.5.0" },
{ name = "cohere", marker = "extra == 'cohere-embeddings'", specifier = ">=5.0.0" },
@@ -1448,9 +1453,7 @@ requires-dist = [
{ name = "fastapi", marker = "extra == 'rest'", specifier = ">=0.115.0" },
{ name = "fastmcp", marker = "extra == 'mcp'", specifier = ">=3.2" },
{ name = "fireflyframework-agentic", extras = ["kafka", "rabbitmq", "redis"], marker = "extra == 'queues'" },
- { name = "fireflyframework-agentic", extras = ["markitdown"], marker = "extra == 'dev'" },
- { name = "fireflyframework-agentic", extras = ["markitdown", "watch", "openai-embeddings"], marker = "extra == 'rag'" },
- { name = "fireflyframework-agentic", extras = ["rag", "vectorstores-sqlite-vec"], marker = "extra == 'corpus-search'" },
+ { name = "fireflyframework-agentic", extras = ["markitdown", "watch", "openai-embeddings", "vectorstores-sqlite-vec"], marker = "extra == 'corpus-search'" },
{ name = "fireflyframework-agentic", extras = ["rest", "queues", "postgres", "mongodb", "security", "azure", "http", "embeddings", "openai-embeddings", "cohere-embeddings", "google-embeddings", "mistral-embeddings", "voyage-embeddings", "azure-embeddings", "bedrock-embeddings", "ollama-embeddings", "corpus-search", "vectorstores-chroma", "vectorstores-pinecone", "vectorstores-qdrant", "vectorstores-sqlite-vec", "markitdown", "watch"], marker = "extra == 'all'" },
{ name = "genai-prices", specifier = ">=0.0.1" },
{ name = "google-generativeai", marker = "extra == 'google-embeddings'", specifier = ">=0.8.0" },
@@ -1488,7 +1491,6 @@ requires-dist = [
{ name = "ruff", marker = "extra == 'dev'", specifier = ">=0.9.0" },
{ name = "sqlalchemy", marker = "extra == 'postgres'", specifier = ">=2.0.0" },
{ name = "sqlglot", marker = "extra == 'corpus-search'", specifier = ">=26.0.0" },
- { name = "sqlite-vec", marker = "extra == 'dev'", specifier = ">=0.1.6" },
{ name = "sqlite-vec", marker = "extra == 'vectorstores-sqlite-vec'", specifier = ">=0.1.6" },
{ name = "sse-starlette", marker = "extra == 'rest'", specifier = ">=2.0.0" },
{ name = "testcontainers", marker = "extra == 'dev'", specifier = ">=4.10.0" },
@@ -1496,11 +1498,11 @@ requires-dist = [
{ name = "voyageai", marker = "extra == 'voyage-embeddings'", specifier = ">=0.3.0" },
{ name = "watchfiles", marker = "extra == 'watch'", specifier = ">=0.24.0" },
]
-provides-extras = ["rest", "kafka", "rabbitmq", "redis", "queues", "postgres", "mongodb", "security", "azure", "mcp", "embeddings", "openai-embeddings", "cohere-embeddings", "google-embeddings", "mistral-embeddings", "markitdown", "voyage-embeddings", "azure-embeddings", "bedrock-embeddings", "ollama-embeddings", "rag", "corpus-search", "vectorstores-chroma", "vectorstores-sqlite-vec", "vectorstores-pinecone", "vectorstores-qdrant", "watch", "all", "dev"]
+provides-extras = ["rest", "kafka", "rabbitmq", "redis", "queues", "postgres", "mongodb", "security", "azure", "mcp", "embeddings", "openai-embeddings", "cohere-embeddings", "google-embeddings", "mistral-embeddings", "markitdown", "voyage-embeddings", "azure-embeddings", "bedrock-embeddings", "ollama-embeddings", "corpus-search", "vectorstores-chroma", "vectorstores-sqlite-vec", "vectorstores-pinecone", "vectorstores-qdrant", "watch", "all", "dev"]
[[package]]
name = "flydocs"
-version = "26.5.1"
+version = "26.6.0"
source = { editable = "." }
dependencies = [
{ name = "aiosqlite" },
@@ -3933,7 +3935,7 @@ wheels = [
[[package]]
name = "pyfly"
-version = "26.5.4"
+version = "26.5.5"
source = { editable = "../../fireflyframework/fireflyframework-pyfly" }
dependencies = [
{ name = "pydantic" },