InfoSecHack · InfoSecHack · Jun 4, 2026 · Jun 4, 2026
diff --git a/docs/specs/case-sensitive-arn-id-collision-design.md b/docs/specs/case-sensitive-arn-id-collision-design.md
@@ -92,11 +92,11 @@ mistakenly compare v2 and v3 IDs as if they were the same identifier family.
 
 ## Candidate Fix
 
-Introduce a new deterministic ID algorithm version, tentatively:
+Implemented in the v3 migration slice as:
 
 `sha256_null_separated_v3_case_sensitive_provider_ids`
 
-The candidate v3 behavior should use field-aware canonicalization:
+The v3 behavior uses field-aware canonicalization:
 
 - keep provider and structural type fields normalized where they are intended to
   be case-insensitive, such as `provider`, `node_type`, `edge_type`, and
@@ -105,13 +105,19 @@ The candidate v3 behavior should use field-aware canonicalization:
   `provider_id`, `src_provider_id`, and `dst_provider_id`;
 - keep feature canonicalization deterministic and unchanged unless a separate
   review finds a feature-level case collision;
-- review `constraint_id` separately before deciding whether it should stay on
-  the existing canonicalization behavior or move to a field-aware formula.
+- leaves `constraint_id` on the existing canonicalization behavior pending
+  separate review.
 
 The code fix should avoid a broad "never lowercase anything" change. The safer
 boundary is to make each deterministic ID formula choose the canonicalization
 rule for each field it owns.
 
+v3 is not ID-compatible with v2. Cross-version comparisons must not treat
+`node_id` or `edge_id` values as the same identifier family unless a future
+explicit migration/remapping layer is added. ARF-RT, probe overlays,
+observation logs, and `findings_diff` consumers should gate ID comparison on
+matching `id_algorithm` values.
+
 ## Migration and Versioning Plan
 
 1. Add focused regression tests that pin the current collision as design

diff --git a/iamscope/constants.py b/iamscope/constants.py
@@ -13,7 +13,7 @@
 # ---------------------------------------------------------------------------
 # Deterministic ID algorithm version
 # ---------------------------------------------------------------------------
-ID_ALGORITHM: str = "sha256_null_separated_v2"
+ID_ALGORITHM: str = "sha256_null_separated_v3_case_sensitive_provider_ids"
 
 # ---------------------------------------------------------------------------
 # confidence_q mapping (int 0..1000)

diff --git a/iamscope/identity/deterministic_ids.py b/iamscope/identity/deterministic_ids.py
@@ -1,7 +1,9 @@
 """Deterministic ID generation for IAMScope.
 
-Algorithm: sha256_null_separated_v2 (bumped from v1 in v0.2.37)
-- Fields are stripped and lowercased
+Algorithm: sha256_null_separated_v3_case_sensitive_provider_ids
+- Fields are stripped
+- Structural fields are lowercased by the ID formula that owns them
+- Provider-owned identity fields preserve exact case
 - Joined with NULL byte separator (\\x00)
 - SHA-256 hashed
 - Full hex digest (64 chars)
@@ -44,22 +46,49 @@
   produced the edge_ids in a given scenario. Consumers should
   read that field and gate any edge-id comparison on equality.
 
-Pinned formulas (v2):
-- node_id       = canonical_id(provider, node_type, provider_id)
-- edge_id       = canonical_id(edge_type, src_provider_id,
-                                dst_provider_id, region,
-                                features_digest)
+Pinned formulas (v3):
+- node_id       = hash(provider.lower(), node_type.lower(),
+                       case-preserved provider_id)
+- edge_id       = hash(edge_type.lower(),
+                       case-preserved src_provider_id,
+                       case-preserved dst_provider_id,
+                       region.lower(),
+                       case-preserved features_digest)
                   where features_digest is
                   canonical_json_bytes(features).decode("utf-8")
 - constraint_id = canonical_id(provider, constraint_type,
                                 scope_type, scope_id, policy_id,
                                 statement_id)
+                  intentionally remains on the legacy lowercase
+                  canonicalization path pending separate review
 
 `node_id` and `constraint_id` formulas are UNCHANGED between
 v1 and v2 — only `edge_id` gained the features_digest field.
 A pure v1→v2 re-emit on the same inputs produces identical node
 and constraint IDs, and v2-only-different edge IDs for edges
 whose features participated in the v1 collision.
+
+### v2 → v3 migration (case-sensitive provider IDs)
+
+v2 used `canonical_id` for every deterministic ID formula, which
+lowercased every string field before hashing. AWS IAM role and user
+names are case-sensitive, so case-distinct provider IDs such as
+`arn:aws:iam::000000000000:role/CaseRole` and
+`arn:aws:iam::000000000000:role/caserole` collided under `node_id`.
+`edge_id` had the same class of collision for source and destination
+provider IDs.
+
+v3 makes `node_id` and `edge_id` field-aware:
+- structural fields (`provider`, `node_type`, `edge_type`, `region`)
+  are stripped and lowercased;
+- provider-owned identity fields (`provider_id`, `src_provider_id`,
+  `dst_provider_id`) are stripped but preserve case;
+- `features_digest` is stripped but otherwise preserved because it is
+  already canonical JSON bytes decoded as UTF-8.
+
+`constraint_id` intentionally remains on the legacy lowercase
+canonicalization path pending separate constraint-field review.
+v2 and v3 scenarios are NOT node-id- or edge-id-comparable.
 """
 
 import hashlib
@@ -101,10 +130,33 @@ def canonical_id(*fields: str) -> str:
     return hashlib.sha256(canonical.encode("utf-8")).hexdigest()
 
 
+def _validate_id_field(field: str, index: int) -> str:
+    if not isinstance(field, str):
+        raise ValueError(f"canonical_id field {index} must be a string, got {type(field).__name__}: {field!r}")
+    stripped = field.strip()
+    if not stripped:
+        raise ValueError(f"canonical_id field {index} is empty after stripping: {field!r}")
+    return stripped
+
+
+def _hash_processed_fields(fields: list[str]) -> str:
+    canonical = "\x00".join(fields)
+    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()
+
+
+def _structural_field(field: str, index: int) -> str:
+    return _validate_id_field(field, index).lower()
+
+
+def _provider_identity_field(field: str, index: int) -> str:
+    return _validate_id_field(field, index)
+
+
 def node_id(provider: str, node_type: str, provider_id: str) -> str:
     """Compute deterministic node ID.
 
-    Formula: canonical_id(provider, node_type, provider_id)
+    Formula (v3): sha256(provider.lower(), node_type.lower(),
+                         case-preserved provider_id)
 
     Args:
         provider: Provider string (e.g., "aws").
@@ -114,7 +166,13 @@ def node_id(provider: str, node_type: str, provider_id: str) -> str:
     Returns:
         64-character hex string.
     """
-    return canonical_id(provider, node_type, provider_id)
+    return _hash_processed_fields(
+        [
+            _structural_field(provider, 0),
+            _structural_field(node_type, 1),
+            _provider_identity_field(provider_id, 2),
+        ]
+    )
 
 
 def edge_id(
@@ -124,10 +182,12 @@ def edge_id(
     region: str,
     features_digest: str,
 ) -> str:
-    """Compute deterministic edge ID (sha256_null_separated_v2).
+    """Compute deterministic edge ID (sha256_null_separated_v3).
 
-    Formula: canonical_id(edge_type, src_provider_id, dst_provider_id,
-                          region, features_digest)
+    Formula (v3): sha256(edge_type.lower(), case-preserved
+                          src_provider_id, case-preserved
+                          dst_provider_id, region.lower(),
+                          features_digest)
 
     v2 (v0.2.37+) adds `features_digest` as a required fifth field —
     see module docstring for the v1→v2 migration rationale (reviewer
@@ -153,7 +213,15 @@ def edge_id(
     Returns:
         64-character hex string.
     """
-    return canonical_id(edge_type, src_provider_id, dst_provider_id, region, features_digest)
+    return _hash_processed_fields(
+        [
+            _structural_field(edge_type, 0),
+            _provider_identity_field(src_provider_id, 1),
+            _provider_identity_field(dst_provider_id, 2),
+            _structural_field(region, 3),
+            _provider_identity_field(features_digest, 4),
+        ]
+    )
 
 
 def constraint_id(

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -111,7 +111,7 @@ def minimal_metadata() -> ScenarioMetadata:
     return ScenarioMetadata(
         collector="iamscope",
         collector_version="0.2.0",
-        id_algorithm="sha256_null_separated_v2",
+        id_algorithm="sha256_null_separated_v3_case_sensitive_provider_ids",
         org_id="o-testorg",
         accounts_collected=1,
         accounts_skipped=0,

diff --git a/tests/fixtures/demo/complex_replay_subset_passrole_lambda/expected_rows.json b/tests/fixtures/demo/complex_replay_subset_passrole_lambda/expected_rows.json
@@ -2,8 +2,8 @@
   "expected_rows": [
     {
       "expected_generation": "replayed_by_existing_passrole_lambda_reasoner",
-      "finding_id": "bb1a6460e5e4002f2af318a0d5548ef9556a5908066cf8dfc560de86d6a39346",
-      "finding_key": "c0753d2cd79b37bafa93ea4139cdab85460fca677b89620ebb6d86d1e7bc5d87",
+      "finding_id": "62317bed606ab926df95ef1aecbcb1af51b738539bd0be385fed67c8fa3ece9f",
+      "finding_key": "5e00684bab5af59abccb3eb836889e106c9480f90cab77938aecc7f4ebbeda8f",
       "pattern_id": "passrole_lambda",
       "row_id": "subset-row-allowed-passrole-lambda",
       "severity": "high",
@@ -24,7 +24,7 @@
     "no composite benchmark score",
     "no pass/fail benchmark label"
   ],
-  "scenario_hash": "9d8c55160555d53c2864b5adc53922f930ac90215ef7db4f7cd8d5b630a7252f",
+  "scenario_hash": "536d02868ab7c610a800105e94482725142b0f70c28cdc975091d0b384714926",
   "schema_version": "complex_passrole_lambda_replay_subset_expected_rows.v1",
   "static_only_rows": [
     {