braintrustdata · Andrew Kent (realark) · May 29, 2026 · Jun 1, 2026
diff --git a/.gitignore b/.gitignore
@@ -57,3 +57,6 @@ gemfiles/*.gemfile.lock
 # Ignore local Docker files (for dev env customizations)
 docker-compose.override.yml
 Dockerfile.local
+
+# BTX: cached braintrust-spec downloads (fetched on demand)
+/test/btx/.spec-cache/
diff --git a/Rakefile b/Rakefile
@@ -5,7 +5,38 @@ require "rake/testtask"
 desc "Run tests (optionally with seed: rake test[12345])"
 task :test, [:seed] do |t, args|
   seed_opt = args[:seed] ? " -- --seed=#{args[:seed]}" : ""
-  sh "ruby -Ilib:test -e \"Dir.glob('test/**/*_test.rb').each { |f| require_relative f }\"#{seed_opt}"
+  # Exclude the BTX cross-language spec suite — it requires provider gems and
+  # is run separately via `rake test:btx` (under the contrib appraisal).
+  sh "ruby -Ilib:test -e \"Dir.glob('test/**/*_test.rb').reject { |f| f.start_with?('test/btx/') }.each { |f| require_relative f }\"#{seed_opt}"
+end
+
+namespace :test do
+  # BTX: cross-language LLM-span spec suite.
+  #
+  # Requires the openai + anthropic gems, so it runs under the `contrib`
+  # appraisal. Run `rake test:btx` when the active gemfile already includes the
+  # provider gems (e.g. `bundle exec appraisal contrib rake test:btx`), or use
+  # `rake test:btx:ci`, which selects the contrib appraisal for you.
+  namespace :btx do
+    desc "Fetch the pinned braintrust-spec into the local cache (idempotent)"
+    task :fetch_spec do
+      # Run the fetch in a clean process before WebMock is loaded so the GitHub
+      # download is not blocked by the test suite's HTTP stubbing.
+      sh "ruby -Itest/btx -e \"require 'spec_fetcher'; puts Braintrust::BTX::SpecFetcher.spec_root\""
+    end
+
+    desc "Run the BTX suite under the contrib appraisal (used by `rake ci`)"
+    task :ci do
+      # Ensure the contrib gemfile (openai + anthropic) is installed, then run.
+      sh "bundle exec appraisal contrib bundle install --quiet"
+      sh "bundle exec appraisal contrib rake test:btx"
+    end
+  end
+
+  desc "Run the BTX cross-language LLM-span spec suite (run under the contrib appraisal)"
+  task btx: :"btx:fetch_spec" do
+    sh "ruby -Ilib:test -e \"require_relative 'test/btx/btx_test.rb'\""
+  end
 end
 
 desc "Run Standard linter"
@@ -91,8 +122,8 @@ task coverage: :test do
   end
 end
 
-desc "Verify CI (lint + test all appraisal scenarios)"
-task ci: [:lint, :"test:appraisal"]
+desc "Verify CI (lint + test all appraisal scenarios + btx spec suite)"
+task ci: [:lint, :"test:appraisal", :"test:btx:ci"]
 
 task default: :ci
 

@@ -33,10 +33,23 @@ def self.parse_usage_tokens(usage)
               metrics[target] = value.to_i if target
             end
 
-            # Accumulate cache tokens into prompt_tokens (matching TS/Python SDKs)
+            # Cache-creation breakdown. When Anthropic returns the per-TTL
+            # `cache_creation` breakdown, report the granular metrics
+            # (prompt_cache_creation_5m_tokens / _1h_tokens) and drop the
+            # aggregate prompt_cache_creation_tokens — the aggregate is just the
+            # sum of the variants, so reporting both would double count.
+            cache_creation_total = metrics["prompt_cache_creation_tokens"]
+            apply_cache_creation_breakdown(metrics, usage_hash)
+
+            # Accumulate cache tokens into prompt_tokens (matching TS/Python SDKs).
+            # Use the original aggregate total when present, otherwise the
+            # granular breakdown sum.
+            creation_for_prompt = cache_creation_total ||
+              (metrics["prompt_cache_creation_5m_tokens"] || 0) +
+                (metrics["prompt_cache_creation_1h_tokens"] || 0)
             prompt_tokens = (metrics["prompt_tokens"] || 0) +
               (metrics["prompt_cached_tokens"] || 0) +
-              (metrics["prompt_cache_creation_tokens"] || 0)
+              creation_for_prompt
             metrics["prompt_tokens"] = prompt_tokens if prompt_tokens > 0
 
             # Calculate total
@@ -46,6 +59,36 @@ def self.parse_usage_tokens(usage)
 
             metrics
           end
+
+          # Map the nested `cache_creation` breakdown to per-TTL metrics and
+          # remove the now-redundant aggregate. No-op when the breakdown is
+          # absent or has no numeric per-TTL values (zero counts are emitted).
+          # @param metrics [Hash] metrics accumulated so far (mutated)
+          # @param usage_hash [Hash] raw Anthropic usage hash
+          def self.apply_cache_creation_breakdown(metrics, usage_hash)
+            breakdown = usage_hash["cache_creation"] || usage_hash[:cache_creation]
+            breakdown = breakdown.to_h if breakdown.respond_to?(:to_h)
+            return unless breakdown.is_a?(Hash)
+
+            ttl_map = {
+              "ephemeral_5m_input_tokens" => "prompt_cache_creation_5m_tokens",
+              "ephemeral_1h_input_tokens" => "prompt_cache_creation_1h_tokens"
+            }
+
+            emitted = false
+            ttl_map.each do |source, target|
+              next unless breakdown.key?(source) || breakdown.key?(source.to_sym)
+              value = breakdown[source] || breakdown[source.to_sym]
+              next unless value.is_a?(Numeric)
+              metrics[target] = value.to_i
+              emitted = true
+            end
+
+            # When the per-TTL breakdown is present, drop the aggregate so we do
+            # not double count (spec: "anthropic cache tokens only send 5m or
+            # 1h variants").
+            metrics.delete("prompt_cache_creation_tokens") if emitted
+          end
         end
       end
     end

@@ -34,6 +34,7 @@ def create(**params)
 
               tracer.in_span("anthropic.messages.create") do |span|
                 metadata = build_metadata(params)
+                Support::OTel.set_json_attr(span, "braintrust.span_attributes", {type: "llm"})
                 set_input(span, params)
 
                 response = nil
@@ -98,6 +99,13 @@ def build_metadata(params, stream: false)
             def set_input(span, params)
               input_messages = []
 
+              # User/assistant messages come first, then the system prompt is
+              # appended (matching the cross-language spec / backend format).
+              if params[:messages]
+                messages_array = params[:messages].map(&:to_h)
+                input_messages.concat(messages_array)
+              end
+
               if params[:system_]
                 system_content = params[:system_]
                 if system_content.is_a?(Array)
@@ -110,22 +118,17 @@ def set_input(span, params)
                 end
               end
 
-              if params[:messages]
-                messages_array = params[:messages].map(&:to_h)
-                input_messages.concat(messages_array)
-              end
-
               Support::OTel.set_json_attr(span, "braintrust.input_json", input_messages) if input_messages.any?
             end
 
             def set_output(span, response)
               return unless response.respond_to?(:content) && response.content
 
               content_array = response.content.map(&:to_h)
-              output = [{
+              output = {
                 role: response.respond_to?(:role) ? response.role : "assistant",
                 content: content_array
-              }]
+              }
               Support::OTel.set_json_attr(span, "braintrust.output_json", output)
             end
 
@@ -196,7 +199,8 @@ def close
                 metadata = ctx[:metadata]
                 messages_instance = ctx[:messages_instance]
 
-                tracer.in_span("anthropic.messages.create") do |span|
+                tracer.in_span("anthropic.messages.stream") do |span|
+                  Support::OTel.set_json_attr(span, "braintrust.span_attributes", {type: "llm"})
                   messages_instance.send(:set_input, span, params)
                   Support::OTel.set_json_attr(span, "braintrust.metadata", metadata)
                 end
@@ -215,7 +219,8 @@ def trace_consumption(ctx)
               metadata = ctx[:metadata]
               messages_instance = ctx[:messages_instance]
 
-              tracer.in_span("anthropic.messages.create") do |span|
+              tracer.in_span("anthropic.messages.stream") do |span|
+                Support::OTel.set_json_attr(span, "braintrust.span_attributes", {type: "llm"})
                 messages_instance.send(:set_input, span, params)
                 Support::OTel.set_json_attr(span, "braintrust.metadata", metadata)
 

@@ -40,6 +40,7 @@ def create(**params)
                 tracer.in_span("Chat Completion") do |span|
                   metadata = build_metadata(params)
 
+                  Support::OTel.set_json_attr(span, "braintrust.span_attributes", {type: "llm"})
                   set_input(span, params)
 
                   response = nil
@@ -180,6 +181,7 @@ def trace_consumption(ctx)
                 start_time = Braintrust::Internal::Time.measure
 
                 tracer.in_span("Chat Completion") do |span|
+                  Support::OTel.set_json_attr(span, "braintrust.span_attributes", {type: "llm"})
                   completions_instance.send(:set_input, span, params)
                   Support::OTel.set_json_attr(span, "braintrust.metadata", metadata)
 
@@ -252,6 +254,7 @@ def each(&block)
                 time_to_first_token = nil
 
                 tracer.in_span("Chat Completion") do |span|
+                  Support::OTel.set_json_attr(span, "braintrust.span_attributes", {type: "llm"})
                   completions_instance.send(:set_input, span, params)
                   Support::OTel.set_json_attr(span, "braintrust.metadata", metadata)
 

@@ -39,6 +39,7 @@ def create(**params)
               tracer.in_span("openai.responses.create") do |span|
                 metadata = build_metadata(params)
 
+                Support::OTel.set_json_attr(span, "braintrust.span_attributes", {type: "llm"})
                 set_input(span, params)
 
                 response = nil
@@ -140,6 +141,7 @@ def each(&block)
               time_to_first_token = nil
 
               tracer.in_span("openai.responses.create") do |span|
+                Support::OTel.set_json_attr(span, "braintrust.span_attributes", {type: "llm"})
                 responses_instance.send(:set_input, span, params)
                 Support::OTel.set_json_attr(span, "braintrust.metadata", metadata)
 

diff --git a/test/braintrust/contrib/anthropic/instrumentation/beta_messages_test.rb b/test/braintrust/contrib/anthropic/instrumentation/beta_messages_test.rb
@@ -201,7 +201,7 @@ def test_handles_beta_streaming
       # Single span created during consumption
       span = rig.drain_one
 
-      assert_equal "anthropic.messages.create", span.name
+      assert_equal "anthropic.messages.stream", span.name
 
       # Verify input captured on span
       assert span.attributes.key?("braintrust.input_json")

diff --git a/test/braintrust/contrib/anthropic/instrumentation/common_test.rb b/test/braintrust/contrib/anthropic/instrumentation/common_test.rb
@@ -57,6 +57,32 @@ def test_handles_cache_creation_tokens
     assert_equal 120, metrics["prompt_tokens"]
   end
 
+  def test_handles_granular_cache_creation_breakdown
+    # When Anthropic returns the per-TTL cache_creation breakdown, report the
+    # granular metrics and drop the aggregate (which would double count).
+    usage = {
+      "input_tokens" => 12,
+      "output_tokens" => 5,
+      "cache_read_input_tokens" => 0,
+      "cache_creation_input_tokens" => 1369,
+      "cache_creation" => {
+        "ephemeral_5m_input_tokens" => 1369,
+        "ephemeral_1h_input_tokens" => 0
+      }
+    }
+
+    metrics = Common.parse_usage_tokens(usage)
+
+    # Both TTL variants present in the breakdown are reported (including zero),
+    # and the aggregate is dropped so the totals are not double counted.
+    assert_equal 1369, metrics["prompt_cache_creation_5m_tokens"]
+    assert_equal 0, metrics["prompt_cache_creation_1h_tokens"]
+    refute metrics.key?("prompt_cache_creation_tokens"), "aggregate dropped when breakdown present"
+    # prompt_tokens still accumulates the creation tokens: 12 + 0 + 1369
+    assert_equal 1381, metrics["prompt_tokens"]
+    assert_equal 1386, metrics["tokens"]
+  end
+
   def test_handles_object_with_to_h
     # SDK returns objects with to_h method
     usage_object = Struct.new(:input_tokens, :output_tokens, keyword_init: true)