Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,6 @@ gemfiles/*.gemfile.lock
# Ignore local Docker files (for dev env customizations)
docker-compose.override.yml
Dockerfile.local

# BTX: cached braintrust-spec downloads (fetched on demand)
/test/btx/.spec-cache/
37 changes: 34 additions & 3 deletions Rakefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,38 @@ require "rake/testtask"
desc "Run tests (optionally with seed: rake test[12345])"
task :test, [:seed] do |t, args|
seed_opt = args[:seed] ? " -- --seed=#{args[:seed]}" : ""
sh "ruby -Ilib:test -e \"Dir.glob('test/**/*_test.rb').each { |f| require_relative f }\"#{seed_opt}"
# Exclude the BTX cross-language spec suite — it requires provider gems and
# is run separately via `rake test:btx` (under the contrib appraisal).
sh "ruby -Ilib:test -e \"Dir.glob('test/**/*_test.rb').reject { |f| f.start_with?('test/btx/') }.each { |f| require_relative f }\"#{seed_opt}"
end

namespace :test do
# BTX: cross-language LLM-span spec suite.
#
# Requires the openai + anthropic gems, so it runs under the `contrib`
# appraisal. Use `rake test:btx` while already inside a gemfile that has the
# provider gems (e.g. `bundle exec appraisal contrib rake test:btx`), or
# `rake test:btx:ci` which selects the contrib appraisal for you.
#
# Tasks defined here:
#   test:btx:fetch_spec - populate the local spec cache
#   test:btx:ci         - install the contrib gemfile, then run test:btx under it
#   test:btx            - run the suite in the current bundle (after fetch_spec)
namespace :btx do
desc "Fetch the pinned braintrust-spec into the local cache (idempotent)"
task :fetch_spec do
# Run the fetch in a clean process before WebMock is loaded so the GitHub
# download is not blocked by the test suite's HTTP stubbing.
# -Itest/btx puts test/btx on the load path so `spec_fetcher` can be required.
# NOTE(review): presumably resolving `spec_root` is what triggers the
# download — confirm in spec_fetcher.
sh "ruby -Itest/btx -e \"require 'spec_fetcher'; puts Braintrust::BTX::SpecFetcher.spec_root\""
end

desc "Run the BTX suite under the contrib appraisal (used by `rake ci`)"
task :ci do
# Ensure the contrib gemfile (openai + anthropic) is installed, then run.
sh "bundle exec appraisal contrib bundle install --quiet"
# Re-enters rake inside the contrib appraisal and runs the test:btx task.
sh "bundle exec appraisal contrib rake test:btx"
end
end

desc "Run the BTX cross-language LLM-span spec suite (run under the contrib appraisal)"
# fetch_spec is declared as a prerequisite, so it runs (in its own ruby
# process) before the suite itself is loaded.
task btx: :"btx:fetch_spec" do
# Loads only the BTX entry point, with lib/ and test/ on the load path.
sh "ruby -Ilib:test -e \"require_relative 'test/btx/btx_test.rb'\""
end
end

desc "Run Standard linter"
Expand Down Expand Up @@ -91,8 +122,8 @@ task coverage: :test do
end
end

desc "Verify CI (lint + test all appraisal scenarios)"
task ci: [:lint, :"test:appraisal"]
desc "Verify CI (lint + test all appraisal scenarios + btx spec suite)"
task ci: [:lint, :"test:appraisal", :"test:btx:ci"]

task default: :ci

Expand Down
47 changes: 45 additions & 2 deletions lib/braintrust/contrib/anthropic/instrumentation/common.rb
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,23 @@ def self.parse_usage_tokens(usage)
metrics[target] = value.to_i if target
end

# Accumulate cache tokens into prompt_tokens (matching TS/Python SDKs)
# Cache-creation breakdown. When Anthropic returns the per-TTL
# `cache_creation` breakdown, report the granular metrics
# (prompt_cache_creation_5m_tokens / _1h_tokens) and drop the
# aggregate prompt_cache_creation_tokens — the aggregate is just the
# sum of the variants, so reporting both would double count.
cache_creation_total = metrics["prompt_cache_creation_tokens"]
apply_cache_creation_breakdown(metrics, usage_hash)

# Accumulate cache tokens into prompt_tokens (matching TS/Python SDKs).
# Use the original aggregate total when present, otherwise the
# granular breakdown sum.
creation_for_prompt = cache_creation_total ||
(metrics["prompt_cache_creation_5m_tokens"] || 0) +
(metrics["prompt_cache_creation_1h_tokens"] || 0)
prompt_tokens = (metrics["prompt_tokens"] || 0) +
(metrics["prompt_cached_tokens"] || 0) +
(metrics["prompt_cache_creation_tokens"] || 0)
creation_for_prompt
metrics["prompt_tokens"] = prompt_tokens if prompt_tokens > 0

# Calculate total
Expand All @@ -46,6 +59,36 @@ def self.parse_usage_tokens(usage)

metrics
end

# Promote Anthropic's nested per-TTL `cache_creation` usage breakdown to
# granular metrics and drop the aggregate they replace.
#
# The breakdown is looked up under either a String or Symbol key and is
# converted with `to_h` when it responds to it. For each known TTL variant
# that is present with a Numeric value (a zero counts), the integer value
# is stored under the matching `prompt_cache_creation_*_tokens` metric.
# If at least one variant was stored, `prompt_cache_creation_tokens` is
# removed so the aggregate and its parts are not both reported (spec:
# "anthropic cache tokens only send 5m or 1h variants").
#
# Leaves `metrics` untouched when the breakdown is missing, is not
# hash-like, or holds no Numeric value for any known variant.
#
# @param metrics [Hash] metrics accumulated so far (mutated in place)
# @param usage_hash [Hash] raw Anthropic usage hash
# @return [Object, nil] the removed aggregate value, or nil when nothing
#   was removed
def self.apply_cache_creation_breakdown(metrics, usage_hash)
  raw = usage_hash["cache_creation"] || usage_hash[:cache_creation]
  per_ttl = raw.respond_to?(:to_h) ? raw.to_h : raw
  return unless per_ttl.is_a?(Hash)

  wrote_variant = false
  {
    "ephemeral_5m_input_tokens" => "prompt_cache_creation_5m_tokens",
    "ephemeral_1h_input_tokens" => "prompt_cache_creation_1h_tokens"
  }.each_pair do |usage_key, metric_name|
    # Accept the variant under either its String or its Symbol spelling.
    lookup_keys = [usage_key, usage_key.to_sym]
    next if lookup_keys.none? { |key| per_ttl.key?(key) }

    count = per_ttl[usage_key] || per_ttl[usage_key.to_sym]
    next unless count.is_a?(Numeric)

    metrics[metric_name] = count.to_i
    wrote_variant = true
  end

  # Only the granular variants are reported once any of them was written.
  metrics.delete("prompt_cache_creation_tokens") if wrote_variant
end
end
end
end
Expand Down
23 changes: 14 additions & 9 deletions lib/braintrust/contrib/anthropic/instrumentation/messages.rb
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def create(**params)

tracer.in_span("anthropic.messages.create") do |span|
metadata = build_metadata(params)
Support::OTel.set_json_attr(span, "braintrust.span_attributes", {type: "llm"})
set_input(span, params)

response = nil
Expand Down Expand Up @@ -98,6 +99,13 @@ def build_metadata(params, stream: false)
def set_input(span, params)
input_messages = []

# User/assistant messages come first, then the system prompt is
# appended (matching the cross-language spec / backend format).
if params[:messages]
messages_array = params[:messages].map(&:to_h)
input_messages.concat(messages_array)
end

if params[:system_]
system_content = params[:system_]
if system_content.is_a?(Array)
Expand All @@ -110,22 +118,17 @@ def set_input(span, params)
end
end

if params[:messages]
messages_array = params[:messages].map(&:to_h)
input_messages.concat(messages_array)
end

Support::OTel.set_json_attr(span, "braintrust.input_json", input_messages) if input_messages.any?
end

def set_output(span, response)
return unless response.respond_to?(:content) && response.content

content_array = response.content.map(&:to_h)
output = [{
output = {
role: response.respond_to?(:role) ? response.role : "assistant",
content: content_array
}]
}
Support::OTel.set_json_attr(span, "braintrust.output_json", output)
end

Expand Down Expand Up @@ -196,7 +199,8 @@ def close
metadata = ctx[:metadata]
messages_instance = ctx[:messages_instance]

tracer.in_span("anthropic.messages.create") do |span|
tracer.in_span("anthropic.messages.stream") do |span|
Support::OTel.set_json_attr(span, "braintrust.span_attributes", {type: "llm"})
messages_instance.send(:set_input, span, params)
Support::OTel.set_json_attr(span, "braintrust.metadata", metadata)
end
Expand All @@ -215,7 +219,8 @@ def trace_consumption(ctx)
metadata = ctx[:metadata]
messages_instance = ctx[:messages_instance]

tracer.in_span("anthropic.messages.create") do |span|
tracer.in_span("anthropic.messages.stream") do |span|
Support::OTel.set_json_attr(span, "braintrust.span_attributes", {type: "llm"})
messages_instance.send(:set_input, span, params)
Support::OTel.set_json_attr(span, "braintrust.metadata", metadata)

Expand Down
3 changes: 3 additions & 0 deletions lib/braintrust/contrib/openai/instrumentation/chat.rb
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def create(**params)
tracer.in_span("Chat Completion") do |span|
metadata = build_metadata(params)

Support::OTel.set_json_attr(span, "braintrust.span_attributes", {type: "llm"})
set_input(span, params)

response = nil
Expand Down Expand Up @@ -180,6 +181,7 @@ def trace_consumption(ctx)
start_time = Braintrust::Internal::Time.measure

tracer.in_span("Chat Completion") do |span|
Support::OTel.set_json_attr(span, "braintrust.span_attributes", {type: "llm"})
completions_instance.send(:set_input, span, params)
Support::OTel.set_json_attr(span, "braintrust.metadata", metadata)

Expand Down Expand Up @@ -252,6 +254,7 @@ def each(&block)
time_to_first_token = nil

tracer.in_span("Chat Completion") do |span|
Support::OTel.set_json_attr(span, "braintrust.span_attributes", {type: "llm"})
completions_instance.send(:set_input, span, params)
Support::OTel.set_json_attr(span, "braintrust.metadata", metadata)

Expand Down
2 changes: 2 additions & 0 deletions lib/braintrust/contrib/openai/instrumentation/responses.rb
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def create(**params)
tracer.in_span("openai.responses.create") do |span|
metadata = build_metadata(params)

Support::OTel.set_json_attr(span, "braintrust.span_attributes", {type: "llm"})
set_input(span, params)

response = nil
Expand Down Expand Up @@ -140,6 +141,7 @@ def each(&block)
time_to_first_token = nil

tracer.in_span("openai.responses.create") do |span|
Support::OTel.set_json_attr(span, "braintrust.span_attributes", {type: "llm"})
responses_instance.send(:set_input, span, params)
Support::OTel.set_json_attr(span, "braintrust.metadata", metadata)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ def test_handles_beta_streaming
# Single span created during consumption
span = rig.drain_one

assert_equal "anthropic.messages.create", span.name
assert_equal "anthropic.messages.stream", span.name

# Verify input captured on span
assert span.attributes.key?("braintrust.input_json")
Expand Down
26 changes: 26 additions & 0 deletions test/braintrust/contrib/anthropic/instrumentation/common_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,32 @@ def test_handles_cache_creation_tokens
assert_equal 120, metrics["prompt_tokens"]
end

# Verifies parse_usage_tokens when the usage hash carries both the aggregate
# `cache_creation_input_tokens` and the nested per-TTL `cache_creation` hash.
def test_handles_granular_cache_creation_breakdown
# When Anthropic returns the per-TTL cache_creation breakdown, report the
# granular metrics and drop the aggregate (which would double count).
usage = {
"input_tokens" => 12,
"output_tokens" => 5,
"cache_read_input_tokens" => 0,
"cache_creation_input_tokens" => 1369,
"cache_creation" => {
"ephemeral_5m_input_tokens" => 1369,
"ephemeral_1h_input_tokens" => 0
}
}

metrics = Common.parse_usage_tokens(usage)

# Both TTL variants present in the breakdown are reported (including zero),
# and the aggregate is dropped so the totals are not double counted.
assert_equal 1369, metrics["prompt_cache_creation_5m_tokens"]
assert_equal 0, metrics["prompt_cache_creation_1h_tokens"]
refute metrics.key?("prompt_cache_creation_tokens"), "aggregate dropped when breakdown present"
# prompt_tokens still accumulates the creation tokens: 12 + 0 + 1369
assert_equal 1381, metrics["prompt_tokens"]
# 1386 = 1381 prompt tokens + 5 output tokens.
# NOTE(review): assumes the total is prompt + completion; the total
# calculation is not visible here — confirm in parse_usage_tokens.
assert_equal 1386, metrics["tokens"]
end

def test_handles_object_with_to_h
# SDK returns objects with to_h method
usage_object = Struct.new(:input_tokens, :output_tokens, keyword_init: true)
Expand Down
Loading