119 changes: 119 additions & 0 deletions src/bosh-director/lib/bosh/director/metrics_collector.rb
@@ -42,6 +42,33 @@ def initialize(config)
labels: %i[name],
docstring: 'Number of unresponsive agents per deployment',
)

@unhealthy_agents = Prometheus::Client.registry.gauge(
:bosh_unhealthy_agents,
labels: %i[name],
docstring: 'Number of unhealthy agents (job_state == running AND number_of_processes == 0) per deployment',
)
@total_available_agents = Prometheus::Client.registry.gauge(
:bosh_total_available_agents,
labels: %i[name],
docstring: 'Number of total available agents (all agents, no criteria) per deployment',
)
@failing_instances = Prometheus::Client.registry.gauge(
:bosh_failing_instances,
labels: %i[name],
docstring: 'Number of failing instances (job_state == "failing") per deployment',
)
@stopped_instances = Prometheus::Client.registry.gauge(
:bosh_stopped_instances,
labels: %i[name],
docstring: 'Number of instances (job_state == "stopped") per deployment',
)

@unknown_instances = Prometheus::Client.registry.gauge(
:bosh_unknown_instances,
labels: %i[name],
docstring: 'Number of instances with unknown job_state per deployment',
)
@scheduler = Rufus::Scheduler.new
end

@@ -136,6 +163,98 @@ def populate_vm_metrics
removed_deployments.each do |deployment|
@unresponsive_agents.set(0, labels: { name: deployment })
end

# Fetch and populate unhealthy_agents metrics
response_unhealthy = Net::HTTP.get_response("127.0.0.1", "/unhealthy_agents", @config.health_monitor_port)
Copilot AI (Dec 18, 2025):
Inconsistent quote style: this line uses double quotes while line 188 below uses single quotes. Consider using a consistent quote style throughout the method.

Suggested change:
-        response_unhealthy = Net::HTTP.get_response("127.0.0.1", "/unhealthy_agents", @config.health_monitor_port)
+        response_unhealthy = Net::HTTP.get_response('127.0.0.1', '/unhealthy_agents', @config.health_monitor_port)
return unless response_unhealthy.is_a?(Net::HTTPSuccess)

unhealthy_agent_counts = JSON.parse(response_unhealthy.body)
return unless unhealthy_agent_counts.is_a?(Hash)

existing_unhealthy_deployment_names = @unhealthy_agents.values.map do |key, _|
key[:name]
end

unhealthy_agent_counts.each do |deployment, count|
@unhealthy_agents.set(count, labels: { name: deployment })
end

removed_unhealthy_deployments = existing_unhealthy_deployment_names - unhealthy_agent_counts.keys
removed_unhealthy_deployments.each do |deployment|
@unhealthy_agents.set(0, labels: { name: deployment })
end

# Fetch and populate total_available_agents metrics
response_total = Net::HTTP.get_response('127.0.0.1', '/total_available_agents', @config.health_monitor_port)
if response_total.is_a?(Net::HTTPSuccess)
total_agent_counts = JSON.parse(response_total.body) rescue nil
Copilot AI (Dec 18, 2025):
The rescue nil pattern silently swallows all exceptions during JSON parsing. If the parsing fails for unexpected reasons (not just malformed JSON), this could hide bugs. Consider using a more specific rescue clause like rescue JSON::ParserError or at least logging the error when it occurs.

Suggested change:
-          total_agent_counts = JSON.parse(response_total.body) rescue nil
+          total_agent_counts = begin
+            JSON.parse(response_total.body)
+          rescue JSON::ParserError => e
+            @logger.warn("Failed to parse total_available_agents response as JSON: #{e.message}") if @logger
+            nil
+          end
if total_agent_counts.is_a?(Hash)
existing_total_deployment_names = @total_available_agents.values.map { |key, _| key[:name] }

total_agent_counts.each do |deployment, count|
@total_available_agents.set(count, labels: { name: deployment })
end

removed_total_deployments = existing_total_deployment_names - total_agent_counts.keys
removed_total_deployments.each do |deployment|
@total_available_agents.set(0, labels: { name: deployment })
end
end
end

# Fetch and populate failing_instances metrics
response_failing = Net::HTTP.get_response('127.0.0.1', '/failing_instances', @config.health_monitor_port)
if response_failing.is_a?(Net::HTTPSuccess)
failing_counts = JSON.parse(response_failing.body) rescue nil
Copilot AI (Dec 18, 2025):
The rescue nil pattern silently swallows all exceptions during JSON parsing. Consider using a more specific rescue clause like rescue JSON::ParserError for consistency with error handling best practices.
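Since the same parse appears in every block below, a tiny shared parser is one way to act on this comment. This is a sketch only, not part of the PR; the helper name, the endpoint argument, and the logger keyword are illustrative:

    require 'json'

    # Sketch: rescue only JSON::ParserError so unexpected failures still surface,
    # and log the malformed body instead of silently returning nil.
    def parse_agent_counts(body, endpoint, logger: nil)
      JSON.parse(body)
    rescue JSON::ParserError => e
      logger&.warn("Failed to parse #{endpoint} response as JSON: #{e.message}")
      nil
    end

    # Illustrative usage inside populate_vm_metrics:
    # failing_counts = parse_agent_counts(response_failing.body, '/failing_instances', logger: @logger)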
if failing_counts.is_a?(Hash)
existing_failing_deployment_names = @failing_instances.values.map { |key, _| key[:name] }

failing_counts.each do |deployment, count|
@failing_instances.set(count, labels: { name: deployment })
end

removed_failing_deployments = existing_failing_deployment_names - failing_counts.keys
removed_failing_deployments.each do |deployment|
@failing_instances.set(0, labels: { name: deployment })
end
end
end

Comment on lines +189 to +222
Copilot AI (Dec 18, 2025):
The error handling pattern differs from the unresponsive_agents block above. The first block uses early returns after each validation check, while the new metrics blocks wrap all logic inside nested if statements. Consider using the same early-return pattern for consistency and better readability.

Suggested change (rewriting the total_available_agents and failing_instances blocks above in the early-return style):
      return unless response_total.is_a?(Net::HTTPSuccess)
      total_agent_counts = JSON.parse(response_total.body) rescue nil
      return unless total_agent_counts.is_a?(Hash)
      existing_total_deployment_names = @total_available_agents.values.map { |key, _| key[:name] }
      total_agent_counts.each do |deployment, count|
        @total_available_agents.set(count, labels: { name: deployment })
      end
      removed_total_deployments = existing_total_deployment_names - total_agent_counts.keys
      removed_total_deployments.each do |deployment|
        @total_available_agents.set(0, labels: { name: deployment })
      end

      # Fetch and populate failing_instances metrics
      response_failing = Net::HTTP.get_response('127.0.0.1', '/failing_instances', @config.health_monitor_port)
      return unless response_failing.is_a?(Net::HTTPSuccess)
      failing_counts = JSON.parse(response_failing.body) rescue nil
      return unless failing_counts.is_a?(Hash)
      existing_failing_deployment_names = @failing_instances.values.map { |key, _| key[:name] }
      failing_counts.each do |deployment, count|
        @failing_instances.set(count, labels: { name: deployment })
      end
      removed_failing_deployments = existing_failing_deployment_names - failing_counts.keys
      removed_failing_deployments.each do |deployment|
        @failing_instances.set(0, labels: { name: deployment })
      end
# Fetch and populate stopped_instances metrics
response_stopped = Net::HTTP.get_response('127.0.0.1', '/stopped_instances', @config.health_monitor_port)
if response_stopped.is_a?(Net::HTTPSuccess)
stopped_counts = JSON.parse(response_stopped.body) rescue nil
Copilot AI (Dec 18, 2025):
The rescue nil pattern silently swallows all exceptions during JSON parsing. Consider using a more specific rescue clause like rescue JSON::ParserError for consistency with error handling best practices.
if stopped_counts.is_a?(Hash)
existing_stopped_deployment_names = @stopped_instances.values.map { |key, _| key[:name] }

stopped_counts.each do |deployment, count|
@stopped_instances.set(count, labels: { name: deployment })
end

removed_stopped_deployments = existing_stopped_deployment_names - stopped_counts.keys
removed_stopped_deployments.each do |deployment|
@stopped_instances.set(0, labels: { name: deployment })
end
end
end

# Fetch and populate unknown_instances metrics
response_unknown = Net::HTTP.get_response('127.0.0.1', '/unknown_instances', @config.health_monitor_port)
if response_unknown.is_a?(Net::HTTPSuccess)
unknown_counts = JSON.parse(response_unknown.body) rescue nil
Copilot AI (Dec 18, 2025):
The rescue nil pattern silently swallows all exceptions during JSON parsing. Consider using a more specific rescue clause like rescue JSON::ParserError for consistency with error handling best practices.
if unknown_counts.is_a?(Hash)
existing_unknown_deployment_names = @unknown_instances.values.map { |key, _| key[:name] }

unknown_counts.each do |deployment, count|
@unknown_instances.set(count, labels: { name: deployment })
end

removed_unknown_deployments = existing_unknown_deployment_names - unknown_counts.keys
removed_unknown_deployments.each do |deployment|
@unknown_instances.set(0, labels: { name: deployment })
end
end
Comment on lines +225 to +256
Copilot AI (Dec 18, 2025):
All five new metric collection blocks follow a nearly identical pattern with only minor differences (endpoint name, metric variable, method call). This represents significant code duplication. Consider extracting this into a helper method that takes the endpoint name and metric variable as parameters to improve maintainability.

Suggested change (the stopped_instances and unknown_instances blocks above would collapse into calls to a shared helper):
        populate_instance_metrics('/stopped_instances', @stopped_instances)
        populate_instance_metrics('/unknown_instances', @unknown_instances)
      end

      def populate_instance_metrics(endpoint, metric)
        response = Net::HTTP.get_response('127.0.0.1', endpoint, @config.health_monitor_port)
        return unless response.is_a?(Net::HTTPSuccess)
        counts = JSON.parse(response.body) rescue nil
        return unless counts.is_a?(Hash)
        existing_deployment_names = metric.values.map { |key, _| key[:name] }
        counts.each do |deployment, count|
          metric.set(count, labels: { name: deployment })
        end
        removed_deployments = existing_deployment_names - counts.keys
        removed_deployments.each do |deployment|
          metric.set(0, labels: { name: deployment })
        end
      end
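Not part of the PR or of the suggestion above, but for reference, a minimal sketch of how populate_vm_metrics could route all five fetches through such a helper; the endpoint-to-gauge pairs come from the diff, everything else is illustrative:

      # Sketch only: drive every health-monitor count endpoint through one helper.
      {
        '/unhealthy_agents' => @unhealthy_agents,
        '/total_available_agents' => @total_available_agents,
        '/failing_instances' => @failing_instances,
        '/stopped_instances' => @stopped_instances,
        '/unknown_instances' => @unknown_instances,
      }.each do |endpoint, gauge|
        populate_instance_metrics(endpoint, gauge)
      end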
end
end

def populate_network_metrics
@@ -34,7 +34,17 @@ def tick
allow(resurrector_manager).to receive(:pause_for_all?).and_return(false, true, false)
allow(Api::ConfigManager).to receive(:deploy_config_enabled?).and_return(true, false)
stub_request(:get, /unresponsive_agents/)
.to_return(status: 200, body: JSON.dump('flaky_deployment' => 1, 'good_deployment' => 0))
.to_return(status: 200, body: JSON.dump("flaky_deployment" => 1, "good_deployment" => 0))
stub_request(:get, /unhealthy_agents/)
.to_return(status: 200, body: JSON.dump("flaky_deployment" => 2, "good_deployment" => 0))
stub_request(:get, /total_available_agents/)
.to_return(status: 200, body: JSON.dump("flaky_deployment" => 3, "good_deployment" => 2))
stub_request(:get, /failing_instances/)
.to_return(status: 200, body: JSON.dump("flaky_deployment" => 1, "good_deployment" => 0))
stub_request(:get, /stopped_instances/)
.to_return(status: 200, body: JSON.dump("flaky_deployment" => 0, "good_deployment" => 0))
stub_request(:get, /unknown_instances/)
.to_return(status: 200, body: JSON.dump("flaky_deployment" => 0, "good_deployment" => 1))
Comment on lines +40 to +47
Copilot AI (Dec 18, 2025):
Inconsistent indentation: these stub_request calls are indented with extra spaces (10 spaces total) while the previous stub_request calls use 6 spaces. All stub_request calls at this level should use consistent indentation.
end

after do
@@ -44,6 +54,11 @@ def tick
Prometheus::Client.registry.unregister(:bosh_networks_dynamic_ips_total)
Prometheus::Client.registry.unregister(:bosh_networks_dynamic_free_ips_total)
Prometheus::Client.registry.unregister(:bosh_unresponsive_agents)
Prometheus::Client.registry.unregister(:bosh_unhealthy_agents)
Prometheus::Client.registry.unregister(:bosh_total_available_agents)
Prometheus::Client.registry.unregister(:bosh_failing_instances)
Prometheus::Client.registry.unregister(:bosh_stopped_instances)
Prometheus::Client.registry.unregister(:bosh_unknown_instances)
end

describe '#prep' do
@@ -293,46 +308,89 @@ def tick
expect(metric.get(labels: { name: 'good_deployment' })).to eq(0)
end

context 'when the health monitor returns a non 200 response' do
it "emits the number of unhealthy agents for each deployment" do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unhealthy_agents)
expect(metric.get(labels: { name: "flaky_deployment" })).to eq(2)
expect(metric.get(labels: { name: "good_deployment" })).to eq(0)
end

it 'emits the number of total available agents for each deployment' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_total_available_agents)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(3)
expect(metric.get(labels: { name: 'good_deployment' })).to eq(2)
end

it 'emits the failing/stopped/unknown instance metrics for each deployment' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_failing_instances)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(1)
expect(metric.get(labels: { name: 'good_deployment' })).to eq(0)

metric = Prometheus::Client.registry.get(:bosh_stopped_instances)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(0)
expect(metric.get(labels: { name: 'good_deployment' })).to eq(0)

metric = Prometheus::Client.registry.get(:bosh_unknown_instances)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(0)
expect(metric.get(labels: { name: 'good_deployment' })).to eq(1)
end

context "when the health monitor returns a non 200 response" do
before do
stub_request(:get, '127.0.0.1:12345/unresponsive_agents')
stub_request(:get, "127.0.0.1:12345/unresponsive_agents")
.to_return(status: 404)
stub_request(:get, "127.0.0.1:12345/unhealthy_agents")
.to_return(status: 404)
end

it 'does not emit the vm metrics' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.values).to be_empty
metric = Prometheus::Client.registry.get(:bosh_unhealthy_agents)
expect(metric.values).to be_empty
end
end

context 'when the health monitor returns a non-json response' do
before do
stub_request(:get, '127.0.0.1:12345/unresponsive_agents')
.to_return(status: 200, body: JSON.dump('bad response'))
stub_request(:get, "127.0.0.1:12345/unresponsive_agents")
.to_return(status: 200, body: JSON.dump("bad response"))
stub_request(:get, "127.0.0.1:12345/unhealthy_agents")
.to_return(status: 200, body: JSON.dump("bad response"))
end

it 'does not emit the vm metrics' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.values).to be_empty
metric = Prometheus::Client.registry.get(:bosh_unhealthy_agents)
expect(metric.values).to be_empty
end
end

context 'when a deployment is deleted after metrics are gathered' do
before do
stub_request(:get, /unresponsive_agents/)
.to_return(status: 200, body: JSON.dump('flaky_deployment' => 1, 'good_deployment' => 0))
.to_return(status: 200, body: JSON.dump("flaky_deployment" => 1, "good_deployment" => 0))
stub_request(:get, /unhealthy_agents/)
.to_return(status: 200, body: JSON.dump("flaky_deployment" => 2, "good_deployment" => 0))
metrics_collector.start

stub_request(:get, /unresponsive_agents/)
.to_return(status: 200, body: JSON.dump('good_deployment' => 0))
.to_return(status: 200, body: JSON.dump("good_deployment" => 0))
stub_request(:get, /unhealthy_agents/)
.to_return(status: 200, body: JSON.dump("good_deployment" => 0))
scheduler.tick
end

it 'resets the metrics for the deleted deployment' do
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(0)
expect(metric.get(labels: { name: "flaky_deployment" })).to eq(0)
metric = Prometheus::Client.registry.get(:bosh_unhealthy_agents)
expect(metric.get(labels: { name: "flaky_deployment" })).to eq(0)
end
end
end
2 changes: 2 additions & 0 deletions src/bosh-monitor/lib/bosh/monitor/agent.rb
@@ -3,6 +3,8 @@ class Agent
attr_reader :id
attr_reader :discovered_at
attr_accessor :updated_at
attr_accessor :job_state
attr_accessor :number_of_processes

ATTRIBUTES = %i[deployment job index instance_id cid].freeze

40 changes: 40 additions & 0 deletions src/bosh-monitor/lib/bosh/monitor/api_controller.rb
@@ -40,5 +40,45 @@ def initialize(heartbeat_interval = 1)
status(503)
end
end

get "/unhealthy_agents" do
Copilot AI (Dec 18, 2025):
Inconsistent quote style in the path parameter: uses double quotes while the previous unresponsive_agents endpoint uses single quotes. Consider using single quotes consistently as seen throughout the rest of the file.

Suggested change:
-    get "/unhealthy_agents" do
+    get '/unhealthy_agents' do
if @instance_manager.director_initial_deployment_sync_done
JSON.generate(@instance_manager.unhealthy_agents)
else
status(503)
end
end

get '/total_available_agents' do
if @instance_manager.director_initial_deployment_sync_done
JSON.generate(@instance_manager.total_available_agents)
else
status(503)
end
end

get '/failing_instances' do
if @instance_manager.director_initial_deployment_sync_done
JSON.generate(@instance_manager.failing_instances)
else
status(503)
end
end

get '/stopped_instances' do
if @instance_manager.director_initial_deployment_sync_done
JSON.generate(@instance_manager.stopped_instances)
else
status(503)
end
end

get '/unknown_instances' do
if @instance_manager.director_initial_deployment_sync_done
JSON.generate(@instance_manager.unknown_instances)
else
status(503)
end
end
end
end
6 changes: 5 additions & 1 deletion src/bosh-monitor/lib/bosh/monitor/events/heartbeat.rb
@@ -65,7 +65,7 @@ def to_s
end

def to_hash
{
result = {
kind: 'heartbeat',
id: @id,
timestamp: @timestamp.to_i,
@@ -79,6 +79,10 @@ def to_hash
teams: @teams,
metrics: @metrics.map(&:to_hash),
}
# Include number_of_processes if present in attributes
result[:number_of_processes] = @attributes["number_of_processes"] if @attributes.key?("number_of_processes")

result
end

def to_json(*_args)