Problem
When editing metrics in datadog UI (i.e. /metrics/summary) a warning is shown when editing an in-use metric (i.e. a dashboard or monitor uses it). But if that metric is used by a Kubernetes HorizontalPodAutoscaler, no such warning will show.
Solution
Generate a dashboard that uses 1 widget for every query an HPA uses.
require 'kennel'
# Maintains a Datadog dashboard that holds one timeseries widget per metric
# queried by a Kubernetes HorizontalPodAutoscaler, so that those metrics count
# as "in use" when somebody edits them in the Datadog UI.
class HpaDashboard
  # Metric whose series are grouped by their `metric` tag to discover the queries HPAs use.
  SOURCE_METRIC = "datadog.cluster_agent.external_metrics.delay_seconds".freeze

  attr_reader :id

  # @param id dashboard id, handed to the API when updating
  # @param timeframe [Integer] how many seconds to look back from now
  def initialize(id, timeframe:)
    @id = id
    @api = Kennel::Api.new
    @from = Time.now.to_i - timeframe # unix timestamp marking the start of the window
  end

  # see https://docs.datadoghq.com/api/latest/metrics/#get-active-metrics-list
  # this has an undocumented limit of 250000 metrics so we can't just use super old @from
  # also tried /api/v2/metrics which returns similar results but is even slower (filtering it with 'queried' + big window did not help)
  #
  # @return [Set] the `:metrics` entries of the response
  def available_metrics
    @api.send(
      :request, :get, "/api/v1/metrics",
      params: { from: @from }
    ).fetch(:metrics).to_set
  end

  # Queries SOURCE_METRIC grouped by its `metric` tag over the configured window
  # and collects the value of that tag from the scope of every returned series.
  #
  # @return [Array] unique `metric` tag values (nil for a scope without that tag)
  def queries_used_by_any_hpa
    @api.send(
      :request, :get, "/api/v1/query",
      params: {
        query: "avg:#{SOURCE_METRIC}{*} by {metric}",
        from: @from,
        to: Time.now.to_i
      }
    ).fetch(:series).map do |data|
      # scope is a comma separated list of key:value tags, split on the first ":" only
      data.fetch(:scope).split(",").to_h { |t| t.split(":", 2) }["metric"]
    end.uniq
  end

  # convert fallout from query normalization to find actual metrics
  # for example default_zero(foo{a:b}) is converted to "default_zero_foo_a:b"
  # this ignores when multiple metrics are in a single query for example a / b * 100
  # since a and b are usually the same
  #
  # The given queries are left untouched (neither the collection nor its strings
  # are modified), so frozen strings are fine and callers keep their raw queries.
  #
  # @param queries [Enumerable<String>] normalized queries
  # @return [Set<String>] unique metric names, inserted in sorted order
  def extract_metrics(queries)
    queries
      .map do |query|
        query
          .sub(/\.total_\d+$/, ".total") # math leftover *.total_100 -> *.total
          .sub(/^_*(ewma_\d+|default_zero)_*/, "") # remove math
      end
      .uniq
      .sort # for debug printing and to keep the dashboard stable
      .to_set
  end

  # since available_metrics is not reliable (hits limit or just has old data)
  # we verify each potentially unknown metric 1-by-1 by hitting this cheap endpoint
  # https://docs.datadoghq.com/api/latest/metrics/?code-lang=curl#get-metric-metadata
  #
  # Modifies `unknown` in place and prints progress for every metric checked.
  #
  # @param unknown collection of metric names, must support `select!`
  def slow_filter_unknown!(unknown)
    unknown.select! do |metric|
      print "Verifying potentially unknown metric #{metric} ..."
      not_found = @api.send(:request, :get, "/api/v1/metrics/#{metric}", ignore_404: true)[:error]
      print "#{not_found ? "not found" : "found"}\n"
      not_found # keep the truly not found
    end
  end

  # Replaces the attributes of dashboard `id` with one "avg" timeseries widget
  # per given metric, in the order given.
  #
  # @param used_metrics [Enumerable<String>] metric names to show
  def update(used_metrics)
    attributes = {
      title: "HPA metrics used",
      description: <<~DESC,
        1 widget for each metric used in compute maintained kubernetes clusters (anything that reports #{SOURCE_METRIC})
        Automatically filled by a `rake hpa_dashboard` cron from kennel GHA.
        Last updated: #{Time.now} #{$stdout.tty? ? "manually" : RakeHelper.ci_url}
      DESC
      layout_type: "ordered",
      reflow_type: "auto",
      tags: ["team:compute", "team:compute-accelerate"],
      widgets: used_metrics.map do |m|
        {
          definition: {
            title: m,
            type: "timeseries",
            requests: [
              {
                response_format: "timeseries",
                queries: [
                  {
                    name: "query1",
                    data_source: "metrics",
                    query: "avg:#{m}{*}"
                  }
                ],
                display_type: "line"
              }
            ]
          }
        }
      end
    }
    @api.update("dashboard", @id, attributes)
  end
end
# Rake task: rebuilds the HPA dashboard from the queries seen in the last 24 hours.
# Aborts without touching the dashboard when extracted metrics cannot be found in datadog,
# and reports any failure to slack when not run from a terminal.
desc "Update hpa dashboard to track all currently used external metrics people that change metrics in the UI see that they are used"
task hpa_dashboard: "kennel:environment" do
  board = HpaDashboard.new(DASHBOARD_ID, timeframe: 24 * 60 * 60)

  known = board.available_metrics
  puts "Found #{known.size} available metrics"

  queries = board.queries_used_by_any_hpa
  puts "Found #{queries.size} used queries"

  metrics = board.extract_metrics(queries)
  puts "Found #{metrics.size} used metrics"

  # validate we found everything, the 1-by-1 verification is only done for a small number of candidates
  missing = metrics - known
  board.slow_filter_unknown!(missing) if missing.size < 100

  if missing.any?
    $stdout.flush # otherwise mixes with stderr in GHA
    abort <<~MSG
      #{missing.size} unknown metrics found, these would not be displayable on the dashboard, improve parsing code
      usually that means some part of the metrics got mangled and it cannot be found in datadog
      see https://datadoghq.com/metric/summary to find valid metrics
      #{missing.join("\n")}
    MSG
  end

  board.update(metrics)
  puts "Updated dashboard https://datadoghq.com/dashboard/#{board.id}"
rescue Exception # rubocop:disable Lint/RescueException
  raise if $stdout.tty? # do not spam slack when debugging

  send_to_slack <<~MSG
    HPA dashboard update failed #{RakeHelper.ci_url}, fix it!
  MSG
  raise
end