GCP load balancer https/backend_latencies buckets issue
Hi, I'm using stackdriver-exporter v0.13.0, and only recently somebody noticed that https/backend_latencies has non-zero values in every bucket, all the way up to le="4.410119471141699e+09". For example:
stackdriver_https_lb_rule_loadbalancing_googleapis_com_https_backend_latencies_bucket{backend_name="[redacted]",backend_scope="europe-west1-d",backend_scope_type="ZONE",backend_target_name="[redacted]",backend_target_type="BACKEND_SERVICE",backend_type="NETWORK_ENDPOINT_GROUP",cache_result="DISABLED",client_country="Belgium",forwarding_rule_name="[redacted]",matched_url_path_rule="/",project_id="[redacted]",protocol="HTTP/2.0",proxy_continent="Europe",region="global",response_code="200",response_code_class="200",target_proxy_name="[redacted]",unit="ms",url_map_name="[redacted]",le="4.410119471141699e+09"} 2 1688377920000
When I check the GCP Metrics Explorer, the backend latency doesn't show these buckets, so I suspect something is broken between GCP Cloud Monitoring and stackdriver-exporter.
I pulled the Cloud Monitoring metric directly; here is a sampled example of the output (a sketch of the query follows the dump):
ListTimeSeriesPager<time_series {
metric {
type: "loadbalancing.googleapis.com/https/backend_latencies"
labels {
key: "response_code"
value: "200"
}
labels {
key: "response_code_class"
value: "200"
}
labels {
key: "proxy_continent"
value: "Europe"
}
labels {
key: "protocol"
value: "HTTP/2.0"
}
labels {
key: "client_country"
value: "Belgium"
}
labels {
key: "cache_result"
value: "DISABLED"
}
}
resource {
type: "https_lb_rule"
labels {
key: "url_map_name"
value: "[redacted]"
}
labels {
key: "target_proxy_name"
value: "[redacted]"
}
labels {
key: "region"
value: "global"
}
labels {
key: "project_id"
value: "[redacted]"
}
labels {
key: "matched_url_path_rule"
value: "/"
}
labels {
key: "forwarding_rule_name"
value: "[redacted]"
}
labels {
key: "backend_type"
value: "NETWORK_ENDPOINT_GROUP"
}
labels {
key: "backend_target_type"
value: "BACKEND_SERVICE"
}
labels {
key: "backend_target_name"
value: "[redacted]"
}
labels {
key: "backend_scope"
value: "europe-west1-d"
}
labels {
key: "backend_scope_type"
value: "ZONE"
}
labels {
key: "backend_name"
value: "[redacted]"
}
}
metric_kind: DELTA
value_type: DISTRIBUTION
points {
interval {
end_time {
seconds: 1688704920
}
start_time {
seconds: 1688704860
nanos: 1000000
}
}
value {
distribution_value {
count: 12
mean: 2.624083333333334
sum_of_squared_deviation: 0.15022291666666704
bucket_options {
exponential_buckets {
num_finite_buckets: 66
growth_factor: 1.4
scale: 1
}
}
bucket_counts: 0
bucket_counts: 0
bucket_counts: 0
bucket_counts: 8
bucket_counts: 4
}
}
}
points {
interval {
end_time {
seconds: 1688704860
}
start_time {
seconds: 1688704800
nanos: 1000000
}
}
value {
distribution_value {
count: 13
mean: 2.3600769230769227
sum_of_squared_deviation: 1.774426923076923
bucket_options {
exponential_buckets {
num_finite_buckets: 66
growth_factor: 1.4
scale: 1
}
}
bucket_counts: 0
bucket_counts: 0
bucket_counts: 5
bucket_counts: 8
}
}
}
points {
interval {
end_time {
seconds: 1688704800
}
start_time {
seconds: 1688704740
nanos: 1000000
}
}
value {
distribution_value {
count: 5
mean: 2.732
bucket_options {
exponential_buckets {
num_finite_buckets: 66
growth_factor: 1.4
scale: 1
}
}
bucket_counts: 0
bucket_counts: 0
bucket_counts: 0
bucket_counts: 5
}
}
}
points {
interval {
end_time {
seconds: 1688704680
}
start_time {
seconds: 1688704620
nanos: 1000000
}
}
value {
distribution_value {
count: 15
mean: 3.1370000000000005
sum_of_squared_deviation: 6.19792
bucket_options {
exponential_buckets {
num_finite_buckets: 66
growth_factor: 1.4
scale: 1
}
}
bucket_counts: 0
bucket_counts: 0
bucket_counts: 0
bucket_counts: 10
bucket_counts: 0
bucket_counts: 5
}
}
}
points {
interval {
end_time {
seconds: 1688704620
}
start_time {
seconds: 1688704560
nanos: 1000000
}
}
value {
distribution_value {
count: 15
mean: 2.692333333333333
sum_of_squared_deviation: 0.73642333333333276
bucket_options {
exponential_buckets {
num_finite_buckets: 66
growth_factor: 1.4
scale: 1
}
}
bucket_counts: 0
bucket_counts: 0
bucket_counts: 0
bucket_counts: 5
bucket_counts: 10
}
}
}
points {
interval {
end_time {
seconds: 1688704500
}
start_time {
seconds: 1688704440
nanos: 1000000
}
}
value {
distribution_value {
count: 1
mean: 2.941
bucket_options {
exponential_buckets {
num_finite_buckets: 66
growth_factor: 1.4
scale: 1
}
}
bucket_counts: 0
bucket_counts: 0
bucket_counts: 0
bucket_counts: 0
bucket_counts: 1
}
}
}
points {
interval {
end_time {
seconds: 1688704320
}
start_time {
seconds: 1688704260
nanos: 1000000
}
}
value {
distribution_value {
count: 11
mean: 2.4069090909090907
sum_of_squared_deviation: 0.025640909090909013
bucket_options {
exponential_buckets {
num_finite_buckets: 66
growth_factor: 1.4
scale: 1
}
}
bucket_counts: 0
bucket_counts: 0
bucket_counts: 0
bucket_counts: 11
}
}
}
points {
interval {
end_time {
seconds: 1688704260
}
start_time {
seconds: 1688704200
nanos: 1000000
}
}
value {
distribution_value {
count: 10
mean: 3.293
sum_of_squared_deviation: 14.184810000000002
bucket_options {
exponential_buckets {
num_finite_buckets: 66
growth_factor: 1.4
scale: 1
}
}
bucket_counts: 0
bucket_counts: 0
bucket_counts: 0
bucket_counts: 5
bucket_counts: 0
bucket_counts: 5
}
}
}
points {
interval {
end_time {
seconds: 1688704200
}
start_time {
seconds: 1688704140
nanos: 1000000
}
}
value {
distribution_value {
count: 6
mean: 2.987
sum_of_squared_deviation: 0.048000000000000043
bucket_options {
exponential_buckets {
num_finite_buckets: 66
growth_factor: 1.4
scale: 1
}
}
bucket_counts: 0
bucket_counts: 0
bucket_counts: 0
bucket_counts: 0
bucket_counts: 6
}
}
}
points {
interval {
end_time {
seconds: 1688704140
}
start_time {
seconds: 1688704080
nanos: 1000000
}
}
value {
distribution_value {
count: 4
mean: 2.7869999999999995
bucket_options {
exponential_buckets {
num_finite_buckets: 66
growth_factor: 1.4
scale: 1
}
}
bucket_counts: 0
bucket_counts: 0
bucket_counts: 0
bucket_counts: 0
bucket_counts: 4
}
}
}
points {
interval {
end_time {
seconds: 1688704020
}
start_time {
seconds: 1688703960
nanos: 1000000
}
}
value {
distribution_value {
count: 10
mean: 2.6585
sum_of_squared_deviation: 0.048302500000000158
bucket_options {
exponential_buckets {
num_finite_buckets: 66
growth_factor: 1.4
scale: 1
}
}
bucket_counts: 0
bucket_counts: 0
bucket_counts: 0
bucket_counts: 10
}
}
}
points {
interval {
end_time {
seconds: 1688703960
}
start_time {
seconds: 1688703900
nanos: 1000000
}
}
value {
distribution_value {
count: 15
mean: 2.4699999999999998
sum_of_squared_deviation: 0.53427999999999964
bucket_options {
exponential_buckets {
num_finite_buckets: 66
growth_factor: 1.4
scale: 1
}
}
bucket_counts: 0
bucket_counts: 0
bucket_counts: 0
bucket_counts: 15
}
}
}
points {
interval {
end_time {
seconds: 1688703900
}
start_time {
seconds: 1688703840
nanos: 1000000
}
}
value {
distribution_value {
count: 13
mean: 2.6156153846153845
sum_of_squared_deviation: 0.65521107692307656
bucket_options {
exponential_buckets {
num_finite_buckets: 66
growth_factor: 1.4
scale: 1
}
}
bucket_counts: 0
bucket_counts: 0
bucket_counts: 0
bucket_counts: 8
bucket_counts: 5
}
}
}
points {
interval {
end_time {
seconds: 1688703840
}
start_time {
seconds: 1688703780
nanos: 1000000
}
}
value {
distribution_value {
count: 5
mean: 2.5065999999999997
sum_of_squared_deviation: 0.11998719999999996
bucket_options {
exponential_buckets {
num_finite_buckets: 66
growth_factor: 1.4
scale: 1
}
}
bucket_counts: 0
bucket_counts: 0
bucket_counts: 0
bucket_counts: 4
bucket_counts: 1
}
}
}
}
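For reference, this is roughly how the time series above was pulled, using the Python Cloud Monitoring client; the project id and time window below are placeholders rather than the exact query I ran:

```python
import time

from google.cloud import monitoring_v3

client = monitoring_v3.MetricServiceClient()

project_id = "my-project"  # placeholder
now = int(time.time())
interval = monitoring_v3.TimeInterval(
    {
        "end_time": {"seconds": now},
        "start_time": {"seconds": now - 15 * 60},  # last 15 minutes
    }
)

# Pull the raw delta distributions for the backend latency metric.
pager = client.list_time_series(
    request={
        "name": f"projects/{project_id}",
        "filter": 'metric.type = "loadbalancing.googleapis.com/https/backend_latencies"',
        "interval": interval,
        "view": monitoring_v3.ListTimeSeriesRequest.TimeSeriesView.FULL,
    }
)
print(pager)  # prints a ListTimeSeriesPager like the dump above
```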
I cross-checked this against https://cloud.google.com/monitoring/api/ref_v3/rest/v3/TypedValue#distribution, and these distributions look fine: the populated buckets are all in the low range of num_finite_buckets.
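As a sanity check on those bucket_options (num_finite_buckets: 66, growth_factor: 1.4, scale: 1), the boundaries can be reproduced directly from the formula in that document, where the upper bound of bucket i is scale * growth_factor**i:

```python
# Reproduce the exponential bucket upper bounds from the bucket_options above.
num_finite_buckets, growth_factor, scale = 66, 1.4, 1.0

# Upper bound of bucket i is scale * growth_factor**i.
bounds = [scale * growth_factor**i for i in range(num_finite_buckets + 1)]

print(len(bounds), bounds[0], bounds[-1])
# 67 boundaries, from 1.0 ms up to roughly 4.41e9 ms
```

The highest finite boundary comes out at roughly 4.41e9 ms, which matches the le="4.410119471141699e+09" bucket in the exported metric, so the boundary itself is expected; it's the counts sitting in it that look wrong.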
This led me to these lines in stackdriver-exporter: https://github.com/prometheus-community/stackdriver_exporter/blob/be2625d7598866ef443ee7b61ecd7ed37e462eae/collectors/monitoring_collector.go#L546-L553
I suspect the last value from the previous bucket calculation is being assigned to all of the remaining buckets instead of 0, so I printed the computed buckets map to confirm the theory (a sketch of what I think is happening follows the dump):
map[1:0 1.4:0 1.9599999999999997:0 2.7439999999999993:5 3.841599999999999:5 5.378239999999998:5 7.529535999999997:5 10.541350399999994:5 14.757890559999991:5 20.661046783999986:5 28.92546549759998:5 40.49565169663997:5 56.693912375295945:5 79.37147732541432:5 111.12006825558004:5 155.56809555781203:5 217.79533378093686:5 304.91346729331156:5 426.8788542106362:5 597.6303958948906:5 836.6825542528468:5 1171.3555759539854:5 1639.8978063355794:5 2295.856928869811:5 3214.199700417735:5 4499.879580584829:5 6299.83141281876:5 8819.763977946264:5 12347.669569124768:5 17286.73739677467:5 24201.43235548454:5 33882.00529767835:5 47434.807416749696:5 66408.73038344957:5 92972.22253682939:5 130161.11155156113:5 182225.55617218558:5 255115.7786410598:5 357162.09009748365:5 500026.9261364771:5 700037.696591068:5 980052.775227495:5 1.372073885318493e+06:5 1.92090343944589e+06:5 2.6892648152242457e+06:5 3.7649707413139436e+06:5 5.270959037839521e+06:5 7.379342652975327e+06:5 1.0331079714165458e+07:5 1.446351159983164e+07:5 2.0248916239764296e+07:5 2.8348482735670015e+07:5 3.968787582993802e+07:5 5.5563026161913216e+07:5 7.77882366266785e+07:5 1.0890353127734989e+08:5 1.5246494378828984e+08:5 2.1345092130360574e+08:5 2.98831289825048e+08:5 4.183638057550673e+08:5 5.857093280570941e+08:5 8.199930592799315e+08:5 1.147990282991904e+09:5 1.6071863961886654e+09:5 2.250060954664132e+09:5 3.1500853365297847e+09:5 4.410119471141699e+09:5 +Inf:5]
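To make the suspicion concrete, here is a minimal sketch (in Python, not the exporter's actual Go code) of what I think the linked loop effectively does, using the bucket_counts 0, 0, 0, 5 from the point ending at 1688704800 above: once the sparse bucket_counts list runs out, the running value gets written into every remaining boundary instead of 0, which reproduces the map I printed:

```python
import math

# Not the exporter's Go code, just my reading of what the linked loop does.
# bucket_counts from the point ending at 1688704800 above (count: 5).
bucket_counts = [0, 0, 0, 5]

# 67 finite boundaries (scale * growth_factor**i) plus +Inf, as in the exporter's map.
bounds = [1.0 * 1.4**i for i in range(67)] + [math.inf]

buckets = {}
cumulative = 0
for i, le in enumerate(bounds):
    if i < len(bucket_counts):
        cumulative += bucket_counts[i]
    # Once bucket_counts is exhausted, `cumulative` (5 here) is kept for every
    # remaining boundary instead of 0.
    buckets[le] = cumulative

print(buckets[bounds[2]], buckets[bounds[3]], buckets[math.inf])
# 0 5 5  -> same shape as the map above: 1.96 -> 0, 2.744 -> 5, ..., +Inf -> 5
```

If that reading is right, it would explain why every le bucket above the last populated one carries the same value in the exported metric.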
In our long-term TSDB (Thanos), the first data point with this large bucket dates back to 2023-02-13 22:30:00 UTC, so I'm not sure what originally caused this change in the bucket distribution.
Does anybody else have the same issue? And is my suspicion correct?