Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,358 @@
{
"annotations": {
"list": [
{
"builtIn": true,
"enable": true,
"hide": true,
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"id": 2,
"title": "Disk Space Convergence Profile (Historical Before vs After)",
"type": "timeseries",
"description": "Shows how full each disk is over time. When the balancer is running, you should see these lines slowly move closer together until they meet in the middle.",
"gridPos": { "x": 0, "y": 0, "w": 12, "h": 7 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"targets": [
{
"expr": "sum({__name__=~\"volume_info_metrics_.*_ozone_used\", instance=~\"$datanode.*\"}) by (instance, storagedirectory)",
"legendFormat": "{{storagedirectory}}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "decbytes",
"color": { "mode": "palette-classic" },
"custom": {
"drawStyle": "line",
"lineInterpolation": "smooth",
"lineWidth": 2
}
}
}
},
{
"id": 3,
"title": "Current Disk Space Allocation Skew (Max - Min Volume Gap)",
"type": "bargauge",
"description": "The exact size of the data gap between your most full disk and your emptiest disk right now. The balancer's goal is to shrink this number down to zero.",
"gridPos": { "x": 12, "y": 0, "w": 12, "h": 7 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"targets": [
{
"expr": "max(sum({__name__=~\"volume_info_metrics_.*_ozone_used\", instance=~\"$datanode.*\"}) by (instance, storagedirectory)) by (instance) - min(sum({__name__=~\"volume_info_metrics_.*_ozone_used\", instance=~\"$datanode.*\"}) by (instance, storagedirectory)) by (instance)",
"instant": true,
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "decbytes",
"max": 5368709120
}
},
"options": {
"displayMode": "lcd",
"orientation": "horizontal",
"reduceOptions": { "calcs": ["lastNotNull"] }
}
},
{
"id": 5,
"title": "Active Balancing Throughput Speed",
"type": "stat",
"description": "Average rate over the last minute at which bytes are successfully moved from one disk to another on the selected datanode(s).",
"gridPos": { "x": 0, "y": 7, "w": 6, "h": 7 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"targets": [
{
"expr": "rate(disk_balancer_service_metrics_success_bytes{instance=~\"$datanode.*\"}[1m])",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "Bps"
}
},
"options": {
"graphMode": "area",
"textMode": "value",
"reduceOptions": { "calcs": ["lastNotNull"] }
}
},
{
"id": 7,
"title": "Unscheduled Target Backlog",
"type": "stat",
"description": "The gap between the fullest disk's used space and the average used space across the node's disks; an approximation of how much data the balancer still needs to move to reach balance.",
"gridPos": { "x": 6, "y": 7, "w": 8, "h": 7 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"targets": [
{
"expr": "max({__name__=~\"volume_info_metrics_data_disk[0-9]+_ozone_used\", instance=~\"$datanode.*\"}) by (instance) - avg({__name__=~\"volume_info_metrics_data_disk[0-9]+_ozone_used\", instance=~\"$datanode.*\"}) by (instance)",
"instant": true,
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "decbytes"
}
},
"options": {
"graphMode": "none",
"textMode": "value",
"reduceOptions": { "calcs": ["lastNotNull"] }
}
},
{
"id": 8,
"title": "Est. Time for Unscheduled Backlog",
"type": "stat",
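"description": "A rough estimate of how many minutes are left before the balancer finishes balancing, computed as the remaining backlog divided by the current throughput. Shows 0 when no data is currently being moved, so 0 does not necessarily mean balancing is complete.",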
"gridPos": { "x": 14, "y": 7, "w": 10, "h": 7 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"targets": [
{
"expr": "((sum(max({__name__=~\"volume_info_metrics_data_disk[0-9]+_ozone_used\", instance=~\"$datanode.*\"}) by (instance) - avg({__name__=~\"volume_info_metrics_data_disk[0-9]+_ozone_used\", instance=~\"$datanode.*\"}) by (instance)) / (sum(rate(disk_balancer_service_metrics_success_bytes{instance=~\"$datanode.*\"}[1m])) > 0)) / 60) or vector(0)",
"instant": true,
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "m"
}
},
"options": {
"graphMode": "none",
"textMode": "value",
"reduceOptions": { "calcs": ["lastNotNull"] }
}
},
{
"id": 6,
"title": "Container Migration Latency Profile (Processing Duration)",
"type": "timeseries",
"description": "How much time container copy attempts take (both successful and failed), and the average time taken to complete a successful container move.",
"gridPos": { "x": 0, "y": 14, "w": 24, "h": 7 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"targets": [
{
"expr": "{__name__=~\"disk_balancer_service_metrics_copy_success_latency|disk_balancer_service_metrics_copy_failure_latency|disk_balancer_service_metrics_move_success_time_avg_time\", instance=~\"$datanode.*\"}",
"legendFormat": "{{__name__}}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "ms",
"color": { "mode": "palette-classic" }
}
}
},
{
"id": 11,
"title": "Overall Move Success Rate",
"type": "gauge",
"description": "The percentage of container moves that completed successfully, aggregated across all datanodes (this panel is not filtered by the datanode selector).",
"gridPos": { "x": 0, "y": 21, "w": 5, "h": 6 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"targets": [
{
"expr": "(sum(disk_balancer_service_metrics_success_count) / (sum(disk_balancer_service_metrics_success_count) + sum(disk_balancer_service_metrics_failure_count) + 0.00001)) * 100",
"instant": true,
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"thresholds": {
"mode": "absolute",
"steps": [
{ "value": 0, "color": "red" },
{ "value": 80, "color": "green" }
]
}
}
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"] }
}
},
{
"id": 12,
"title": "Total Lifetime Container Moves",
"type": "stat",
"description": "The number of containers successfully moved between disks by the balancer, shown as the highest counter value observed in the last hour.",
"gridPos": { "x": 5, "y": 21, "w": 4, "h": 6 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"targets": [
{
"expr": "max_over_time(disk_balancer_service_metrics_success_count{instance=~\"$datanode.*\"}[1h])",
"instant": true,
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"noValue": "0",
"unit": "short"
}
},
"options": {
"graphMode": "none",
"textMode": "value",
"reduceOptions": { "calcs": ["lastNotNull"] }
}
},
{
"id": 14,
"title": "Total Lifetime Data Balanced",
"type": "stat",
"description": "The total size of all data successfully moved between disks by the balancer, shown as the highest counter value observed in the last hour.",
"gridPos": { "x": 9, "y": 21, "w": 5, "h": 6 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"targets": [
{
"expr": "max_over_time(disk_balancer_service_metrics_success_bytes{instance=~\"$datanode.*\"}[1h])",
"instant": true,
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"noValue": "0",
"unit": "decbytes"
}
},
"options": {
"graphMode": "area",
"textMode": "auto",
"reduceOptions": { "calcs": ["lastNotNull"] }
}
},
{
"id": 15,
"title": "Failed Container Moves",
"type": "stat",
"description": "The number of container moves that failed due to an error. This should ideally stay at 0.",
"gridPos": { "x": 14, "y": 21, "w": 4, "h": 6 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"targets": [
{
"expr": "max_over_time(disk_balancer_service_metrics_failure_count{instance=~\"$datanode.*\"}[1h])",
"instant": true,
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"noValue": "0",
"thresholds": {
"mode": "absolute",
"steps": [
{ "value": 0, "color": "green" },
{ "value": 1, "color": "red" }
]
}
}
},
"options": {
"graphMode": "area",
"textMode": "auto",
"reduceOptions": { "calcs": ["lastNotNull"] }
}
},
{
"id": 13,
"title": "Balancer Idle Reason",
"type": "piechart",
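"description": "Shows why the balancer was idle over the last 15 minutes. 'Target Reached' counts idle loops in which no volume pair was available to balance, which typically means the disks are already balanced. 'Throttled' counts idle loops in which the bandwidth limit was exceeded, so the balancer paused to avoid overloading the server.",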
"gridPos": { "x": 18, "y": 21, "w": 6, "h": 6 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"targets": [
{
"expr": "sum(increase(disk_balancer_service_metrics_idle_loop_no_available_volume_pair_count[15m]))",
"instant": true,
"legendFormat": "Target Reached",
"refId": "A"
},
{
"expr": "sum(increase(disk_balancer_service_metrics_idle_loop_exceeds_bandwidth_count[15m]))",
"instant": true,
"legendFormat": "Throttled",
"refId": "B"
}
],
"options": {
"pieType": "donut",
"legend": { "displayMode": "list", "placement": "bottom", "showLegend": true },
"reduceOptions": { "calcs": ["lastNotNull"] }
}
}
],
"refresh": "5s",
"schemaVersion": 38,
"style": "dark",
"tags": ["Ozone", "DiskBalancer"],
"templating": {
"list": [
{
"current": {},
"hide": 0,
"includeAll": false,
"label": "Data Source",
"multi": false,
"name": "datasource",
"options": [],
"query": "prometheus",
"refresh": 1,
"skipUrlSync": false,
"type": "datasource"
},
{
"current": {},
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"definition": "label_values(disk_balancer_service_metrics_volume_data_density, instance)",
"hide": 0,
"includeAll": true,
"multi": true,
"name": "datanode",
"options": [],
"query": {
"query": "label_values(disk_balancer_service_metrics_volume_data_density, instance)",
"refId": "StandardVariableQuery"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
}
]
},
"time": {
"from": "now-5m",
"to": "now"
},
"timezone": "browser",
"title": "Ozone Disk Balancer Operations",
"uid": "ozone-disk-balancer"
}