diff --git a/hadoop-ozone/dist/src/main/compose/common/grafana/dashboards/Ozone - Disk Balancer.json b/hadoop-ozone/dist/src/main/compose/common/grafana/dashboards/Ozone - Disk Balancer.json
new file mode 100644
index 000000000000..eaafef839c71
--- /dev/null
+++ b/hadoop-ozone/dist/src/main/compose/common/grafana/dashboards/Ozone - Disk Balancer.json
@@ -0,0 +1,358 @@
+{
+  "annotations": {
+    "list": [
+      {
+        "builtIn": true,
+        "enable": true,
+        "hide": true,
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 1,
+  "id": null,
+  "links": [],
+  "liveNow": false,
+  "panels": [
+    {
+      "id": 2,
+      "title": "Disk Space Convergence Profile (Historical Before vs After)",
+      "type": "timeseries",
+      "description": "Shows how full each disk is over time. When the balancer is running, you should see these lines slowly move closer together until they meet in the middle.",
+      "gridPos": { "x": 0, "y": 0, "w": 12, "h": 7 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "targets": [
+        {
+          "expr": "sum({__name__=~\"volume_info_metrics_.*_ozone_used\", instance=~\"$datanode.*\"}) by (instance, storagedirectory)",
+          "legendFormat": "{{storagedirectory}}",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "decbytes",
+          "color": { "mode": "palette-classic" },
+          "custom": {
+            "drawStyle": "line",
+            "lineInterpolation": "smooth",
+            "lineWidth": 2
+          }
+        }
+      }
+    },
+    {
+      "id": 3,
+      "title": "Current Disk Space Allocation Skew (Max - Min Volume Gap)",
+      "type": "bargauge",
+      "description": "The exact size of the data gap between your most full disk and your emptiest disk right now. The balancer's goal is to shrink this number down to zero.",
+      "gridPos": { "x": 12, "y": 0, "w": 12, "h": 7 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "targets": [
+        {
+          "expr": "max(sum({__name__=~\"volume_info_metrics_.*_ozone_used\", instance=~\"$datanode.*\"}) by (instance, storagedirectory)) by (instance) - min(sum({__name__=~\"volume_info_metrics_.*_ozone_used\", instance=~\"$datanode.*\"}) by (instance, storagedirectory)) by (instance)",
+          "instant": true,
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "decbytes",
+          "max": 5368709120
+        }
+      },
+      "options": {
+        "displayMode": "lcd",
+        "orientation": "horizontal",
+        "reduceOptions": { "calcs": ["lastNotNull"] }
+      }
+    },
+    {
+      "id": 5,
+      "title": "Active Balancing Throughput Speed",
+      "type": "stat",
+      "description": "The live speedometer. This shows exactly how fast data is physically copying from one disk to another across the node.",
+      "gridPos": { "x": 0, "y": 7, "w": 6, "h": 7 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "targets": [
+        {
+          "expr": "rate(disk_balancer_service_metrics_success_bytes{instance=~\"$datanode.*\"}[1m])",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "Bps"
+        }
+      },
+      "options": {
+        "graphMode": "area",
+        "textMode": "value",
+        "reduceOptions": { "calcs": ["lastNotNull"] }
+      }
+    },
+    {
+      "id": 7,
+      "title": "Unscheduled Target Backlog",
+      "type": "stat",
+      "description": "The remaining amount of data the balancer still needs to queue up to reach perfect balance.",
+      "gridPos": { "x": 6, "y": 7, "w": 8, "h": 7 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "targets": [
+        {
+          "expr": "max({__name__=~\"volume_info_metrics_data_disk[0-9]+_ozone_used\", instance=~\"$datanode.*\"}) by (instance) - avg({__name__=~\"volume_info_metrics_data_disk[0-9]+_ozone_used\", instance=~\"$datanode.*\"}) by (instance)",
+          "instant": true,
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "decbytes"
+        }
+      },
+      "options": {
+        "graphMode": "none",
+        "textMode": "value",
+        "reduceOptions": { "calcs": ["lastNotNull"] }
+      }
+    },
+    {
+      "id": 8,
+      "title": "Est. Time for Unscheduled Backlog",
+      "type": "stat",
+      "description": "A rough estimate of how many minutes are left before the balancer finishes balancing.",
+      "gridPos": { "x": 14, "y": 7, "w": 10, "h": 7 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "targets": [
+        {
+          "expr": "((sum(max({__name__=~\"volume_info_metrics_data_disk[0-9]+_ozone_used\", instance=~\"$datanode.*\"}) by (instance) - avg({__name__=~\"volume_info_metrics_data_disk[0-9]+_ozone_used\", instance=~\"$datanode.*\"}) by (instance)) / (sum(rate(disk_balancer_service_metrics_success_bytes{instance=~\"$datanode.*\"}[1m])) > 0)) / 60) or vector(0)",
+          "instant": true,
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "m"
+        }
+      },
+      "options": {
+        "graphMode": "none",
+        "textMode": "value",
+        "reduceOptions": { "calcs": ["lastNotNull"] }
+      }
+    },
+    {
+      "id": 6,
+      "title": "Container Migration Latency Profile (Processing Duration)",
+      "type": "timeseries",
+      "description": "How much time it takes the system to successfully copy, verify, and finalize a single container.",
+      "gridPos": { "x": 0, "y": 14, "w": 24, "h": 7 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "targets": [
+        {
+          "expr": "{__name__=~\"disk_balancer_service_metrics_copy_success_latency|disk_balancer_service_metrics_copy_failure_latency|disk_balancer_service_metrics_move_success_time_avg_time\", instance=~\"$datanode.*\"}",
+          "legendFormat": "{{__name__}}",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "ms",
+          "color": { "mode": "palette-classic" }
+        }
+      }
+    },
+    {
+      "id": 11,
+      "title": "Overall Move Success Rate",
+      "type": "gauge",
+      "description": "The percentage of container moves that finished perfectly.",
+      "gridPos": { "x": 0, "y": 21, "w": 5, "h": 6 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "targets": [
+        {
+          "expr": "(sum(disk_balancer_service_metrics_success_count{instance=~\"$datanode.*\"}) / (sum(disk_balancer_service_metrics_success_count{instance=~\"$datanode.*\"}) + sum(disk_balancer_service_metrics_failure_count{instance=~\"$datanode.*\"}) + 0.00001)) * 100",
+          "instant": true,
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent",
+          "min": 0,
+          "max": 100,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "value": 0, "color": "red" },
+              { "value": 80, "color": "green" }
+            ]
+          }
+        }
+      },
+      "options": {
+        "reduceOptions": { "calcs": ["lastNotNull"] }
+      }
+    },
+    {
+      "id": 12,
+      "title": "Total Lifetime Container Moves",
+      "type": "stat",
+      "description": "The total number of containers successfully relocated by the balancer.",
+      "gridPos": { "x": 5, "y": 21, "w": 4, "h": 6 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "targets": [
+        {
+          "expr": "max_over_time(disk_balancer_service_metrics_success_count{instance=~\"$datanode.*\"}[1h])",
+          "instant": true,
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "noValue": "0",
+          "unit": "short"
+        }
+      },
+      "options": {
+        "graphMode": "none",
+        "textMode": "value",
+        "reduceOptions": { "calcs": ["lastNotNull"] }
+      }
+    },
+    {
+      "id": 14,
+      "title": "Total Lifetime Data Balanced",
+      "type": "stat",
+      "description": "The total physical weight of all the data successfully shifted across the disks since the balancer started.",
+      "gridPos": { "x": 9, "y": 21, "w": 5, "h": 6 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "targets": [
+        {
+          "expr": "max_over_time(disk_balancer_service_metrics_success_bytes{instance=~\"$datanode.*\"}[1h])",
+          "instant": true,
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "noValue": "0",
+          "unit": "decbytes"
+        }
+      },
+      "options": {
+        "graphMode": "area",
+        "textMode": "auto",
+        "reduceOptions": { "calcs": ["lastNotNull"] }
+      }
+    },
+    {
+      "id": 15,
+      "title": "Failed Container Moves",
+      "type": "stat",
+      "description": "The number of times a move was aborted due to error. This should ideally stay at 0.",
+      "gridPos": { "x": 14, "y": 21, "w": 4, "h": 6 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "targets": [
+        {
+          "expr": "max_over_time(disk_balancer_service_metrics_failure_count{instance=~\"$datanode.*\"}[1h])",
+          "instant": true,
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "noValue": "0",
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "value": 0, "color": "green" },
+              { "value": 1, "color": "red" }
+            ]
+          }
+        }
+      },
+      "options": {
+        "graphMode": "area",
+        "textMode": "auto",
+        "reduceOptions": { "calcs": ["lastNotNull"] }
+      }
+    },
+    {
+      "id": 13,
+      "title": "Balancer Idle Reason",
+      "type": "piechart",
+      "description": "Shows why the balancer is taking a break. 'Target Reached' means the disks are already balanced. 'Throttled' means it's slowing down on purpose so it doesn't overload the server.",
+      "gridPos": { "x": 18, "y": 21, "w": 6, "h": 6 },
+      "datasource": { "type": "prometheus", "uid": "${datasource}" },
+      "targets": [
+        {
+          "expr": "sum(increase(disk_balancer_service_metrics_idle_loop_no_available_volume_pair_count{instance=~\"$datanode.*\"}[15m]))",
+          "instant": true,
+          "legendFormat": "Target Reached",
+          "refId": "A"
+        },
+        {
+          "expr": "sum(increase(disk_balancer_service_metrics_idle_loop_exceeds_bandwidth_count{instance=~\"$datanode.*\"}[15m]))",
+          "instant": true,
+          "legendFormat": "Throttled",
+          "refId": "B"
+        }
+      ],
+      "options": {
+        "pieType": "donut",
+        "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true },
+        "reduceOptions": { "calcs": ["lastNotNull"] }
+      }
+    }
+  ],
+  "refresh": "5s",
+  "schemaVersion": 38,
+  "style": "dark",
+  "tags": ["Ozone", "DiskBalancer"],
+  "templating": {
+    "list": [
+      {
+        "current": {},
+        "hide": 0,
+        "includeAll": false,
+        "label": "Data Source",
+        "multi": false,
+        "name": "datasource",
+        "options": [],
+        "query": "prometheus",
+        "refresh": 1,
+        "skipUrlSync": false,
+        "type": "datasource"
+      },
+      {
+        "current": {},
+        "datasource": { "type": "prometheus", "uid": "${datasource}" },
+        "definition": "label_values(disk_balancer_service_metrics_volume_data_density, instance)",
+        "hide": 0,
+        "includeAll": true,
+        "multi": true,
+        "name": "datanode",
+        "options": [],
+        "query": {
+          "query": "label_values(disk_balancer_service_metrics_volume_data_density, instance)",
+          "refId": "StandardVariableQuery"
+        },
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 1,
+        "type": "query"
+      }
+    ]
+  },
+  "time": {
+    "from": "now-5m",
+    "to": "now"
+  },
+  "timezone": "browser",
+  "title": "Ozone Disk Balancer Operations",
+  "uid": "ozone-disk-balancer"
+}