Skip to content

Commit cad5a39

Browse files
LennonChinpan3793
authored and committed
[KYUUBI #7072] Expose metrics of engine startup permit state
### Why are the changes needed? The metrics `kyuubi_operation_state_LaunchEngine_*` cannot reflect the state of Semaphore after configuring the maximum engine startup limit through `kyuubi.server.limit.engine.startup`, add some metrics to show the relevant permit state. ### How was this patch tested? ### Was this patch authored or co-authored using generative AI tooling? Closes #7072 from LennonChin/engine_startup_metrics. Closes #7072 d6bf369 [Lennon Chin] Expose metrics of engine startup permit status Authored-by: Lennon Chin <[email protected]> Signed-off-by: Cheng Pan <[email protected]>
1 parent bcaff5a commit cad5a39

File tree

4 files changed

+157
-24
lines changed

4 files changed

+157
-24
lines changed

docs/monitor/metrics.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,9 @@ These metrics include:
6565
| `kyuubi.engine.timeout` | | counter | 1.2.0 | <div style='width: 150pt;word-wrap: break-word;white-space: normal'> cumulative timeout engines</div> |
6666
| `kyuubi.engine.failed` | `${user}` | counter | 1.2.0 | <div style='width: 150pt;word-wrap: break-word;white-space: normal'> cumulative explicitly failed engine count for a `${user}`</div> |
6767
| `kyuubi.engine.failed` | `${errorType}` | counter | 1.2.0 | <div style='width: 150pt;word-wrap: break-word;white-space: normal'> cumulative explicitly failed engine count for a particular `${errorType}`, e.g. `ClassNotFoundException`</div> |
68+
| `kyuubi.engine.startup.permit.limit` | | meter | 1.11.0 | <div style='width: 150pt;word-wrap: break-word;white-space: normal'> maximum number of engines allowed to start up concurrently (exported to Prometheus as `kyuubi_engine_startup_permit_limit_total`) </div> |
69+
| `kyuubi.engine.startup.permit.available` | | gauge | 1.11.0 | <div style='width: 150pt;word-wrap: break-word;white-space: normal'> number of permits currently available for starting engines concurrently </div> |
70+
| `kyuubi.engine.startup.permit.waiting` | | gauge | 1.11.0 | <div style='width: 150pt;word-wrap: break-word;white-space: normal'> number of engine startups that are waiting to acquire a permit </div> |
6871
| `kyuubi.backend_service.open_session` | | timer | 1.5.0 | <div style='width: 150pt;word-wrap: break-word;white-space: normal'> kyuubi backend service `openSession` method execution time and rate </div> |
6972
| `kyuubi.backend_service.close_session` | | timer | 1.5.0 | <div style='width: 150pt;word-wrap: break-word;white-space: normal'> kyuubi backend service `closeSession` method execution time and rate </div> |
7073
| `kyuubi.backend_service.get_info` | | timer | 1.5.0 | <div style='width: 150pt;word-wrap: break-word;white-space: normal'> kyuubi backend service `getInfo` method execution time and rate </div> |

grafana/dashboard-template.json

Lines changed: 141 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -823,6 +823,123 @@
823823
"x": 16,
824824
"y": 7
825825
},
826+
"id": 104,
827+
"maxPerRow": 2,
828+
"options": {
829+
"alertThreshold": true,
830+
"legend": {
831+
"calcs": [],
832+
"displayMode": "table",
833+
"placement": "right",
834+
"showLegend": true
835+
},
836+
"tooltip": {
837+
"mode": "multi",
838+
"sort": "none"
839+
}
840+
},
841+
"repeatDirection": "h",
842+
"targets": [
843+
{
844+
"datasource": "${DS_PROMETHEUS}",
845+
"editorMode": "code",
846+
"expr": " kyuubi_engine_startup_permit_limit_total{$baseFilter,instance=~\"$instance\"}",
847+
"hide": false,
848+
"legendFormat": "${baseLegend}-limit",
849+
"range": true,
850+
"refId": "A"
851+
},
852+
{
853+
"datasource": "${DS_PROMETHEUS}",
854+
"editorMode": "code",
855+
"expr": " kyuubi_engine_startup_permit_waiting{$baseFilter,instance=~\"$instance\"}",
856+
"hide": false,
857+
"legendFormat": "${baseLegend}-waiting",
858+
"range": true,
859+
"refId": "B"
860+
},
861+
{
862+
"datasource": "${DS_PROMETHEUS}",
863+
"editorMode": "code",
864+
"expr": " kyuubi_engine_startup_permit_available{$baseFilter,instance=~\"$instance\"}",
865+
"hide": false,
866+
"legendFormat": "${baseLegend}-available",
867+
"range": true,
868+
"refId": "C"
869+
}
870+
],
871+
"title": "Engine startup permit",
872+
"type": "timeseries"
873+
},
874+
{
875+
"datasource": {
876+
"type": "prometheus",
877+
"uid": "${DS_PROMETHEUS}"
878+
},
879+
"description": "",
880+
"fieldConfig": {
881+
"defaults": {
882+
"color": {
883+
"mode": "palette-classic"
884+
},
885+
"custom": {
886+
"axisBorderShow": false,
887+
"axisCenteredZero": false,
888+
"axisColorMode": "text",
889+
"axisLabel": "",
890+
"axisPlacement": "auto",
891+
"barAlignment": 0,
892+
"barWidthFactor": 0.6,
893+
"drawStyle": "line",
894+
"fillOpacity": 10,
895+
"gradientMode": "none",
896+
"hideFrom": {
897+
"legend": false,
898+
"tooltip": false,
899+
"viz": false
900+
},
901+
"insertNulls": false,
902+
"lineInterpolation": "linear",
903+
"lineWidth": 1,
904+
"pointSize": 5,
905+
"scaleDistribution": {
906+
"type": "linear"
907+
},
908+
"showPoints": "never",
909+
"spanNulls": false,
910+
"stacking": {
911+
"group": "A",
912+
"mode": "none"
913+
},
914+
"thresholdsStyle": {
915+
"mode": "off"
916+
}
917+
},
918+
"links": [],
919+
"mappings": [],
920+
"thresholds": {
921+
"mode": "absolute",
922+
"steps": [
923+
{
924+
"color": "green",
925+
"value": null
926+
},
927+
{
928+
"color": "red",
929+
"value": 80
930+
}
931+
]
932+
},
933+
"unit": "ms"
934+
},
935+
"overrides": []
936+
},
937+
"gridPos": {
938+
"h": 7,
939+
"w": 8,
940+
"x": 0,
941+
"y": 13
942+
},
826943
"id": 75,
827944
"maxPerRow": 2,
828945
"options": {
@@ -918,7 +1035,7 @@
9181035
"gridPos": {
9191036
"h": 7,
9201037
"w": 8,
921-
"x": 0,
1038+
"x": 8,
9221039
"y": 13
9231040
},
9241041
"id": 77,
@@ -1025,7 +1142,7 @@
10251142
"gridPos": {
10261143
"h": 7,
10271144
"w": 8,
1028-
"x": 8,
1145+
"x": 16,
10291146
"y": 13
10301147
},
10311148
"id": 79,
@@ -1130,10 +1247,10 @@
11301247
"overrides": []
11311248
},
11321249
"gridPos": {
1133-
"h": 7,
1250+
"h": 6,
11341251
"w": 8,
1135-
"x": 16,
1136-
"y": 13
1252+
"x": 0,
1253+
"y": 20
11371254
},
11381255
"id": 80,
11391256
"maxPerRow": 2,
@@ -1236,7 +1353,7 @@
12361353
"gridPos": {
12371354
"h": 6,
12381355
"w": 8,
1239-
"x": 0,
1356+
"x": 8,
12401357
"y": 20
12411358
},
12421359
"id": 34,
@@ -1335,7 +1452,7 @@
13351452
"gridPos": {
13361453
"h": 6,
13371454
"w": 8,
1338-
"x": 8,
1455+
"x": 16,
13391456
"y": 20
13401457
},
13411458
"id": 71,
@@ -1430,8 +1547,8 @@
14301547
"gridPos": {
14311548
"h": 6,
14321549
"w": 8,
1433-
"x": 16,
1434-
"y": 20
1550+
"x": 0,
1551+
"y": 26
14351552
},
14361553
"id": 76,
14371554
"maxPerRow": 2,
@@ -1478,7 +1595,7 @@
14781595
"h": 1,
14791596
"w": 24,
14801597
"x": 0,
1481-
"y": 26
1598+
"y": 32
14821599
},
14831600
"id": 88,
14841601
"panels": [],
@@ -1549,7 +1666,7 @@
15491666
"h": 8,
15501667
"w": 12,
15511668
"x": 0,
1552-
"y": 27
1669+
"y": 33
15531670
},
15541671
"id": 89,
15551672
"maxPerRow": 2,
@@ -1646,7 +1763,7 @@
16461763
"h": 8,
16471764
"w": 12,
16481765
"x": 12,
1649-
"y": 27
1766+
"y": 33
16501767
},
16511768
"id": 92,
16521769
"maxPerRow": 2,
@@ -1743,7 +1860,7 @@
17431860
"h": 7,
17441861
"w": 12,
17451862
"x": 0,
1746-
"y": 35
1863+
"y": 41
17471864
},
17481865
"id": 90,
17491866
"maxPerRow": 2,
@@ -1849,7 +1966,7 @@
18491966
"h": 7,
18501967
"w": 12,
18511968
"x": 12,
1852-
"y": 35
1969+
"y": 41
18531970
},
18541971
"id": 91,
18551972
"maxPerRow": 2,
@@ -1899,7 +2016,7 @@
18992016
"h": 1,
19002017
"w": 24,
19012018
"x": 0,
1902-
"y": 42
2019+
"y": 48
19032020
},
19042021
"id": 93,
19052022
"panels": [],
@@ -1970,7 +2087,7 @@
19702087
"h": 8,
19712088
"w": 12,
19722089
"x": 0,
1973-
"y": 43
2090+
"y": 49
19742091
},
19752092
"id": 94,
19762093
"maxPerRow": 2,
@@ -2076,7 +2193,7 @@
20762193
"h": 8,
20772194
"w": 12,
20782195
"x": 12,
2079-
"y": 43
2196+
"y": 49
20802197
},
20812198
"id": 99,
20822199
"maxPerRow": 2,
@@ -2173,7 +2290,7 @@
21732290
"h": 8,
21742291
"w": 12,
21752292
"x": 0,
2176-
"y": 51
2293+
"y": 57
21772294
},
21782295
"id": 98,
21792296
"maxPerRow": 2,
@@ -2271,7 +2388,7 @@
22712388
"h": 8,
22722389
"w": 12,
22732390
"x": 12,
2274-
"y": 51
2391+
"y": 57
22752392
},
22762393
"id": 97,
22772394
"maxPerRow": 2,
@@ -2321,7 +2438,7 @@
23212438
"h": 1,
23222439
"w": 24,
23232440
"x": 0,
2324-
"y": 59
2441+
"y": 65
23252442
},
23262443
"id": 68,
23272444
"panels": [],
@@ -2392,7 +2509,7 @@
23922509
"h": 8,
23932510
"w": 12,
23942511
"x": 0,
2395-
"y": 60
2512+
"y": 66
23962513
},
23972514
"id": 100,
23982515
"options": {
@@ -2511,7 +2628,7 @@
25112628
"h": 8,
25122629
"w": 12,
25132630
"x": 12,
2514-
"y": 60
2631+
"y": 66
25152632
},
25162633
"id": 101,
25172634
"options": {
@@ -2601,7 +2718,7 @@
26012718
"h": 8,
26022719
"w": 12,
26032720
"x": 0,
2604-
"y": 68
2721+
"y": 74
26052722
},
26062723
"id": 49,
26072724
"options": {
@@ -2653,7 +2770,7 @@
26532770
"h": 1,
26542771
"w": 24,
26552772
"x": 0,
2656-
"y": 68
2773+
"y": 82
26572774
},
26582775
"id": 60,
26592776
"panels": [

kyuubi-metrics/src/main/scala/org/apache/kyuubi/metrics/MetricsConstants.scala

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,11 @@ object MetricsConstants {
6060
final val ENGINE_TIMEOUT: String = ENGINE + "timeout"
6161
final val ENGINE_TOTAL: String = ENGINE + "total"
6262

63+
final private val ENGINE_STARTUP_PERMIT: String = ENGINE + "startup.permit."
64+
final val ENGINE_STARTUP_PERMIT_LIMIT: String = ENGINE_STARTUP_PERMIT + "limit"
65+
final val ENGINE_STARTUP_PERMIT_AVAILABLE: String = ENGINE_STARTUP_PERMIT + "available"
66+
final val ENGINE_STARTUP_PERMIT_WAITING: String = ENGINE_STARTUP_PERMIT + "waiting"
67+
6368
final private val OPERATION = KYUUBI + "operation."
6469
final val OPERATION_OPEN: String = OPERATION + "opened"
6570
final val OPERATION_FAIL: String = OPERATION + "failed"

kyuubi-server/src/main/scala/org/apache/kyuubi/session/KyuubiSessionManager.scala

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,14 @@ class KyuubiSessionManager private (name: String) extends SessionManager(name) {
300300
ms.registerGauge(EXEC_POOL_ALIVE, getExecPoolSize, 0)
301301
ms.registerGauge(EXEC_POOL_ACTIVE, getActiveCount, 0)
302302
ms.registerGauge(EXEC_POOL_WORK_QUEUE_SIZE, getWorkQueueSize, 0)
303+
this.engineStartupProcessSemaphore.foreach { semaphore =>
304+
ms.markMeter(ENGINE_STARTUP_PERMIT_LIMIT, semaphore.availablePermits)
305+
ms.registerGauge(
306+
ENGINE_STARTUP_PERMIT_AVAILABLE,
307+
semaphore.availablePermits,
308+
semaphore.availablePermits)
309+
ms.registerGauge(ENGINE_STARTUP_PERMIT_WAITING, semaphore.getQueueLength, 0)
310+
}
303311
}
304312
super.start()
305313
startEngineAliveChecker()

0 commit comments

Comments
 (0)