CP-53858: Domain CPU ready RRD1 metric - runnable_any

BengangY · gangj · commit c2663f2873f2 · 2025-08-06T15:45:20.000+08:00
Add a new metric 'runnable_any': the % of time that at least one vCPU of the domain is in the runnable state. It is the sum of the following 3 metrics: - runstate_full_contention - runstate_concurrency_hazard - runstate_partial_contention It is named 'runnable_any' rather than 'runnable' to avoid a problem with rrd2csv: if it were named 'runnable', rrd2csv would select both RRD1 ("runnable") and RRD2 ("runnable_vcpus") whenever 'runnable' is requested: > rrd2csv AVERAGE:vm:<vm-uuid>:runnable > timestamp, AVERAGE:vm:<vm-uuid>:runnable, AVERAGE:vm:<vm-uuid>:runnable_vcpus This is because "runnable" is a prefix of "runnable_vcpus". With the name 'runnable_any', rrd2csv: * selects only RRD1 if we use: rrd2csv AVERAGE:vm:<vm-uuid>:runnable_any * selects only RRD2 if we use: rrd2csv AVERAGE:vm:<vm-uuid>:runnable_vcpus * selects both RRD1 and RRD2 if we use: rrd2csv AVERAGE:vm:<vm-uuid>:runnable The name 'runnable_any' is also more descriptive, since the metric is the % of time that at least one vCPU of the domain is in the runnable state. Also set max on the "runnable_any" metric, following the fix in #6493. Signed-off-by: Bengang Yuan <bengang.yuan@cloud.com> [Rebase with renaming and one fix] Signed-off-by: Gang Ji <gang.ji@cloud.com>
diff --git a/ocaml/xcp-rrdd/bin/rrdp-cpu/rrdp_cpu.ml b/ocaml/xcp-rrdd/bin/rrdp-cpu/rrdp_cpu.ml
@@ -20,7 +20,7 @@ module Process = Rrdd_plugin.Process (struct let name = "xcp-rrdd-cpu" end)
 
 let xen_flag_complement = Int64.(shift_left 1L 63 |> lognot)
 
-(* This function is used for getting vcpu stats of the VMs present on this host. *)
+(* This function is used for getting vCPU stats of the VMs present on this host. *)
 let dss_vcpus xc doms =
   List.fold_left
     (fun dss (dom, uuid, domid) ->
@@ -49,27 +49,28 @@ let dss_vcpus xc doms =
           in
           cpus (i + 1) (cputime_rrd :: dss)
       in
-      (* Runstate info is per-domain rather than per-vcpu *)
+      (* Runstate info is per-domain rather than per-vCPU *)
       let dss =
         let dom_cpu_time =
           Int64.(to_float @@ logand dom.Xenctrl.cpu_time xen_flag_complement)
         in
         let dom_cpu_time =
           dom_cpu_time /. (1.0e9 *. float_of_int dom.Xenctrl.nr_online_vcpus)
         in
+        let ( ++ ) = Int64.add in
         try
           let ri = Xenctrl.domain_get_runstate_info xc domid in
           ( Rrd.VM uuid
           , Ds.ds_make ~name:"runstate_fullrun" ~units:"(fraction)"
               ~value:(Rrd.VT_Float (Int64.to_float ri.Xenctrl.time0 /. 1.0e9))
-              ~description:"Fraction of time that all VCPUs are running"
+              ~description:"Fraction of time that all vCPUs are running"
               ~ty:Rrd.Derive ~default:false ~min:0.0 ~max:1.0 ()
           )
           :: ( Rrd.VM uuid
              , Ds.ds_make ~name:"runstate_full_contention" ~units:"(fraction)"
                  ~value:(Rrd.VT_Float (Int64.to_float ri.Xenctrl.time1 /. 1.0e9))
                  ~description:
-                   "Fraction of time that all VCPUs are runnable (i.e., \
+                   "Fraction of time that all vCPUs are runnable (i.e., \
                     waiting for CPU)"
                  ~ty:Rrd.Derive ~default:false ~min:0.0 ~max:1.0 ()
              )
@@ -78,22 +79,22 @@ let dss_vcpus xc doms =
                  ~units:"(fraction)"
                  ~value:(Rrd.VT_Float (Int64.to_float ri.Xenctrl.time2 /. 1.0e9))
                  ~description:
-                   "Fraction of time that some VCPUs are running and some are \
+                   "Fraction of time that some vCPUs are running and some are \
                     runnable"
                  ~ty:Rrd.Derive ~default:false ~min:0.0 ~max:1.0 ()
              )
           :: ( Rrd.VM uuid
              , Ds.ds_make ~name:"runstate_blocked" ~units:"(fraction)"
                  ~value:(Rrd.VT_Float (Int64.to_float ri.Xenctrl.time3 /. 1.0e9))
                  ~description:
-                   "Fraction of time that all VCPUs are blocked or offline"
+                   "Fraction of time that all vCPUs are blocked or offline"
                  ~ty:Rrd.Derive ~default:false ~min:0.0 ~max:1.0 ()
              )
           :: ( Rrd.VM uuid
              , Ds.ds_make ~name:"runstate_partial_run" ~units:"(fraction)"
                  ~value:(Rrd.VT_Float (Int64.to_float ri.Xenctrl.time4 /. 1.0e9))
                  ~description:
-                   "Fraction of time that some VCPUs are running, and some are \
+                   "Fraction of time that some vCPUs are running and some are \
                     blocked"
                  ~ty:Rrd.Derive ~default:false ~min:0.0 ~max:1.0 ()
              )
@@ -102,10 +103,27 @@ let dss_vcpus xc doms =
                  ~units:"(fraction)"
                  ~value:(Rrd.VT_Float (Int64.to_float ri.Xenctrl.time5 /. 1.0e9))
                  ~description:
-                   "Fraction of time that some VCPUs are runnable and some are \
+                   "Fraction of time that some vCPUs are runnable and some are \
                     blocked"
                  ~ty:Rrd.Derive ~default:false ~min:0.0 ~max:1.0 ()
              )
+          :: ( Rrd.VM uuid
+             , Ds.ds_make ~name:"runnable_any" ~units:"(fraction)"
+                 ~value:
+                   (Rrd.VT_Float
+                      (Int64.to_float
+                         (ri.Xenctrl.time1
+                         ++ ri.Xenctrl.time2
+                         ++ ri.Xenctrl.time5
+                         )
+                      /. 1.0e9
+                      )
+                   )
+                 ~description:
+                   "Fraction of time that at least one vCPU is runnable in the \
+                    domain"
+                 ~ty:Rrd.Derive ~default:false ~min:0.0 ~max:1.0 ()
+             )
           :: ( Rrd.VM uuid
              , Ds.ds_make
                  ~name:(Printf.sprintf "cpu_usage")