Commit fd49f35

rrdd: Avoid missing aggregation of metrics from newly destroyed domains
Currently rrdd needs to know when a metric comes from a newly created domain (after a local migration, for example), because when a new domain is created its counters start from zero again. This needs special logic when aggregating metrics, since xcp-rrdd must provide continuity of a VM's metrics by UUID even if the domid changes.

Previously rrdd fetched the data about domains before metrics from plugins were collected, and reused that data for self-reported metrics. While this meant that self-reported metrics could never miss collected information, for plugin metrics it meant that the mapping between domain id and VM UUID was not available for newly created and destroyed domains.

With the current change, the domain ids and VM UUIDs are collected on every iteration of the monitor loop and kept for one more iteration, so domains destroyed during the last iteration are remembered and not missed. With this done, it is now safe to move the host and memory metrics collection into its own plugin.

Also use sequences more thoroughly in the code for transformations.

Signed-off-by: Pau Ruiz Safont <[email protected]>
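The heart of the fix is carrying the previous iteration's domain snapshot into the next one. Below is a minimal, stdlib-only sketch of that idea; snapshot and update_rrds are hypothetical stand-ins for domain_snapshot and Rrdd_monitor.update_rrds, and StringMap stands in for Rrd.StringMap:

(* Keep the previous snapshot in a ref; merge it with the current one so a
   domain destroyed since the last pass still resolves to its VM UUID for
   exactly one extra iteration. *)
module StringMap = Map.Make (String)

let monitor_loop (snapshot : unit -> ('dom * string * int) Seq.t) update_rrds =
  let previous = ref Seq.empty in
  while true do
    let current = snapshot () in
    let uuid_domids =
      Seq.append !previous current
      |> Seq.map (fun (_, uuid, domid) -> (uuid, domid))
      |> StringMap.of_seq
    in
    update_rrds uuid_domids ;
    previous := current
  done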
1 parent: b8aa4a7

3 files changed: +140 −138
ocaml/xcp-rrdd/bin/rrdd/rrdd_monitor.ml

Lines changed: 0 additions & 1 deletion
@@ -156,7 +156,6 @@ let convert_to_owner_map dss =
      weren't updated on this refresh cycle.
   *)
 let update_rrds uuid_domids plugins_dss =
-  let uuid_domids = List.to_seq uuid_domids |> StringMap.of_seq in
   let per_owner_flattened_map, per_plugin_map =
     convert_to_owner_map plugins_dss
   in
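With the conversion deleted here, update_rrds now expects callers to pass a Rrd.StringMap directly. A stdlib-only sketch of the new call-site obligation, where StringMap stands in for Rrd.StringMap and call_update_rrds is a hypothetical caller:

(* The caller builds the UUID -> domid map itself, which is what allows it
   to merge snapshots from consecutive iterations before this call. *)
module StringMap = Map.Make (String)

let call_update_rrds update_rrds uuid_domid_assoc plugins_dss =
  let uuid_domids = StringMap.of_seq (List.to_seq uuid_domid_assoc) in
  update_rrds uuid_domids plugins_dss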

ocaml/xcp-rrdd/bin/rrdd/xcp_rrdd.ml

Lines changed: 130 additions & 128 deletions
@@ -255,85 +255,124 @@ let mem_available () =
   let* size, kb = scan "/proc/meminfo" in
   match kb with "kB" -> ok size | _ -> res_error "unexpected unit: %s" kb
 
-let dss_mem_vms doms =
-  List.fold_left
-    (fun acc (dom, uuid, domid) ->
-      let add_vm_metrics () =
-        let kib =
-          Xenctrl.pages_to_kib
-            (Int64.of_nativeint dom.Xenctrl.total_memory_pages)
-        in
-        let memory = Int64.mul kib 1024L in
-        let main_mem_ds =
-          ( Rrd.VM uuid
-          , Ds.ds_make ~name:"memory"
-              ~description:"Memory currently allocated to VM" ~units:"B"
-              ~value:(Rrd.VT_Int64 memory) ~ty:Rrd.Gauge ~min:0.0 ~default:true
-              ()
-          )
-        in
-        let memory_target_opt =
-          with_lock Rrdd_shared.memory_targets_m (fun _ ->
-              Hashtbl.find_opt Rrdd_shared.memory_targets domid
-          )
-        in
-        let mem_target_ds =
-          Option.map
-            (fun memory_target ->
-              ( Rrd.VM uuid
-              , Ds.ds_make ~name:"memory_target"
-                  ~description:"Target of VM balloon driver" ~units:"B"
-                  ~value:(Rrd.VT_Int64 memory_target) ~ty:Rrd.Gauge ~min:0.0
-                  ~default:true ()
+let uuid_blacklist = ["00000000-0000-0000"; "deadbeef-dead-beef"]
+
+module IntSet = Set.Make (Int)
+
+let domain_snapshot xc =
+  let metadata_of_domain dom =
+    let ( let* ) = Option.bind in
+    let* uuid_raw = Uuidx.of_int_array dom.Xenctrl.handle in
+    let uuid = Uuidx.to_string uuid_raw in
+    let domid = dom.Xenctrl.domid in
+    let start = String.sub uuid 0 18 in
+    (* Actively hide migrating VM uuids, these are temporary and xenops writes
+       the original and the final uuid to xenstore *)
+    let uuid_from_key key =
+      let path = Printf.sprintf "/vm/%s/%s" uuid key in
+      try Ezxenstore_core.Xenstore.(with_xs (fun xs -> xs.read path))
+      with Xs_protocol.Enoent _hint ->
+        info "Couldn't read path %s; falling back to actual uuid" path ;
+        uuid
+    in
+    let stable_uuid = Option.fold ~none:uuid ~some:uuid_from_key in
+    if List.mem start uuid_blacklist then
+      None
+    else
+      let key =
+        if Astring.String.is_suffix ~affix:"000000000000" uuid then
+          Some "origin-uuid"
+        else if Astring.String.is_suffix ~affix:"000000000001" uuid then
+          Some "final-uuid"
+        else
+          None
+      in
+      Some (dom, stable_uuid key, domid)
+  in
+  let domains =
+    Xenctrl.domain_getinfolist xc 0 |> List.filter_map metadata_of_domain
+  in
+  let domids = List.map (fun (_, _, i) -> i) domains |> IntSet.of_list in
+  let domains_only k v = Option.map (Fun.const v) (IntSet.find_opt k domids) in
+  Hashtbl.filter_map_inplace domains_only Rrdd_shared.memory_targets ;
+  domains |> List.to_seq
+
+let dss_mem_vms xc =
+  let mem_metrics_of (dom, uuid, domid) =
+    let vm_metrics () =
+      let kib =
+        Xenctrl.pages_to_kib (Int64.of_nativeint dom.Xenctrl.total_memory_pages)
+      in
+      let memory = Int64.mul kib 1024L in
+      let main_mem_ds =
+        ( Rrd.VM uuid
+        , Ds.ds_make ~name:"memory"
+            ~description:"Memory currently allocated to VM" ~units:"B"
+            ~value:(Rrd.VT_Int64 memory) ~ty:Rrd.Gauge ~min:0.0 ~default:true ()
+        )
+      in
+      let memory_target_opt =
+        with_lock Rrdd_shared.memory_targets_m (fun _ ->
+            Hashtbl.find_opt Rrdd_shared.memory_targets domid
+        )
+      in
+      let mem_target_ds =
+        Option.map
+          (fun memory_target ->
+            ( Rrd.VM uuid
+            , Ds.ds_make ~name:"memory_target"
+                ~description:"Target of VM balloon driver" ~units:"B"
+                ~value:(Rrd.VT_Int64 memory_target) ~ty:Rrd.Gauge ~min:0.0
+                ~default:true ()
              )
-            memory_target_opt
-        in
-        let other_ds =
-          if domid = 0 then
-            match mem_available () with
-            | Ok mem ->
-                Some
-                  ( Rrd.VM uuid
-                  , Ds.ds_make ~name:"memory_internal_free" ~units:"KiB"
-                      ~description:"Dom0 current free memory"
-                      ~value:(Rrd.VT_Int64 mem) ~ty:Rrd.Gauge ~min:0.0
-                      ~default:true ()
-                  )
-            | Error msg ->
-                let _ =
-                  error "%s: retrieving Dom0 free memory failed: %s"
-                    __FUNCTION__ msg
-                in
-                None
-          else
-            try
-              let mem_free =
-                Watch.IntMap.find domid !current_meminfofree_values
-              in
+          )
+          memory_target_opt
+      in
+      let other_ds =
+        if domid = 0 then
+          match mem_available () with
+          | Ok mem ->
              Some
                ( Rrd.VM uuid
                , Ds.ds_make ~name:"memory_internal_free" ~units:"KiB"
-                    ~description:"Memory used as reported by the guest agent"
-                    ~value:(Rrd.VT_Int64 mem_free) ~ty:Rrd.Gauge ~min:0.0
+                    ~description:"Dom0 current free memory"
+                    ~value:(Rrd.VT_Int64 mem) ~ty:Rrd.Gauge ~min:0.0
                    ~default:true ()
                )
-            with Not_found -> None
-        in
+          | Error msg ->
+              let _ =
+                error "%s: retrieving Dom0 free memory failed: %s" __FUNCTION__
+                  msg
+              in
+              None
+        else
+          try
+            let mem_free =
+              Watch.IntMap.find domid !current_meminfofree_values
+            in
+            Some
+              ( Rrd.VM uuid
+              , Ds.ds_make ~name:"memory_internal_free" ~units:"KiB"
+                  ~description:"Memory used as reported by the guest agent"
+                  ~value:(Rrd.VT_Int64 mem_free) ~ty:Rrd.Gauge ~min:0.0
+                  ~default:true ()
+              )
+          with Not_found -> None
+      in
+      let metrics =
        List.concat
-          [
-            main_mem_ds :: Option.to_list other_ds
-          ; Option.to_list mem_target_ds
-          ; acc
-          ]
+          [main_mem_ds :: Option.to_list other_ds; Option.to_list mem_target_ds]
      in
-      (* CA-34383: Memory updates from paused domains serve no useful purpose.
-         During a migrate such updates can also cause undesirable
-         discontinuities in the observed value of memory_actual. Hence, we
-         ignore changes from paused domains: *)
-      if dom.Xenctrl.paused then acc else add_vm_metrics ()
-    )
-    [] doms
+      Some (List.to_seq metrics)
+    in
+    (* CA-34383: Memory updates from paused domains serve no useful purpose.
+       During a migrate such updates can also cause undesirable
+       discontinuities in the observed value of memory_actual. Hence, we
+       ignore changes from paused domains: *)
+    if dom.Xenctrl.paused then None else vm_metrics ()
+  in
+  let domains = domain_snapshot xc in
+  Seq.filter_map mem_metrics_of domains |> Seq.concat |> List.of_seq
 
 (**** Local cache SR stuff *)
 
@@ -438,62 +477,18 @@ let handle_exn log f default =
       (Printexc.to_string e) ;
     default
 
-let uuid_blacklist = ["00000000-0000-0000"; "deadbeef-dead-beef"]
-
-module IntSet = Set.Make (Int)
-
-let domain_snapshot xc =
-  let metadata_of_domain dom =
-    let ( let* ) = Option.bind in
-    let* uuid_raw = Uuidx.of_int_array dom.Xenctrl.handle in
-    let uuid = Uuidx.to_string uuid_raw in
-    let domid = dom.Xenctrl.domid in
-    let start = String.sub uuid 0 18 in
-    (* Actively hide migrating VM uuids, these are temporary and xenops writes
-       the original and the final uuid to xenstore *)
-    let uuid_from_key key =
-      let path = Printf.sprintf "/vm/%s/%s" uuid key in
-      try Ezxenstore_core.Xenstore.(with_xs (fun xs -> xs.read path))
-      with Xs_protocol.Enoent _hint ->
-        info "Couldn't read path %s; falling back to actual uuid" path ;
-        uuid
-    in
-    let stable_uuid = Option.fold ~none:uuid ~some:uuid_from_key in
-    if List.mem start uuid_blacklist then
-      None
-    else
-      let key =
-        if Astring.String.is_suffix ~affix:"000000000000" uuid then
-          Some "origin-uuid"
-        else if Astring.String.is_suffix ~affix:"000000000001" uuid then
-          Some "final-uuid"
-        else
-          None
-      in
-      Some (dom, stable_uuid key, domid)
-  in
-  let domains =
-    Xenctrl.domain_getinfolist xc 0 |> List.filter_map metadata_of_domain
-  in
-  let domids = List.map (fun (_, _, i) -> i) domains |> IntSet.of_list in
-  let domains_only k v = Option.map (Fun.const v) (IntSet.find_opt k domids) in
-  Hashtbl.filter_map_inplace domains_only Rrdd_shared.memory_targets ;
-  domains
-
 let dom0_stat_generators =
   [
-    ("ha", fun _ _ _ -> Rrdd_ha_stats.all ())
-  ; ("mem_host", fun xc _ _ -> dss_mem_host xc)
-  ; ("mem_vms", fun _ _ domains -> dss_mem_vms domains)
-  ; ("cache", fun _ timestamp _ -> dss_cache timestamp)
+    ("ha", fun _ _ -> Rrdd_ha_stats.all ())
+  ; ("mem_host", fun xc _ -> dss_mem_host xc)
+  ; ("mem_vms", fun xc _ -> dss_mem_vms xc)
+  ; ("cache", fun _ timestamp -> dss_cache timestamp)
   ]
 
-let generate_all_dom0_stats xc domains =
+let generate_all_dom0_stats xc =
   let handle_generator (name, generator) =
     let timestamp = Unix.gettimeofday () in
-    ( name
-    , (timestamp, handle_exn name (fun _ -> generator xc timestamp domains) [])
-    )
+    (name, (timestamp, handle_exn name (fun _ -> generator xc timestamp) []))
   in
   List.map handle_generator dom0_stat_generators
 
@@ -510,10 +505,9 @@ let write_dom0_stats writers tagged_dss =
   in
   List.iter write_dss writers
 
-let do_monitor_write xc writers =
+let do_monitor_write domains_before xc writers =
   Rrdd_libs.Stats.time_this "monitor" (fun _ ->
-      let domains = domain_snapshot xc in
-      let tagged_dom0_stats = generate_all_dom0_stats xc domains in
+      let tagged_dom0_stats = generate_all_dom0_stats xc in
       write_dom0_stats writers tagged_dom0_stats ;
      let dom0_stats =
        tagged_dom0_stats
@@ -523,26 +517,34 @@ let do_monitor_write xc writers =
           )
      in
      let plugins_stats = Rrdd_server.Plugin.read_stats () in
+      let domains_after = domain_snapshot xc in
      let stats = Seq.append plugins_stats dom0_stats in
      Rrdd_stats.print_snapshot () ;
-      let uuid_domids = List.map (fun (_, u, i) -> (u, i)) domains in
-
+      (* merge the domain ids from the previous iteration and the current one
+         to avoid missing updates *)
+      let uuid_domids =
+        Seq.append domains_before domains_after
+        |> Seq.map (fun (_, u, i) -> (u, i))
+        |> Rrd.StringMap.of_seq
+      in
      (* stats are grouped per plugin, which provides its timestamp *)
      Rrdd_monitor.update_rrds uuid_domids stats ;
 
      Rrdd_libs.Constants.datasource_dump_file
      |> Rrdd_server.dump_host_dss_to_file ;
      Rrdd_libs.Constants.datasource_vm_dump_file
-      |> Rrdd_server.dump_vm_dss_to_file
+      |> Rrdd_server.dump_vm_dss_to_file ;
+      domains_after
  )
 
 let monitor_write_loop writers =
   Debug.with_thread_named "monitor_write"
     (fun () ->
       Xenctrl.with_intf (fun xc ->
+          let domains = ref Seq.empty in
          while true do
            try
-              do_monitor_write xc writers ;
+              domains := do_monitor_write !domains xc writers ;
              with_lock Rrdd_shared.next_iteration_start_m (fun _ ->
                  Rrdd_shared.next_iteration_start :=
                    Clock.Timer.extend_by !Rrdd_shared.timeslice
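A subtlety of the merge in do_monitor_write: Map.of_seq adds bindings in sequence order, so when the same UUID appears in both snapshots, the binding from domains_after (appended second) wins and the freshest domid is kept. A stdlib-only check of that assumption, with StringMap standing in for Rrd.StringMap:

(* Later bindings override earlier ones in of_seq, so the current
   iteration's domid takes precedence over the previous one. *)
module StringMap = Map.Make (String)

let () =
  let before = List.to_seq [("vm-uuid", 3)] (* previous iteration *) in
  let after = List.to_seq [("vm-uuid", 7)] (* current iteration *) in
  let merged = StringMap.of_seq (Seq.append before after) in
  assert (StringMap.find "vm-uuid" merged = 7)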

ocaml/xcp-rrdd/test/rrdd/test_rrdd_monitor.ml

Lines changed: 10 additions & 9 deletions
@@ -74,60 +74,61 @@ let update_rrds_test ~timestamp ~dss ~uuid_domids ~expected_vm_rrds
 
 let update_rrds =
   let open Rrd in
+  let map_of_list ls = StringMap.of_seq (List.to_seq ls) in
   [
     ( "Null update"
-    , update_rrds_test ~timestamp:0. ~dss:[] ~uuid_domids:[]
+    , update_rrds_test ~timestamp:0. ~dss:[] ~uuid_domids:StringMap.empty
         ~expected_vm_rrds:[] ~expected_sr_rrds:[] ~expected_host_dss:[]
    )
  ; ( "Single host update"
    , update_rrds_test ~timestamp:0.
        ~dss:[(Host, ds_a)]
-        ~uuid_domids:[] ~expected_vm_rrds:[] ~expected_sr_rrds:[]
+        ~uuid_domids:StringMap.empty ~expected_vm_rrds:[] ~expected_sr_rrds:[]
        ~expected_host_dss:[("host", [ds_a])]
    )
  ; ( "Multiple host updates"
    , update_rrds_test ~timestamp:0.
        ~dss:[(Host, ds_a); (Host, ds_b)]
-        ~uuid_domids:[] ~expected_vm_rrds:[] ~expected_sr_rrds:[]
+        ~uuid_domids:StringMap.empty ~expected_vm_rrds:[] ~expected_sr_rrds:[]
        ~expected_host_dss:[("host", [ds_a; ds_b])]
    )
  ; ( "Single non-resident VM update"
    , update_rrds_test ~timestamp:0.
        ~dss:[(VM "a", ds_a)]
-        ~uuid_domids:[] ~expected_vm_rrds:[] ~expected_sr_rrds:[]
+        ~uuid_domids:StringMap.empty ~expected_vm_rrds:[] ~expected_sr_rrds:[]
        ~expected_host_dss:[]
    )
  ; ( "Multiple non-resident VM updates"
    , update_rrds_test ~timestamp:0.
        ~dss:[(VM "a", ds_a); (VM "b", ds_a)]
-        ~uuid_domids:[] ~expected_vm_rrds:[] ~expected_sr_rrds:[]
+        ~uuid_domids:StringMap.empty ~expected_vm_rrds:[] ~expected_sr_rrds:[]
        ~expected_host_dss:[]
    )
  ; ( "Single resident VM update"
    , update_rrds_test ~timestamp:0.
        ~dss:[(VM "a", ds_a)]
-        ~uuid_domids:[("a", 1)]
+        ~uuid_domids:(map_of_list [("a", 1)])
        ~expected_vm_rrds:[("a", [ds_a])]
        ~expected_sr_rrds:[] ~expected_host_dss:[]
    )
  ; ( "Multiple resident VM updates"
    , update_rrds_test ~timestamp:0.
        ~dss:[(VM "a", ds_a); (VM "b", ds_a); (VM "b", ds_b)]
-        ~uuid_domids:[("a", 1); ("b", 1)]
+        ~uuid_domids:(map_of_list [("a", 1); ("b", 1)])
        ~expected_vm_rrds:[("a", [ds_a]); ("b", [ds_a; ds_b])]
        ~expected_sr_rrds:[] ~expected_host_dss:[]
    )
  ; ( "Multiple resident and non-resident VM updates"
    , update_rrds_test ~timestamp:0.
        ~dss:[(VM "a", ds_a); (VM "b", ds_a); (VM "c", ds_a)]
-        ~uuid_domids:[("a", 1); ("b", 1)]
+        ~uuid_domids:(map_of_list [("a", 1); ("b", 1)])
        ~expected_vm_rrds:[("a", [ds_a]); ("b", [ds_a])]
        ~expected_sr_rrds:[] ~expected_host_dss:[]
    )
  ; ( "Multiple SR updates"
    , update_rrds_test ~timestamp:0.
        ~dss:[(SR "a", ds_a); (SR "b", ds_a); (SR "b", ds_b)]
-        ~uuid_domids:[] ~expected_vm_rrds:[]
+        ~uuid_domids:StringMap.empty ~expected_vm_rrds:[]
        ~expected_sr_rrds:[("a", [ds_a]); ("b", [ds_a; ds_b])]
        ~expected_host_dss:[]
    )
