Skip to content

Commit dac7a66

Browse files
committed
CA-409482: Using computed delay for RRD loop
The RRD loop is executed every 5 seconds. It delays a fixed 5 seconds between iterations, but the loop itself also consumes time (how much depends on the number of CPUs; with many CPUs it may take hundreds of milliseconds). As a result, the RRD loop drifts by an increasing offset over successive iterations, until eventually one RRD data point is lost and a gap can be observed on the XenCenter performance graph. The solution is to use a fixed deadline as the start time of each iteration, and to use a computed delay (timeslice - time consumed by the loop) instead of a fixed delay. Signed-off-by: Bengang Yuan <[email protected]>
1 parent 18e8584 commit dac7a66

File tree

4 files changed

+61
-15
lines changed

4 files changed

+61
-15
lines changed

ocaml/xcp-rrdd/bin/rrdd/dune

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@
4646
http_lib
4747
httpsvr
4848
inotify
49+
mtime
50+
mtime.clock
4951
rpclib.core
5052
rpclib.json
5153
rpclib.xml

ocaml/xcp-rrdd/bin/rrdd/rrdd_server.ml

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -715,9 +715,25 @@ module Plugin = struct
715715
specified unique ID. If the plugin is not registered, -1 is returned. *)
716716
let next_reading (uid : P.uid) : float =
717717
let open Rrdd_shared in
718+
let ( --- ) a b = Mtime.Span.abs_diff a b in
718719
if with_lock registered_m (fun _ -> Hashtbl.mem registered uid) then
719-
with_lock last_loop_end_time_m (fun _ ->
720-
!last_loop_end_time +. !timeslice -. Unix.gettimeofday ()
720+
with_lock next_iteration_start_m (fun _ ->
721+
let current = Mtime_clock.count from_loop_start in
722+
let diff =
723+
Mtime.Span.to_float_ns
724+
(!Rrdd_shared.next_iteration_start
725+
--- Mtime_clock.count from_loop_start
726+
)
727+
/. 1_000_000_000.
728+
in
729+
if
730+
Mtime.Span.is_longer
731+
~than:!Rrdd_shared.next_iteration_start
732+
current
733+
then
734+
diff *. -1.
735+
else
736+
diff
721737
)
722738
else
723739
-1.

ocaml/xcp-rrdd/bin/rrdd/rrdd_shared.ml

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,17 @@ module StringSet = Set.Make (String)
2020
(* Whether to enable all non-default datasources *)
2121
let enable_all_dss = ref false
2222

23-
(* The time between each monitoring loop. *)
24-
let timeslice : float ref = ref 5.
23+
(* The expected time span between each monitoring loop. *)
24+
let timeslice : Mtime.span ref = ref Mtime.Span.(5 * s)
2525

26-
(* Timestamp of the last monitoring loop end. *)
27-
let last_loop_end_time : float ref = ref neg_infinity
26+
(* The counter since the start of all monitoring loops. *)
27+
let from_loop_start : Mtime_clock.counter = Mtime_clock.counter ()
2828

29-
(* The mutex that protects the last_loop_end_time against data corruption. *)
30-
let last_loop_end_time_m : Mutex.t = Mutex.create ()
29+
(* The time span, counted from from_loop_start, at which the next monitoring loop iteration begins. *)
30+
let next_iteration_start : Mtime.Span.t ref = ref Mtime.Span.zero
31+
32+
(* The mutex that protects the next_iteration_start against data corruption. *)
33+
let next_iteration_start_m : Mutex.t = Mutex.create ()
3134

3235
(** Cache memory/target values *)
3336
let memory_targets : (int, int64) Hashtbl.t = Hashtbl.create 20

ocaml/xcp-rrdd/bin/rrdd/xcp_rrdd.ml

Lines changed: 32 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -535,21 +535,46 @@ let monitor_write_loop writers =
535535
Debug.with_thread_named "monitor_write"
536536
(fun () ->
537537
Xenctrl.with_intf (fun xc ->
538+
let ( --- ) = Mtime.Span.abs_diff in
539+
let ( +++ ) = Mtime.Span.add in
538540
while true do
539541
try
540542
do_monitor_write xc writers ;
541-
with_lock Rrdd_shared.last_loop_end_time_m (fun _ ->
542-
Rrdd_shared.last_loop_end_time := Unix.gettimeofday ()
543+
with_lock Rrdd_shared.next_iteration_start_m (fun _ ->
544+
Rrdd_shared.next_iteration_start :=
545+
!Rrdd_shared.next_iteration_start +++ !Rrdd_shared.timeslice
543546
) ;
544-
Thread.delay !Rrdd_shared.timeslice
547+
let last_iteration_end =
548+
Mtime_clock.count Rrdd_shared.from_loop_start
549+
in
550+
let time_in_iteration =
551+
!Rrdd_shared.next_iteration_start --- last_iteration_end
552+
in
553+
if
554+
Mtime.Span.is_longer
555+
~than:!Rrdd_shared.next_iteration_start
556+
last_iteration_end
557+
then
558+
warn
559+
"%s: Monitor write iteration took (%a), this is longer than \
560+
a full cycle, skipping the delay"
561+
__FUNCTION__ Debug.Pp.mtime_span
562+
(time_in_iteration +++ !Rrdd_shared.timeslice)
563+
else
564+
Thread.delay
565+
(Mtime.Span.to_float_ns time_in_iteration /. 1_000_000_000.)
545566
with e ->
546567
Backtrace.is_important e ;
547568
warn
548-
"Monitor/write thread caught an exception. Pausing for 10s, \
549-
then restarting: %s"
550-
(Printexc.to_string e) ;
569+
"%s: Monitor/write thread caught an exception. Pausing for \
570+
10s, then restarting: %s"
571+
__FUNCTION__ (Printexc.to_string e) ;
551572
log_backtrace e ;
552-
Thread.delay 10.
573+
Thread.delay 10. ;
574+
with_lock Rrdd_shared.next_iteration_start_m (fun _ ->
575+
Rrdd_shared.next_iteration_start :=
576+
!Rrdd_shared.next_iteration_start +++ Mtime.Span.(10 * s)
577+
)
553578
done
554579
)
555580
)

0 commit comments

Comments
 (0)