Skip to content

Commit 577186e

Browse files
authored
Improve other_operation_in_progress error users, fix handling of its delays (#6588)
`other_operation_in_progress` is reported inconsistently, with some users only specifying `[class; class_ref]`, and others also `op_type`. Make it consistent, changing all users to report `[class; class_ref; op_type; op_ref]`. This makes it much easier to navigate the logs, since the operation blocking progress is clearly referenced. Compare before, where it's unclear which operation is precluding progress: ``` Caught exception while marking SR for VDI.clone in message forwarder: OTHER_OPERATION_IN_PROGRESS: [ SR; OpaqueRef:d019f26a-b2e8-529b-bb66-fd4f008d4f82; VDI.clone ] ``` And after, with the ID of the operation present: ``` Caught exception while marking SR for VDI.clone in message forwarder: OTHER_OPERATION_IN_PROGRESS: [ SR; OpaqueRef:b0f54a40-485f-f48f-fdc7-6db28a64d3fd; scan; OpaqueRef:832fb22d-2ce5-0d26-8a00-a4165e78b34d ] ``` In addition, fix incorrect operation reporting, so that when progress is blocked only when particular types of operations are present in `current_operations`, these specific operations are reported, and not just any current operation. Also fix handling of `other_operation_in_progress` delays - some of the users would miss the `match` arm and use the non-interruptible `Thread.sleep` instead of the interruptible `Delay.sleep` that provides a mechanism for the blocking task to wake us up on its way out.
2 parents 2364685 + 5e895af commit 577186e

18 files changed

+326
-188
lines changed

ocaml/idl/datamodel_errors.ml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -538,7 +538,8 @@ let _ =
538538
~doc:"You attempted an operation on a VM which is not suspendable." () ;
539539
error Api_errors.vm_is_template ["vm"]
540540
~doc:"The operation attempted is not valid for a template VM" () ;
541-
error Api_errors.other_operation_in_progress ["class"; "object"]
541+
error Api_errors.other_operation_in_progress
542+
["class"; "object"; "operation_type"; "operation_ref"]
542543
~doc:"Another operation involving the object is currently in progress" () ;
543544
error Api_errors.vbd_not_removable_media ["vbd"]
544545
~doc:"Media could not be ejected because it is not removable" () ;

ocaml/tests/test_clustering.ml

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -581,11 +581,21 @@ let test_disallow_unplug_during_cluster_host_create () =
581581
let key = Context.get_task_id __context |> Ref.string_of in
582582
Db.Cluster.add_to_current_operations ~__context ~self:cluster ~key ~value
583583
in
584-
let check_disallow_unplug_false_fails self msg =
584+
let check_disallow_unplug_false_fails self op msg =
585+
let op_ref, _ =
586+
List.hd (Db.Cluster.get_current_operations ~__context ~self:cluster)
587+
in
585588
Alcotest.check_raises msg
586589
Api_errors.(
587590
Server_error
588-
(other_operation_in_progress, ["Cluster"; Ref.string_of cluster])
591+
( other_operation_in_progress
592+
, [
593+
"Cluster"
594+
; Ref.string_of cluster
595+
; API.cluster_operation_to_string op
596+
; op_ref
597+
]
598+
)
589599
)
590600
(fun () -> Xapi_pif.set_disallow_unplug ~__context ~self ~value:false)
591601
in
@@ -598,14 +608,14 @@ let test_disallow_unplug_during_cluster_host_create () =
598608
let test_with_current op =
599609
Xapi_pif.set_disallow_unplug ~__context ~self:pIF ~value:true ;
600610
add_op op ;
601-
check_disallow_unplug_false_fails pIF
611+
check_disallow_unplug_false_fails pIF op
602612
"disallow_unplug cannot be set to false during cluster_host creation or \
603613
enable on same PIF" ;
604614
let other_pif = T.make_pif ~__context ~network ~host () in
605615
check_successful_disallow_unplug true other_pif
606616
"Should always be able to set disallow_unplug:true regardless of \
607617
clustering operations" ;
608-
check_disallow_unplug_false_fails other_pif
618+
check_disallow_unplug_false_fails other_pif op
609619
"disallow_unplug cannot be set to false during cluster_host creation or \
610620
enable on any PIF" ;
611621
let key = Context.get_task_id __context |> Ref.string_of in

ocaml/xapi/helpers.ml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1685,7 +1685,7 @@ module Repeat_with_uniform_backoff : POLICY = struct
16851685
debug "Waiting for up to %f seconds before retrying..." this_timeout ;
16861686
let start = Unix.gettimeofday () in
16871687
( match e with
1688-
| Api_errors.Server_error (code, [cls; objref])
1688+
| Api_errors.Server_error (code, cls :: objref :: _)
16891689
when code = Api_errors.other_operation_in_progress ->
16901690
Early_wakeup.wait (cls, objref) this_timeout
16911691
| _ ->

ocaml/xapi/message_forwarding.ml

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5720,14 +5720,21 @@ functor
57205720
if Helpers.i_am_srmaster ~__context ~sr then
57215721
List.iter
57225722
(fun vdi ->
5723-
if Db.VDI.get_current_operations ~__context ~self:vdi <> []
5724-
then
5725-
raise
5726-
(Api_errors.Server_error
5727-
( Api_errors.other_operation_in_progress
5728-
, [Datamodel_common._vdi; Ref.string_of vdi]
5729-
)
5730-
)
5723+
match Db.VDI.get_current_operations ~__context ~self:vdi with
5724+
| (op_ref, op_type) :: _ ->
5725+
raise
5726+
(Api_errors.Server_error
5727+
( Api_errors.other_operation_in_progress
5728+
, [
5729+
Datamodel_common._vdi
5730+
; Ref.string_of vdi
5731+
; API.vdi_operations_to_string op_type
5732+
; op_ref
5733+
]
5734+
)
5735+
)
5736+
| [] ->
5737+
()
57315738
)
57325739
(Db.SR.get_VDIs ~__context ~self:sr) ;
57335740
SR.mark_sr ~__context ~sr ~doc ~op

ocaml/xapi/xapi_cluster_helpers.ml

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,19 +24,23 @@ let is_allowed_concurrently ~op:_ ~current_ops:_ =
2424
false
2525

2626
let report_concurrent_operations_error ~current_ops ~ref_str =
27-
let current_ops_str =
27+
let current_ops_ref_str, current_ops_str =
2828
let op_to_str = Record_util.cluster_operation_to_string in
29+
let ( >> ) f g x = g (f x) in
2930
match current_ops with
3031
| [] ->
3132
failwith "No concurrent operation to report"
32-
| [(_, cop)] ->
33-
op_to_str cop
33+
| [(op_ref, cop)] ->
34+
(op_ref, op_to_str cop)
3435
| l ->
35-
"{" ^ String.concat "," (List.map op_to_str (List.map snd l)) ^ "}"
36+
( Printf.sprintf "{%s}" (String.concat "," (List.map fst l))
37+
, Printf.sprintf "{%s}"
38+
(String.concat "," (List.map (snd >> op_to_str) l))
39+
)
3640
in
3741
Some
3842
( Api_errors.other_operation_in_progress
39-
, ["Cluster." ^ current_ops_str; ref_str]
43+
, ["Cluster"; ref_str; current_ops_str; current_ops_ref_str]
4044
)
4145

4246
(** Take an internal Cluster record and a proposed operation. Return None iff the operation

ocaml/xapi/xapi_cluster_host_helpers.ml

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,19 +22,23 @@ let is_allowed_concurrently ~op:_ ~current_ops:_ =
2222
false
2323

2424
let report_concurrent_operations_error ~current_ops ~ref_str =
25-
let current_ops_str =
25+
let current_ops_ref_str, current_ops_str =
2626
let op_to_str = Record_util.cluster_host_operation_to_string in
27+
let ( >> ) f g x = g (f x) in
2728
match current_ops with
2829
| [] ->
2930
failwith "No concurrent operation to report"
30-
| [(_, cop)] ->
31-
op_to_str cop
31+
| [(op_ref, cop)] ->
32+
(op_ref, op_to_str cop)
3233
| l ->
33-
"{" ^ String.concat "," (List.map op_to_str (List.map snd l)) ^ "}"
34+
( Printf.sprintf "{%s}" (String.concat "," (List.map fst l))
35+
, Printf.sprintf "{%s}"
36+
(String.concat "," (List.map (snd >> op_to_str) l))
37+
)
3438
in
3539
Some
3640
( Api_errors.other_operation_in_progress
37-
, ["Cluster_host." ^ current_ops_str; ref_str]
41+
, ["Cluster_host"; ref_str; current_ops_str; current_ops_ref_str]
3842
)
3943

4044
(** Take an internal Cluster_host record and a proposed operation. Return None iff the operation

ocaml/xapi/xapi_host_helpers.ml

Lines changed: 39 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ let all_operations = API.host_allowed_operations__all
3131
(** Returns a table of operations -> API error options (None if the operation would be ok) *)
3232
let valid_operations ~__context record _ref' =
3333
let _ref = Ref.string_of _ref' in
34-
let current_ops = List.map snd record.Db_actions.host_current_operations in
34+
let current_ops = record.Db_actions.host_current_operations in
3535
let table = Hashtbl.create 10 in
3636
List.iter (fun x -> Hashtbl.replace table x None) all_operations ;
3737
let set_errors (code : string) (params : string list)
@@ -49,40 +49,53 @@ let valid_operations ~__context record _ref' =
4949
let is_creating_new x = List.mem x [`provision; `vm_resume; `vm_migrate] in
5050
let is_removing x = List.mem x [`evacuate; `reboot; `shutdown] in
5151
let creating_new =
52-
List.fold_left (fun acc op -> acc || is_creating_new op) false current_ops
53-
in
54-
let removing =
55-
List.fold_left (fun acc op -> acc || is_removing op) false current_ops
52+
List.find_opt (fun (_, op) -> is_creating_new op) current_ops
5653
in
54+
let removing = List.find_opt (fun (_, op) -> is_removing op) current_ops in
5755
List.iter
5856
(fun op ->
59-
if (is_creating_new op && removing) || (is_removing op && creating_new)
60-
then
61-
set_errors Api_errors.other_operation_in_progress
62-
["host"; _ref; host_operation_to_string (List.hd current_ops)]
63-
[op]
57+
match (is_creating_new op, removing, is_removing op, creating_new) with
58+
| true, Some (op_ref, op_type), _, _ | _, _, true, Some (op_ref, op_type)
59+
->
60+
set_errors Api_errors.other_operation_in_progress
61+
["host"; _ref; host_operation_to_string op_type; op_ref]
62+
[op]
63+
| _ ->
64+
()
6465
)
6566
(List.filter (fun x -> x <> `power_on) all_operations) ;
6667
(* reboot, shutdown and apply_updates cannot run concurrently *)
67-
if List.mem `reboot current_ops then
68-
set_errors Api_errors.other_operation_in_progress
69-
["host"; _ref; host_operation_to_string `reboot]
70-
[`shutdown; `apply_updates] ;
71-
if List.mem `shutdown current_ops then
72-
set_errors Api_errors.other_operation_in_progress
73-
["host"; _ref; host_operation_to_string `shutdown]
74-
[`reboot; `apply_updates] ;
75-
if List.mem `apply_updates current_ops then
76-
set_errors Api_errors.other_operation_in_progress
77-
["host"; _ref; host_operation_to_string `apply_updates]
78-
[`reboot; `shutdown; `enable] ;
68+
Option.iter
69+
(fun (op_ref, _op_type) ->
70+
set_errors Api_errors.other_operation_in_progress
71+
["host"; _ref; host_operation_to_string `reboot; op_ref]
72+
[`shutdown; `apply_updates]
73+
)
74+
(List.find_opt (fun (_, op) -> op = `reboot) current_ops) ;
75+
Option.iter
76+
(fun (op_ref, _op_type) ->
77+
set_errors Api_errors.other_operation_in_progress
78+
["host"; _ref; host_operation_to_string `shutdown; op_ref]
79+
[`reboot; `apply_updates]
80+
)
81+
(List.find_opt (fun (_, op) -> op = `shutdown) current_ops) ;
82+
Option.iter
83+
(fun (op_ref, _op_type) ->
84+
set_errors Api_errors.other_operation_in_progress
85+
["host"; _ref; host_operation_to_string `apply_updates; op_ref]
86+
[`reboot; `shutdown; `enable]
87+
)
88+
(List.find_opt (fun (_, op) -> op = `apply_updates) current_ops) ;
7989
(* Prevent more than one provision happening at a time to prevent extreme dom0
8090
load (in the case of the debian template). Once the template becomes a 'real'
8191
template we can relax this. *)
82-
if List.mem `provision current_ops then
83-
set_errors Api_errors.other_operation_in_progress
84-
["host"; _ref; host_operation_to_string `provision]
85-
[`provision] ;
92+
Option.iter
93+
(fun (op_ref, _op_type) ->
94+
set_errors Api_errors.other_operation_in_progress
95+
["host"; _ref; host_operation_to_string `provision; op_ref]
96+
[`provision]
97+
)
98+
(List.find_opt (fun (_, op) -> op = `provision) current_ops) ;
8699
(* The host must be disabled before reboots or shutdowns are permitted *)
87100
if record.Db_actions.host_enabled then
88101
set_errors Api_errors.host_not_disabled []

ocaml/xapi/xapi_pif.ml

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -926,17 +926,25 @@ let assert_cluster_host_operation_not_in_progress ~__context =
926926
match Db.Cluster.get_all ~__context with
927927
| [] ->
928928
()
929-
| cluster :: _ ->
930-
let ops =
931-
Db.Cluster.get_current_operations ~__context ~self:cluster
932-
|> List.map snd
933-
in
934-
if List.mem `enable ops || List.mem `add ops then
935-
raise
936-
Api_errors.(
937-
Server_error
938-
(other_operation_in_progress, ["Cluster"; Ref.string_of cluster])
939-
)
929+
| cluster :: _ -> (
930+
let ops = Db.Cluster.get_current_operations ~__context ~self:cluster in
931+
match List.find_opt (fun (_, op) -> op = `enable || op = `add) ops with
932+
| Some (op_ref, op_type) ->
933+
raise
934+
Api_errors.(
935+
Server_error
936+
( other_operation_in_progress
937+
, [
938+
"Cluster"
939+
; Ref.string_of cluster
940+
; API.cluster_operation_to_string op_type
941+
; op_ref
942+
]
943+
)
944+
)
945+
| None ->
946+
()
947+
)
940948

941949
(* Block allowing unplug if
942950
- a cluster host is enabled on this PIF

ocaml/xapi/xapi_pool_helpers.ml

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ type validity = Unknown | Allowed | Disallowed of string * string list
9999
let compute_valid_operations ~__context record pool :
100100
API.pool_allowed_operations -> validity =
101101
let ref = Ref.string_of pool in
102-
let current_ops = List.map snd record.Db_actions.pool_current_operations in
102+
let current_ops = record.Db_actions.pool_current_operations in
103103
let table = (Hashtbl.create 32 : (all_operations, validity) Hashtbl.t) in
104104
let set_validity = Hashtbl.replace table in
105105
(* Start by assuming all operations are allowed. *)
@@ -118,30 +118,45 @@ let compute_valid_operations ~__context record pool :
118118
in
119119
List.iter populate ops
120120
in
121-
let other_operation_in_progress =
122-
(Api_errors.other_operation_in_progress, [Datamodel_common._pool; ref])
121+
let other_operation_in_progress waiting_op =
122+
let additional_info =
123+
match waiting_op with
124+
| Some (op_ref, op_type) ->
125+
[API.pool_allowed_operations_to_string op_type; op_ref]
126+
| _ ->
127+
[]
128+
in
129+
( Api_errors.other_operation_in_progress
130+
, [Datamodel_common._pool; ref] @ additional_info
131+
)
132+
in
133+
let is_current_op op =
134+
List.exists (fun (_, current_op) -> op = current_op) current_ops
123135
in
124-
let is_current_op = Fun.flip List.mem current_ops in
125136
let blocking =
126137
List.find_opt (fun (op, _) -> is_current_op op) blocking_ops_table
127138
in
128-
let waiting = List.find_opt is_current_op waiting_ops in
139+
let waiting =
140+
List.find_opt
141+
(fun (_, current_op) -> List.mem current_op waiting_ops)
142+
current_ops
143+
in
129144
( match (blocking, waiting) with
130-
| Some (_, reason), _ ->
145+
| Some (_, reason), waiting_current_op ->
131146
(* Mark all potentially blocking operations as invalid due
132147
to the specific blocking operation's "in progress" error. *)
133148
set_errors blocking_ops (reason, []) ;
134149
(* Mark all waiting operations as invalid for the generic
135150
"OTHER_OPERATION_IN_PROGRESS" reason. *)
136-
set_errors waiting_ops other_operation_in_progress
151+
set_errors waiting_ops (other_operation_in_progress waiting_current_op)
137152
(* Note that all_operations ⊆ blocking_ops ∪ waiting_ops, so this
138153
invalidates all operations (with the reason partitioned
139154
between whether the operation is blocking or waiting). *)
140-
| None, Some _ ->
155+
| None, (Some _ as waiting_current_op) ->
141156
(* If there's no blocking operation in current operations, but
142157
there is a waiting operation, invalidate all operations for the
143158
generic reason. Again, this covers every operation. *)
144-
set_errors all_operations other_operation_in_progress
159+
set_errors all_operations (other_operation_in_progress waiting_current_op)
145160
| None, None -> (
146161
(* If there's no blocking or waiting operation in current
147162
operations (i.e. current operations is empty), we can report

ocaml/xapi/xapi_sr_operations.ml

Lines changed: 26 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -200,24 +200,35 @@ let valid_operations ~__context ?op record _ref' : table =
200200
let check_parallel_ops ~__context _record =
201201
let safe_to_parallelise = [`plug] in
202202
let current_ops =
203-
Xapi_stdext_std.Listext.List.setify (List.map snd current_ops)
203+
List.sort_uniq
204+
(fun (_ref1, op1) (_ref2, op2) -> compare op1 op2)
205+
current_ops
204206
in
205207
(* If there are any current operations, all the non_parallelisable operations
206208
must definitely be stopped *)
207-
if current_ops <> [] then
208-
set_errors Api_errors.other_operation_in_progress
209-
["SR"; _ref; sr_operation_to_string (List.hd current_ops)]
210-
(Xapi_stdext_std.Listext.List.set_difference all_ops safe_to_parallelise) ;
211-
let all_are_parallelisable =
212-
List.fold_left ( && ) true
213-
(List.map (fun op -> List.mem op safe_to_parallelise) current_ops)
214-
in
215-
(* If not all are parallelisable (eg a vdi_resize), ban the otherwise
216-
parallelisable operations too *)
217-
if not all_are_parallelisable then
218-
set_errors Api_errors.other_operation_in_progress
219-
["SR"; _ref; sr_operation_to_string (List.hd current_ops)]
220-
safe_to_parallelise
209+
match current_ops with
210+
| (current_op_ref, current_op_type) :: _ ->
211+
set_errors Api_errors.other_operation_in_progress
212+
["SR"; _ref; sr_operation_to_string current_op_type; current_op_ref]
213+
(Xapi_stdext_std.Listext.List.set_difference all_ops
214+
safe_to_parallelise
215+
) ;
216+
let non_parallelisable_op =
217+
List.find_opt
218+
(fun (_, op) -> not (List.mem op safe_to_parallelise))
219+
current_ops
220+
in
221+
(* If not all are parallelisable (eg a vdi_resize), ban the otherwise
222+
parallelisable operations too *)
223+
Option.iter
224+
(fun (op_ref, op_type) ->
225+
set_errors Api_errors.other_operation_in_progress
226+
["SR"; _ref; sr_operation_to_string op_type; op_ref]
227+
safe_to_parallelise
228+
)
229+
non_parallelisable_op
230+
| [] ->
231+
()
221232
in
222233
let check_cluster_stack_compatible ~__context _record =
223234
(* Check whether there are any conflicts with HA that prevent us from

0 commit comments

Comments
 (0)