Skip to content

Commit 7c59c66

Browse files
authored
CA-404062: Wrongly restart xapi when receiving HTTP errors (#6205)
(This is a backport of a [PR](#6201) against the master branch.) The xapi on a supporter host would restart when it received an HTTP error from the xapi on the coordinator host. This breaks the pool.designate_new_master use case for a big pool, e.g. a 64-host pool. In this case, some supporters may restart unexpectedly during the phase of committing the new coordinator due to the logic above. Additionally, the rationale for this logic, as explained by the error message, is also incorrect: not all HTTP errors are caused by "our master address is wrong". On the other hand, if a use case requires restarting xapi, more explicit logic should ensure that, instead of relying on an implicit HTTP error code. Furthermore, if a supporter is indeed connecting to a wrong coordinator, this is a bug and can be recovered from manually. Based on the above arguments, restarting xapi after receiving an HTTP error is removed. This also follows the TODO concluded in CA-36936.
2 parents 5f22d15 + 2671869 commit 7c59c66

File tree

1 file changed

+54
-58
lines changed

1 file changed

+54
-58
lines changed

ocaml/database/master_connection.ml

Lines changed: 54 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,55 @@ let do_db_xml_rpc_persistent_with_reopen ~host ~path (req : string) :
214214
else if !backoff_delay > 256.0 then
215215
backoff_delay := 256.0
216216
in
217+
let reconnect () =
218+
(* RPC failed - there's no way we can recover from this so try reopening connection every 2s + backoff delay *)
219+
( match !my_connection with
220+
| None ->
221+
()
222+
| Some st_proc -> (
223+
my_connection := None ;
224+
(* don't want to try closing multiple times *)
225+
try Stunnel.disconnect st_proc with _ -> ()
226+
)
227+
) ;
228+
let time_sofar = Unix.gettimeofday () -. time_call_started in
229+
if !connection_timeout < 0. then (
230+
if not !surpress_no_timeout_logs then (
231+
debug
232+
"Connection to master died. I will continue to retry indefinitely \
233+
(supressing future logging of this message)." ;
234+
error
235+
"Connection to master died. I will continue to retry indefinitely \
236+
(supressing future logging of this message)."
237+
) ;
238+
surpress_no_timeout_logs := true
239+
) else
240+
debug
241+
"Connection to master died: time taken so far in this call '%f'; will \
242+
%s"
243+
time_sofar
244+
( if !connection_timeout < 0. then
245+
"never timeout"
246+
else
247+
Printf.sprintf "timeout after '%f'" !connection_timeout
248+
) ;
249+
if time_sofar > !connection_timeout && !connection_timeout >= 0. then
250+
if !restart_on_connection_timeout then (
251+
debug "Exceeded timeout for retrying master connection: restarting xapi" ;
252+
!Db_globs.restart_fn ()
253+
) else (
254+
debug
255+
"Exceeded timeout for retrying master connection: raising \
256+
Cannot_connect_to_master" ;
257+
raise Cannot_connect_to_master
258+
) ;
259+
debug "Sleeping %f seconds before retrying master connection..."
260+
!backoff_delay ;
261+
Thread.delay !backoff_delay ;
262+
update_backoff_delay () ;
263+
try open_secure_connection () with _ -> ()
264+
(* oh well, maybe nextime... *)
265+
in
217266
while not !write_ok do
218267
try
219268
let req_string = req in
@@ -259,66 +308,13 @@ let do_db_xml_rpc_persistent_with_reopen ~host ~path (req : string) :
259308
Db_globs.http_limit_max_rpc_size ;
260309
debug "Re-raising exception to caller." ;
261310
raise Http_svr.Client_requested_size_over_limit
262-
(* TODO: This http exception handler caused CA-36936 and can probably be removed now that there's backoff delay in the generic handler _ below *)
263311
| Http_client.Http_error (http_code, err_msg) ->
264-
error
265-
"Received HTTP error %s (%s) from master. This suggests our master \
266-
address is wrong. Sleeping for %.0fs and then executing restart_fn."
267-
http_code err_msg
268-
!Db_globs.permanent_master_failure_retry_interval ;
269-
Thread.delay !Db_globs.permanent_master_failure_retry_interval ;
270-
!Db_globs.restart_fn ()
271-
| e -> (
312+
error "Received HTTP error %s (%s) from the coordinator." http_code
313+
err_msg ;
314+
reconnect ()
315+
| e ->
272316
error "Caught %s" (Printexc.to_string e) ;
273-
(* RPC failed - there's no way we can recover from this so try reopening connection every 2s + backoff delay *)
274-
( match !my_connection with
275-
| None ->
276-
()
277-
| Some st_proc -> (
278-
my_connection := None ;
279-
(* don't want to try closing multiple times *)
280-
try Stunnel.disconnect st_proc with _ -> ()
281-
)
282-
) ;
283-
let time_sofar = Unix.gettimeofday () -. time_call_started in
284-
if !connection_timeout < 0. then (
285-
if not !surpress_no_timeout_logs then (
286-
debug
287-
"Connection to master died. I will continue to retry \
288-
indefinitely (supressing future logging of this message)." ;
289-
error
290-
"Connection to master died. I will continue to retry \
291-
indefinitely (supressing future logging of this message)."
292-
) ;
293-
surpress_no_timeout_logs := true
294-
) else
295-
debug
296-
"Connection to master died: time taken so far in this call '%f'; \
297-
will %s"
298-
time_sofar
299-
( if !connection_timeout < 0. then
300-
"never timeout"
301-
else
302-
Printf.sprintf "timeout after '%f'" !connection_timeout
303-
) ;
304-
if time_sofar > !connection_timeout && !connection_timeout >= 0. then
305-
if !restart_on_connection_timeout then (
306-
debug
307-
"Exceeded timeout for retrying master connection: restarting xapi" ;
308-
!Db_globs.restart_fn ()
309-
) else (
310-
debug
311-
"Exceeded timeout for retrying master connection: raising \
312-
Cannot_connect_to_master" ;
313-
raise Cannot_connect_to_master
314-
) ;
315-
debug "Sleeping %f seconds before retrying master connection..."
316-
!backoff_delay ;
317-
Thread.delay !backoff_delay ;
318-
update_backoff_delay () ;
319-
try open_secure_connection () with _ -> ()
320-
(* oh well, maybe nextime... *)
321-
)
317+
reconnect ()
322318
done ;
323319
!result
324320

0 commit comments

Comments
 (0)