-
Notifications
You must be signed in to change notification settings - Fork 294
CA-404062: Wrongly restart xapi when receiving HTTP errors #6201
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -204,6 +204,7 @@ let connection_timeout = ref !Db_globs.master_connection_default_timeout | |
| are exceeded *) | ||
| let restart_on_connection_timeout = ref true | ||
|
|
||
|
|
||
| exception Content_length_required | ||
|
|
||
| let do_db_xml_rpc_persistent_with_reopen ~host:_ ~path (req : string) : | ||
|
|
@@ -221,6 +222,59 @@ let do_db_xml_rpc_persistent_with_reopen ~host:_ ~path (req : string) : | |
| else if !backoff_delay > 256.0 then | ||
| backoff_delay := 256.0 | ||
| in | ||
| let reconnect () = | ||
| (* RPC failed - there's no way we can recover from this so try reopening connection every 2s + backoff delay *) | ||
| ( match !my_connection with | ||
| | None -> | ||
| () | ||
| | Some st_proc -> ( | ||
| my_connection := None ; | ||
| (* don't want to try closing multiple times *) | ||
| try Stunnel.disconnect st_proc with _ -> () | ||
| ) | ||
| ) ; | ||
| let time_sofar = Unix.gettimeofday () -. time_call_started in | ||
| if !connection_timeout < 0. then ( | ||
| if not !surpress_no_timeout_logs then ( | ||
| debug | ||
| "Connection to master died. I will continue to retry \ | ||
| indefinitely (supressing future logging of this message)." ; | ||
| error | ||
| "Connection to master died. I will continue to retry \ | ||
| indefinitely (supressing future logging of this message)." | ||
| ) ; | ||
| surpress_no_timeout_logs := true | ||
| ) else | ||
| debug | ||
| "Connection to master died: time taken so far in this call '%f'; \ | ||
| will %s" | ||
| time_sofar | ||
| ( if !connection_timeout < 0. then | ||
| "never timeout" | ||
| else | ||
| Printf.sprintf "timeout after '%f'" !connection_timeout | ||
| ) ; | ||
| if time_sofar > !connection_timeout && !connection_timeout >= 0. then | ||
| if !restart_on_connection_timeout then ( | ||
| debug | ||
| "Exceeded timeout for retrying master connection: restarting xapi" ; | ||
| !Db_globs.restart_fn () | ||
| ) else ( | ||
| debug | ||
| "Exceeded timeout for retrying master connection: raising \ | ||
| Cannot_connect_to_master" ; | ||
| raise Cannot_connect_to_master | ||
| ) ; | ||
| debug "Sleeping %f seconds before retrying master connection..." | ||
| !backoff_delay ; | ||
| let timed_out = Scheduler.PipeDelay.wait delay !backoff_delay in | ||
| if not timed_out then | ||
| debug "%s: Sleep interrupted, retrying master connection now" | ||
| __FUNCTION__ ; | ||
| update_backoff_delay () ; | ||
| D.log_and_ignore_exn open_secure_connection | ||
| in | ||
|
|
||
| while not !write_ok do | ||
| try | ||
| let req_string = req in | ||
|
|
@@ -266,67 +320,12 @@ let do_db_xml_rpc_persistent_with_reopen ~host:_ ~path (req : string) : | |
| Db_globs.http_limit_max_rpc_size ; | ||
| debug "Re-raising exception to caller." ; | ||
| raise Http.Client_requested_size_over_limit | ||
| (* TODO: This http exception handler caused CA-36936 and can probably be removed now that there's backoff delay in the generic handler _ below *) | ||
| | Http_client.Http_error (http_code, err_msg) -> | ||
| error | ||
| "Received HTTP error %s (%s) from master. This suggests our master \ | ||
| address is wrong. Sleeping for %.0fs and then executing restart_fn." | ||
| http_code err_msg | ||
| !Db_globs.permanent_master_failure_retry_interval ; | ||
| Thread.delay !Db_globs.permanent_master_failure_retry_interval ; | ||
| !Db_globs.restart_fn () | ||
| error "Received HTTP error %s (%s) from the coordinator" http_code err_msg ; | ||
| reconnect () | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is it possible that the HTTP error occurs because "our master address is wrong", as the original comment says?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh, I see the detailed explanation in the commit message. |
||
| | e -> | ||
| error "Caught %s" (Printexc.to_string e) ; | ||
| (* RPC failed - there's no way we can recover from this so try reopening connection every 2s + backoff delay *) | ||
| ( match !my_connection with | ||
| | None -> | ||
| () | ||
| | Some st_proc -> ( | ||
| my_connection := None ; | ||
| (* don't want to try closing multiple times *) | ||
| try Stunnel.disconnect st_proc with _ -> () | ||
| ) | ||
| ) ; | ||
| let time_sofar = Unix.gettimeofday () -. time_call_started in | ||
| if !connection_timeout < 0. then ( | ||
| if not !surpress_no_timeout_logs then ( | ||
| debug | ||
| "Connection to master died. I will continue to retry \ | ||
| indefinitely (supressing future logging of this message)." ; | ||
| error | ||
| "Connection to master died. I will continue to retry \ | ||
| indefinitely (supressing future logging of this message)." | ||
| ) ; | ||
| surpress_no_timeout_logs := true | ||
| ) else | ||
| debug | ||
| "Connection to master died: time taken so far in this call '%f'; \ | ||
| will %s" | ||
| time_sofar | ||
| ( if !connection_timeout < 0. then | ||
| "never timeout" | ||
| else | ||
| Printf.sprintf "timeout after '%f'" !connection_timeout | ||
| ) ; | ||
| if time_sofar > !connection_timeout && !connection_timeout >= 0. then | ||
| if !restart_on_connection_timeout then ( | ||
| debug | ||
| "Exceeded timeout for retrying master connection: restarting xapi" ; | ||
| !Db_globs.restart_fn () | ||
| ) else ( | ||
| debug | ||
| "Exceeded timeout for retrying master connection: raising \ | ||
| Cannot_connect_to_master" ; | ||
| raise Cannot_connect_to_master | ||
| ) ; | ||
| debug "Sleeping %f seconds before retrying master connection..." | ||
| !backoff_delay ; | ||
| let timed_out = Scheduler.PipeDelay.wait delay !backoff_delay in | ||
| if not timed_out then | ||
| debug "%s: Sleep interrupted, retrying master connection now" | ||
| __FUNCTION__ ; | ||
| update_backoff_delay () ; | ||
| D.log_and_ignore_exn open_secure_connection | ||
| reconnect () | ||
| done ; | ||
| !result | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It might be good to update those logs to say "coordinator" instead of "master".
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Most of the change is copy-and-paste, so I kept the code as it was.