From 6118d47249aa9bd27a7ed5961b8c0bbdf2891437 Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Mon, 30 Jun 2025 18:26:32 +0100 Subject: [PATCH 1/2] xapi_vm_lifecycle: Stop assuming PV driver's presence implies all features A 10+ year old note justified assuming that feature-suspend and others are always available by referring to a buggy Windows guest agent. It has been correctly writing 1 to control/feature-suspend for at least a decade now (tested on old Windows PV drivers), so there is no reason to maintain this workaround. This allows PV drivers to potentially disable such features. Signed-off-by: Andrii Sultanov --- ocaml/xapi/xapi_vm_lifecycle.ml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/ocaml/xapi/xapi_vm_lifecycle.ml b/ocaml/xapi/xapi_vm_lifecycle.ml index 5ec4ca6d792..cfbc8162e53 100644 --- a/ocaml/xapi/xapi_vm_lifecycle.ml +++ b/ocaml/xapi/xapi_vm_lifecycle.ml @@ -166,10 +166,6 @@ let has_definitely_booted_pv ~vmmr = ) (** Return an error iff vmr is an HVM guest and lacks a needed feature. - * Note: it turned out that the Windows guest agent does not write "feature-suspend" - * on resume (only on startup), so we cannot rely just on that flag. We therefore - * add a clause that enables all features when PV drivers are present using the - * old-style check. * The "strict" param should be true for determining the allowed_operations list * (which is advisory only) and false (more permissive) when we are potentially about * to perform an operation. This makes a difference for ops that require the guest to @@ -180,8 +176,6 @@ let check_op_for_feature ~__context ~vmr:_ ~vmmr ~vmgmr ~power_state ~op ~ref power_state <> `Running (* PV guests offer support implicitly *) || has_definitely_booted_pv ~vmmr - || Xapi_pv_driver_version.(has_pv_drivers (of_guest_metrics vmgmr)) - (* Full PV drivers imply all features *) then None else From c895469f5499b0cfb68001e37dca8b1ce294b249 Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Wed, 2 Jul 2025 15:20:44 +0100 Subject: [PATCH 2/2] xapi_vm_lifecycle: Disallow suspend when cant_suspend_reason is present When data/cant_suspend_reason is present in xenstore (renamed data-cant-suspend-reason in "other" guest metrics), it would exclude operations involving suspend from the allowed operations list, but still allow actually suspending (strict=true during allowed operations checks but could be false otherwise. This was additionally overriden by assuming that PV driver presence guarantees feature-suspend). Suspending in such a situation would always result in a crash, usually with the following error: ``` $ xe vm-suspend uuid=d546c8c2-bcb1-0ed6-7931-d0fc9393ccb2 The server failed to handle your request, due to an internal error. The given message may give details useful for debugging the problem. message: xenopsd internal error: Device_common.QMP_Error(8, "{\"error\": {\"class\":\"GenericError\", \"desc\":\"State blocked by non-migratable device '0000:00:07.0/nvme'\", \"data\":{} }, \"id\":\"qmp-000006-8\"}") ``` So disallow suspending altogether when cant_suspend_reason is present. Log the QEMU error directly in the XAPI error as well: ``` $ xe vm-suspend uuid=d546c8c2-bcb1-0ed6-7931-d0fc9393ccb2 You attempted an operation on a VM which lacks the feature. vm: d546c8c2-bcb1-0ed6-7931-d0fc9393ccb2 (Windows 10 (64-bit)) reason: {"error":{"class":"GenericError","desc":"State blocked by non-migratable device '0000:00:07.0/nvme'","data":{}},"id":"qmp-000012-9"} ``` Signed-off-by: Andrii Sultanov --- ocaml/idl/datamodel_errors.ml | 2 ++ ocaml/xapi-consts/api_errors.ml | 2 ++ ocaml/xapi/xapi_vm_lifecycle.ml | 18 +++++++++++++++--- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/ocaml/idl/datamodel_errors.ml b/ocaml/idl/datamodel_errors.ml index 30e185ac192..b205b670159 100644 --- a/ocaml/idl/datamodel_errors.ml +++ b/ocaml/idl/datamodel_errors.ml @@ -534,6 +534,8 @@ let _ = () ; error Api_errors.vm_lacks_feature ["vm"] ~doc:"You attempted an operation on a VM which lacks the feature." () ; + error Api_errors.vm_non_suspendable ["vm"; "reason"] + ~doc:"You attempted an operation on a VM which is not suspendable." () ; error Api_errors.vm_is_template ["vm"] ~doc:"The operation attempted is not valid for a template VM" () ; error Api_errors.other_operation_in_progress ["class"; "object"] diff --git a/ocaml/xapi-consts/api_errors.ml b/ocaml/xapi-consts/api_errors.ml index 077a8dacbf9..89ab735194b 100644 --- a/ocaml/xapi-consts/api_errors.ml +++ b/ocaml/xapi-consts/api_errors.ml @@ -440,6 +440,8 @@ let vm_old_pv_drivers = add_error "VM_OLD_PV_DRIVERS" let vm_lacks_feature = add_error "VM_LACKS_FEATURE" +let vm_non_suspendable = add_error "VM_NON_SUSPENDABLE" + let vm_cannot_delete_default_template = add_error "VM_CANNOT_DELETE_DEFAULT_TEMPLATE" diff --git a/ocaml/xapi/xapi_vm_lifecycle.ml b/ocaml/xapi/xapi_vm_lifecycle.ml index cfbc8162e53..f5a9d7dfd57 100644 --- a/ocaml/xapi/xapi_vm_lifecycle.ml +++ b/ocaml/xapi/xapi_vm_lifecycle.ml @@ -151,6 +151,12 @@ let has_feature ~vmgmr ~feature = try List.assoc feature other = "1" with Not_found -> false ) +let get_feature ~vmgmr ~feature = + Option.bind vmgmr (fun gmr -> + let other = gmr.Db_actions.vM_guest_metrics_other in + List.assoc_opt feature other + ) + (* Returns `true` only if we are certain that the VM has booted PV (if there * is no metrics record, then we can't tell) *) let has_definitely_booted_pv ~vmmr = @@ -194,9 +200,15 @@ let check_op_for_feature ~__context ~vmr:_ ~vmmr ~vmgmr ~power_state ~op ~ref some_err Api_errors.vm_lacks_feature | `changing_VCPUs_live when lack_feature "feature-vcpu-hotplug" -> some_err Api_errors.vm_lacks_feature - | (`suspend | `checkpoint | `pool_migrate | `migrate_send) - when strict && lack_feature "feature-suspend" -> - some_err Api_errors.vm_lacks_feature + | `suspend | `checkpoint | `pool_migrate | `migrate_send -> ( + match get_feature ~vmgmr ~feature:"data-cant-suspend-reason" with + | Some reason -> + Some (Api_errors.vm_non_suspendable, [Ref.string_of ref; reason]) + | None when strict && lack_feature "feature-suspend" -> + some_err Api_errors.vm_lacks_feature + | None -> + None + ) | _ -> None