Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 49 additions & 17 deletions ocaml/xapi/vgpuops.ml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ open Stdext
open Listext
open Xstringext

type vgpu = {
type vgpu_t = {
vgpu_ref: API.ref_VGPU;
gpu_group_ref: API.ref_GPU_group;
devid: int;
Expand All @@ -27,7 +27,7 @@ type vgpu = {
requires_passthrough: [ `PF | `VF ] option;
}

let vgpu_of_vgpu ~__context vm_r vgpu =
let vgpu_of_ref ~__context vgpu =
let vgpu_r = Db.VGPU.get_record ~__context ~self:vgpu in
{
vgpu_ref = vgpu;
Expand All @@ -39,7 +39,7 @@ let vgpu_of_vgpu ~__context vm_r vgpu =
}

(* Build one vgpu record for each VGPU reference listed in the vM_VGPUs field
 * of the VM record [vm_r]. *)
let vgpus_of_vm ~__context vm_r =
List.map (vgpu_of_vgpu ~__context vm_r) vm_r.API.vM_VGPUs
List.map (vgpu_of_ref ~__context ) vm_r.API.vM_VGPUs

let fail_creation vm vgpu =
match vgpu.requires_passthrough with
Expand All @@ -55,10 +55,40 @@ let fail_creation vm vgpu =
Ref.string_of vgpu.gpu_group_ref
]))

let allocate_vgpu_to_gpu ~__context vm host vgpu =
let allocate_vgpu_to_gpu ?dry_run ?pre_allocate_list ~__context vm host vgpu =
(* Get all pGPUs on the host *)
let available_pgpus = Db.Host.get_PGPUs ~__context ~self:host in
(* Get all pGPUs in the GPU group required by the vGPU *)
let compatible_pgpus = Db.GPU_group.get_PGPUs ~__context ~self:vgpu.gpu_group_ref in
let pgpus = List.intersect compatible_pgpus available_pgpus in

(* [pgpu_can_hold_vgpu pgpu vgpu] is [true] when
 * Xapi_pgpu.assert_can_run_VGPU accepts running [vgpu] on [pgpu], and [false]
 * when that assertion raises. Every exception is still treated as a negative
 * answer (best effort), but it is now logged so that unexpected failures are
 * not silently hidden behind an ordinary lack of capacity. *)
let pgpu_can_hold_vgpu pgpu vgpu =
  try
    Xapi_pgpu.assert_can_run_VGPU ~__context ~self:pgpu ~vgpu;
    true
  with e ->
    debug "pGPU %s cannot run vGPU %s: %s"
      (Ref.string_of pgpu) (Ref.string_of vgpu) (Printexc.to_string e);
    false
in
(* Keep only the pGPUs that can hold the vGPU *)
let active_pgpus = List.filter ( fun pgpu -> pgpu_can_hold_vgpu pgpu vgpu.vgpu_ref ) pgpus in

(* Remaining capacity of [pgpu] for the type of [vgpu], as reported by
 * PGPU.get_remaining_capacity, minus one for every entry of
 * [pre_allocate_list] that is already placed on [pgpu].
 * [pre_allocate_list] is a set of vGPU allocations that are not reflected in
 * the database (usually made in dry-run mode), in the format
 * [(v1,p1);(v2,p2);(v3,p1)...]. *)
let remaining_capacity_for_vgpu_from_pgpu vgpu pgpu =
  let db_remaining =
    Helpers.call_api_functions ~__context (fun rpc session_id ->
        Client.Client.PGPU.get_remaining_capacity ~rpc ~session_id
          ~self:pgpu ~vgpu_type:vgpu.type_ref)
  in
  match pre_allocate_list with
  | None -> db_remaining
  | Some pre_allocated ->
    (* Count the pending allocations that target this pGPU. *)
    let already_placed_here =
      List.fold_left
        (fun count (_, placed_on) ->
           if placed_on = pgpu then Int64.add count 1L else count)
        0L pre_allocated
    in
    (* TODO: probably needs a check here (assert > 0) *)
    Int64.sub db_remaining already_placed_here
in

(* Sort the pgpus in lists of equal optimality for vGPU placement based on
* the GPU groups allocation algorithm *)
let sort_desc =
Expand All @@ -67,11 +97,8 @@ let allocate_vgpu_to_gpu ~__context vm host vgpu =
| `breadth_first -> true
in
let sorted_pgpus = Helpers.sort_by_schwarzian ~descending:sort_desc
(fun pgpu ->
Helpers.call_api_functions ~__context (fun rpc session_id ->
Client.Client.PGPU.get_remaining_capacity ~rpc ~session_id
~self:pgpu ~vgpu_type:vgpu.type_ref))
pgpus
(fun pgpu -> remaining_capacity_for_vgpu_from_pgpu vgpu pgpu )
active_pgpus
in
let rec choose_pgpu = function
| [] -> None
Expand All @@ -85,9 +112,14 @@ let allocate_vgpu_to_gpu ~__context vm host vgpu =
match choose_pgpu sorted_pgpus with
| None -> fail_creation vm vgpu
| Some pgpu ->
Db.VGPU.set_scheduled_to_be_resident_on ~__context
~self:vgpu.vgpu_ref ~value:pgpu;
pgpu
begin match dry_run with
|Some true -> ()
|_ -> Db.VGPU.set_scheduled_to_be_resident_on ~__context ~self:vgpu.vgpu_ref ~value:pgpu
end;
let pre_list = match pre_allocate_list with
| Some pre_allocate_list -> pre_allocate_list
| _ -> [] in
(vgpu.vgpu_ref,pgpu)::pre_list

(* Take a PCI device and assign it, and any dependent devices, to the VM *)
let add_pcis_to_vm ~__context host vm pci =
Expand Down Expand Up @@ -117,8 +149,8 @@ let reserve_free_virtual_function ~__context vm pf =
(* We may still need to load the driver... do that and try again *)
let pf_host = Db.PCI.get_host ~__context ~self:pf in
Helpers.call_api_functions ~__context (fun rpc session_id ->
Client.Client.Host.mxgpu_vf_setup rpc session_id pf_host
);
Client.Client.Host.mxgpu_vf_setup rpc session_id pf_host
);
get false
end else
(* This probably means that our capacity checking went wrong! *)
Expand All @@ -135,14 +167,14 @@ let add_vgpus_to_vm ~__context host vm vgpus vgpu_manual_setup =
match vgpu.requires_passthrough with
| Some `PF ->
debug "Creating passthrough VGPUs";
let pgpu = allocate_vgpu_to_gpu ~__context vm host vgpu in
let pgpu = List.assoc vgpu.vgpu_ref (allocate_vgpu_to_gpu ~__context vm host vgpu) in
let pci = Db.PGPU.get_PCI ~__context ~self:pgpu in
add_pcis_to_vm ~__context host vm pci
| Some `VF ->
Pool_features.assert_enabled ~__context ~f:Features.VGPU;
debug "Creating SR-IOV VGPUs";
if not vgpu_manual_setup then
let pgpu = allocate_vgpu_to_gpu ~__context vm host vgpu in
let pgpu = List.assoc vgpu.vgpu_ref (allocate_vgpu_to_gpu ~__context vm host vgpu) in
Db.PGPU.get_PCI ~__context ~self:pgpu
|> reserve_free_virtual_function ~__context vm
|> add_pcis_to_vm ~__context host vm
Expand All @@ -166,7 +198,7 @@ let vgpu_manual_setup_of_vm vm_r =
(* Set up the vGPUs of a VM on [host].
 * Builds the vgpu records from the VM record [vm_r]; raises
 * Server_error feature_requires_hvm if the VM has at least one vGPU but [hvm]
 * is false; otherwise passes the vGPUs to add_vgpus_to_vm together with the
 * manual-setup flag derived from [vm_r]. *)
let create_vgpus ~__context host (vm, vm_r) hvm =
let vgpus = vgpus_of_vm ~__context vm_r in
if vgpus <> [] && not hvm then
raise (Api_errors.Server_error (Api_errors.feature_requires_hvm, ["vGPU- and GPU-passthrough needs HVM"]));
raise (Api_errors.Server_error (Api_errors.feature_requires_hvm, ["vGPU- and GPU-passthrough needs HVM"]));
add_vgpus_to_vm ~__context host vm vgpus (vgpu_manual_setup_of_vm vm_r)

(* This function is called from Xapi_xenops, after forwarding, so possibly on a slave. *)
Expand Down
17 changes: 17 additions & 0 deletions ocaml/xapi/vgpuops.mli
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,20 @@ val vgpu_manual_setup_of_vm : API.vM_t -> bool
(** Return a list of the GPU PCI devices which have been assigned to this VM *)
val list_pcis_for_passthrough :
__context:Context.t -> vm:API.ref_VM -> (int * (int * int * int * int)) list

(** Abstract description of a vGPU, obtained from a VGPU reference with
 * [vgpu_of_ref]. *)
type vgpu_t
(** Allocate a vGPU to a pGPU of a host for the VM.
 * Returns an association list recording which pGPU was chosen for which vGPU,
 * in the format [(v1,p1);(v2,p2);(v3,p1)]: the new (vGPU, pGPU) pair is
 * prepended to [pre_allocate_list].
 * The two arguments [dry_run] and [pre_allocate_list] are optional, to keep
 * the interface backward-compatible; when omitted, [dry_run] behaves as
 * [false] and [pre_allocate_list] as the empty list.
 * In dry-run mode the function only runs the allocation process, without
 * writing the chosen pGPU to the database. [pre_allocate_list] carries the
 * allocations made by earlier dry runs, so that they are taken into account
 * when computing the remaining capacity of each pGPU. *)
val allocate_vgpu_to_gpu :
?dry_run:bool -> ?pre_allocate_list:(API.ref_VGPU * API.ref_PGPU) list ->
__context:Context.t -> API.ref_VM -> API.ref_host -> vgpu_t -> (API.ref_VGPU * API.ref_PGPU) list

(** Build the [vgpu_t] record for a VGPU reference *)
val vgpu_of_ref : __context:Context.t -> API.ref_VGPU -> vgpu_t
1 change: 1 addition & 0 deletions ocaml/xapi/xapi_globs.ml
Original file line number Diff line number Diff line change
Expand Up @@ -440,6 +440,7 @@ let vgpu_manual_setup_key = "vgpu_manual_setup"
let vgpu_pci_key = "vgpu_pci_id"
let vgpu_config_key = "vgpu_config"
let vgpu_extra_args_key = "vgpu_extra_args"
(* NOTE(review): looks like the domain:bus prefix of a PCI address; no use of
 * this constant is visible here, so confirm how it is consumed. *)
let vgpu_pci_prefix = "0000:0:"

let igd_passthru_key = "igd_passthrough"

Expand Down
9 changes: 8 additions & 1 deletion ocaml/xapi/xapi_pgpu.ml
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,14 @@ let get_remaining_capacity ~__context ~self ~vgpu_type =

(* Assert that [vgpu] can run on the pGPU [self].
 * First asserts that capacity exists on the pGPU for the type of the vGPU,
 * then asks Xapi_gpumon.Nvidia.vgpu_pgpu_are_compatible whether the pair is
 * compatible. Raises Server_error pgpu_insufficient_capacity_for_vgpu when
 * that compatibility check answers false.
 * NOTE(review): the NVIDIA check is called for every vGPU, with no vendor test
 * visible here; confirm the helper copes with non-NVIDIA vGPUs. *)
let assert_can_run_VGPU ~__context ~self ~vgpu =
let vgpu_type = Db.VGPU.get_type ~__context ~self:vgpu in
Xapi_pgpu_helpers.assert_capacity_exists_for_VGPU_type ~__context ~self ~vgpu_type
Xapi_pgpu_helpers.assert_capacity_exists_for_VGPU_type ~__context ~self ~vgpu_type;

(* Check via gpumon whether NVIDIA NVML allows this vGPU on this pGPU *)
let nvidia_compatible = Xapi_gpumon.Nvidia.vgpu_pgpu_are_compatible ~__context ~vgpu ~pgpu:self in
if not nvidia_compatible then raise (Api_errors.Server_error
(* TODO: this should be a new, dedicated error instead of reusing the capacity error *)
(Api_errors.pgpu_insufficient_capacity_for_vgpu, [Ref.string_of self;Ref.string_of vgpu_type]))


let update_dom0_access ~__context ~self ~action =
let db_current = Db.PGPU.get_dom0_access ~__context ~self in
Expand Down
44 changes: 27 additions & 17 deletions ocaml/xapi/xapi_pgpu_helpers.ml
Original file line number Diff line number Diff line change
Expand Up @@ -50,20 +50,30 @@ let get_allocated_VGPUs ~__context ~self =

(* Assert that [vgpu_type] may be used on the pGPU [self].
 * The type must pass assert_VGPU_type_enabled and must be permitted alongside
 * the VGPUs already allocated to the pGPU. Raises
 * Server_error vgpu_type_not_compatible_with_running_type otherwise. *)
let assert_VGPU_type_allowed ~__context ~self ~vgpu_type =
assert_VGPU_type_enabled ~__context ~self ~vgpu_type;
(match get_allocated_VGPUs ~__context ~self with
| [] -> ()
| resident_VGPU :: _ ->
let running_type =
Db.VGPU.get_type ~__context ~self:resident_VGPU
in
if running_type <> vgpu_type
then raise (Api_errors.Server_error (
Api_errors.vgpu_type_not_compatible_with_running_type,
[
Ref.string_of self;
Ref.string_of vgpu_type;
Ref.string_of running_type;
])))
let allocated_vgpu_list = get_allocated_VGPUs ~__context ~self in
(* Now check whether the requested type is permitted: it must belong to the
 * intersection of the compatible_types_on_pgpu lists of the types of all
 * VGPUs already allocated to this pGPU.
 * NOTE(review): presumably, if any allocated type has an empty
 * compatible_types_on_pgpu list, the intersection is empty and every request
 * is rejected, including one for the same type; confirm this is intended. *)
match allocated_vgpu_list with
| [] -> () (* Nothing is allocated on this pGPU, so there is no compatibility to check *)
| hd::tail ->
let grant_vgpu_type_list = List.fold_left
(fun grant_list current_list -> Listext.List.intersect grant_list current_list)
(Db.VGPU_type.get_compatible_types_on_pgpu ~__context ~self:(Db.VGPU.get_type ~__context ~self:hd))
(
List.map (fun self -> Db.VGPU.get_type ~__context ~self) tail
|> List.sort_uniq Pervasives.compare (* Remove duplicate types *)
|> List.map (fun self -> Db.VGPU_type.get_compatible_types_on_pgpu ~__context ~self)
) in
if not (List.mem vgpu_type (List.map Ref.of_string (* TODO: remove this conversion once the list holds refs instead of strings *) grant_vgpu_type_list)) then
let sep = ";" in
raise (Api_errors.Server_error (
Api_errors.vgpu_type_not_compatible_with_running_type, [
Ref.string_of self;
Ref.string_of vgpu_type;
List.map (fun self-> Db.VGPU.get_type ~__context ~self) allocated_vgpu_list
|> List.sort_uniq Pervasives.compare
|> List.map (fun vgpu_ref -> Ref.string_of vgpu_ref)
|> String.concat sep
]))

let assert_no_resident_VGPUs_of_type ~__context ~self ~vgpu_type =
let open Db_filter_types in
Expand Down Expand Up @@ -212,9 +222,9 @@ let assert_destination_has_pgpu_compatible_with_vm ~__context ~vm ~vgpu_map ~hos
| `nvidia ->
Db.VGPU.get_GPU_group ~__context ~self:vgpu
|> fun self -> Db.GPU_group.get_GPU_types ~__context ~self
|> fun pgpu_types -> get_first_suitable_pgpu pgpu_types vgpu pgpus
|> fun pgpu ->
assert_destination_pgpu_is_compatible_with_vm ~__context ~vm ~vgpu ~pgpu ~host ?remote ()
|> fun pgpu_types -> get_first_suitable_pgpu pgpu_types vgpu pgpus
|> fun pgpu ->
assert_destination_pgpu_is_compatible_with_vm ~__context ~vm ~vgpu ~pgpu ~host ?remote ()
in
let vgpus = Db.VM.get_VGPUs ~__context ~self:vm in
let _mapped, unmapped = List.partition (fun vgpu -> List.mem_assoc vgpu vgpu_map) vgpus in
Expand Down
12 changes: 9 additions & 3 deletions ocaml/xapi/xapi_vgpu_type.ml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ module Identifier = struct
psubdev_id : int option;
vdev_id : int;
vsubdev_id : int;
type_id : string;
}

type gvt_g_id = {
Expand Down Expand Up @@ -64,13 +65,14 @@ module Identifier = struct
match id with
| Passthrough -> "passthrough"
| Nvidia nvidia_id ->
Printf.sprintf "nvidia,%04x,%s,%04x,%04x"
Printf.sprintf "nvidia,%04x,%s,%04x,%04x,%s"
nvidia_id.pdev_id
(match nvidia_id.psubdev_id with
| Some id -> Printf.sprintf "%04x" id
| None -> "")
nvidia_id.vdev_id
nvidia_id.vsubdev_id
nvidia_id.type_id
| GVT_g gvt_g_id ->
Printf.sprintf "gvt-g,%04x,%Lx,%Lx,%Lx"
gvt_g_id.pdev_id
Expand Down Expand Up @@ -306,11 +308,14 @@ module Nvidia_old = struct
(List.assoc "plugin0.num_heads" args) in
let max_instance = Int64.of_string
(List.assoc "plugin0.max_instance" args) in
(* Use "0" as a placeholder type_id here, because type ids are now read from the NVIDIA config file via the new mechanism *)
let type_id = "0" in
let identifier = Identifier.({
pdev_id;
psubdev_id;
vdev_id;
vsubdev_id;
type_id;
}) in
let compatible_types_in_vm = [] in
let compatible_types_on_pgpu = [] in
Expand Down Expand Up @@ -625,6 +630,7 @@ module Vendor_nvidia = struct
psubdev_id;
vdev_id = int_of_string (get_attr "deviceId" devid);
vsubdev_id = int_of_string (get_attr "subsystemId" devid);
type_id = id;
} in
let file_path = whitelist in
(* Multiple vgpu support:
Expand Down Expand Up @@ -668,8 +674,8 @@ module Vendor_nvidia = struct

let vgpu_type_of_conf pci_access vendor_name _ conf =
let open Identifier in
debug "Pci.lookup_subsystem_device_name: vendor=%04x device=%04x subdev=%04x"
vendor_id conf.identifier.vdev_id conf.identifier.vsubdev_id;
debug "Pci.lookup_subsystem_device_name: vendor=%04x device=%04x subdev=%04x type_id=%s"
vendor_id conf.identifier.vdev_id conf.identifier.vsubdev_id conf.identifier.type_id;
let default v =
match v with
| Some v -> v
Expand Down
Loading