Skip to content

Commit d76fb40

Browse files
sean-jcbonzini
authored andcommitted
KVM: VMX: Handle PI descriptor updates during vcpu_put/load
Move the posted interrupt pre/post_block logic into vcpu_put/load respectively, using the kvm_vcpu_is_blocking() to determining whether or not the wakeup handler needs to be set (and unset). This avoids updating the PI descriptor if halt-polling is successful, reduces the number of touchpoints for updating the descriptor, and eliminates the confusing behavior of intentionally leaving a "stale" PI.NDST when a blocking vCPU is scheduled back in after preemption. The downside is that KVM will do the PID update twice if the vCPU is preempted after prepare_to_rcuwait() but before schedule(), but that's a rare case (and non-existent on !PREEMPT kernels). The notable wart is the need to send a self-IPI on the wakeup vector if an outstanding notification is pending after configuring the wakeup vector. Ideally, KVM would just do a kvm_vcpu_wake_up() in this case, but the scheduler doesn't support waking a task from its preemption notifier callback, i.e. while the task is right in the middle of being scheduled out. Note, setting the wakeup vector before halt-polling is not necessary: once the pending IRQ will be recorded in the PIR, kvm_vcpu_has_events() will detect this (via kvm_cpu_get_interrupt(), kvm_apic_get_interrupt(), apic_has_interrupt_for_ppr() and finally vmx_sync_pir_to_irr()) and terminate the polling. Signed-off-by: Sean Christopherson <[email protected]> Reviewed-by: Maxim Levitsky <[email protected]> Message-Id: <[email protected]> Signed-off-by: Paolo Bonzini <[email protected]>
1 parent 4f5a884 commit d76fb40

File tree

3 files changed

+70
-93
lines changed

3 files changed

+70
-93
lines changed

arch/x86/kvm/vmx/posted_intr.c

Lines changed: 64 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
5252
{
5353
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
5454
struct pi_desc old, new;
55+
unsigned long flags;
5556
unsigned int dest;
5657

5758
/*
@@ -62,34 +63,57 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
6263
if (!enable_apicv || !lapic_in_kernel(vcpu))
6364
return;
6465

65-
/* Nothing to do if PI.SN and PI.NDST both have the desired value. */
66-
if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
66+
/*
67+
* If the vCPU wasn't on the wakeup list and wasn't migrated, then the
68+
* full update can be skipped as neither the vector nor the destination
69+
* needs to be changed.
70+
*/
71+
if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) {
72+
/*
73+
* Clear SN if it was set due to being preempted. Again, do
74+
* this even if there is no assigned device for simplicity.
75+
*/
76+
if (pi_test_and_clear_sn(pi_desc))
77+
goto after_clear_sn;
6778
return;
79+
}
80+
81+
local_irq_save(flags);
6882

6983
/*
70-
* If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
71-
* PI.NDST: pi_post_block is the one expected to change PID.NDST and the
72-
* wakeup handler expects the vCPU to be on the blocked_vcpu_list that
73-
* matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
74-
* correctly.
84+
* If the vCPU was waiting for wakeup, remove the vCPU from the wakeup
85+
* list of the _previous_ pCPU, which will not be the same as the
86+
* current pCPU if the task was migrated.
7587
*/
76-
if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
77-
pi_clear_sn(pi_desc);
78-
goto after_clear_sn;
88+
if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) {
89+
raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu));
90+
list_del(&vcpu->blocked_vcpu_list);
91+
raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu));
7992
}
8093

81-
/* The full case. Set the new destination and clear SN. */
8294
dest = cpu_physical_id(cpu);
8395
if (!x2apic_mode)
8496
dest = (dest << 8) & 0xFF00;
8597

8698
do {
8799
old.control = new.control = READ_ONCE(pi_desc->control);
88100

101+
/*
102+
* Clear SN (as above) and refresh the destination APIC ID to
103+
* handle task migration (@cpu != vcpu->cpu).
104+
*/
89105
new.ndst = dest;
90106
new.sn = 0;
107+
108+
/*
109+
* Restore the notification vector; in the blocking case, the
110+
* descriptor was modified on "put" to use the wakeup vector.
111+
*/
112+
new.nv = POSTED_INTR_VECTOR;
91113
} while (pi_try_set_control(pi_desc, old.control, new.control));
92114

115+
local_irq_restore(flags);
116+
93117
after_clear_sn:
94118

95119
/*
@@ -111,83 +135,24 @@ static bool vmx_can_use_vtd_pi(struct kvm *kvm)
111135
irq_remapping_cap(IRQ_POSTING_CAP);
112136
}
113137

114-
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
115-
{
116-
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
117-
118-
if (!vmx_can_use_vtd_pi(vcpu->kvm))
119-
return;
120-
121-
/* Set SN when the vCPU is preempted */
122-
if (vcpu->preempted)
123-
pi_set_sn(pi_desc);
124-
}
125-
126-
static void __pi_post_block(struct kvm_vcpu *vcpu)
127-
{
128-
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
129-
struct pi_desc old, new;
130-
unsigned int dest;
131-
132-
/*
133-
* Remove the vCPU from the wakeup list of the _previous_ pCPU, which
134-
* will not be the same as the current pCPU if the task was migrated.
135-
*/
136-
raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
137-
list_del(&vcpu->blocked_vcpu_list);
138-
raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
139-
140-
dest = cpu_physical_id(vcpu->cpu);
141-
if (!x2apic_mode)
142-
dest = (dest << 8) & 0xFF00;
143-
144-
WARN(pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR,
145-
"Wakeup handler not enabled while the vCPU was blocking");
146-
147-
do {
148-
old.control = new.control = READ_ONCE(pi_desc->control);
149-
150-
new.ndst = dest;
151-
152-
/* set 'NV' to 'notification vector' */
153-
new.nv = POSTED_INTR_VECTOR;
154-
} while (pi_try_set_control(pi_desc, old.control, new.control));
155-
156-
vcpu->pre_pcpu = -1;
157-
}
158-
159138
/*
160-
* This routine does the following things for vCPU which is going
161-
* to be blocked if VT-d PI is enabled.
162-
* - Store the vCPU to the wakeup list, so when interrupts happen
163-
* we can find the right vCPU to wake up.
164-
* - Change the Posted-interrupt descriptor as below:
165-
* 'NV' <-- POSTED_INTR_WAKEUP_VECTOR
166-
* - If 'ON' is set during this process, which means at least one
167-
* interrupt is posted for this vCPU, we cannot block it, in
168-
* this case, return 1, otherwise, return 0.
169-
*
139+
* Put the vCPU on this pCPU's list of vCPUs that needs to be awakened and set
140+
* WAKEUP as the notification vector in the PI descriptor.
170141
*/
171-
int pi_pre_block(struct kvm_vcpu *vcpu)
142+
static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
172143
{
173-
struct pi_desc old, new;
174144
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
145+
struct pi_desc old, new;
175146
unsigned long flags;
176147

177-
if (!vmx_can_use_vtd_pi(vcpu->kvm) ||
178-
vmx_interrupt_blocked(vcpu))
179-
return 0;
180-
181148
local_irq_save(flags);
182149

183-
vcpu->pre_pcpu = vcpu->cpu;
184150
raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu));
185151
list_add_tail(&vcpu->blocked_vcpu_list,
186152
&per_cpu(blocked_vcpu_on_cpu, vcpu->cpu));
187153
raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu));
188154

189-
WARN(pi_desc->sn == 1,
190-
"Posted Interrupt Suppress Notification set before blocking");
155+
WARN(pi_desc->sn, "PI descriptor SN field set before blocking");
191156

192157
do {
193158
old.control = new.control = READ_ONCE(pi_desc->control);
@@ -196,24 +161,37 @@ int pi_pre_block(struct kvm_vcpu *vcpu)
196161
new.nv = POSTED_INTR_WAKEUP_VECTOR;
197162
} while (pi_try_set_control(pi_desc, old.control, new.control));
198163

199-
/* We should not block the vCPU if an interrupt is posted for it. */
200-
if (pi_test_on(pi_desc))
201-
__pi_post_block(vcpu);
164+
/*
165+
* Send a wakeup IPI to this CPU if an interrupt may have been posted
166+
* before the notification vector was updated, in which case the IRQ
167+
* will arrive on the non-wakeup vector. An IPI is needed as calling
168+
* try_to_wake_up() from ->sched_out() isn't allowed (IRQs are not
169+
* enabled until it is safe to call try_to_wake_up() on the task being
170+
* scheduled out).
171+
*/
172+
if (pi_test_on(&new))
173+
apic->send_IPI_self(POSTED_INTR_WAKEUP_VECTOR);
202174

203175
local_irq_restore(flags);
204-
return (vcpu->pre_pcpu == -1);
205176
}
206177

207-
void pi_post_block(struct kvm_vcpu *vcpu)
178+
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
208179
{
209-
unsigned long flags;
180+
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
210181

211-
if (vcpu->pre_pcpu == -1)
182+
if (!vmx_can_use_vtd_pi(vcpu->kvm))
212183
return;
213184

214-
local_irq_save(flags);
215-
__pi_post_block(vcpu);
216-
local_irq_restore(flags);
185+
if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu))
186+
pi_enable_wakeup_handler(vcpu);
187+
188+
/*
189+
* Set SN when the vCPU is preempted. Note, the vCPU can both be seen
190+
* as blocking and preempted, e.g. if it's preempted between setting
191+
* its wait state and manually scheduling out.
192+
*/
193+
if (vcpu->preempted)
194+
pi_set_sn(pi_desc);
217195
}
218196

219197
/*
@@ -254,7 +232,7 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
254232
* Bail out of the block loop if the VM has an assigned
255233
* device, but the blocking vCPU didn't reconfigure the
256234
* PI.NV to the wakeup vector, i.e. the assigned device
257-
* came along after the initial check in pi_pre_block().
235+
* came along after the initial check in vmx_vcpu_pi_put().
258236
*/
259237
void vmx_pi_start_assignment(struct kvm *kvm)
260238
{

arch/x86/kvm/vmx/posted_intr.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,12 @@ static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
4040
(unsigned long *)&pi_desc->control);
4141
}
4242

43+
static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc)
44+
{
45+
return test_and_clear_bit(POSTED_INTR_SN,
46+
(unsigned long *)&pi_desc->control);
47+
}
48+
4349
static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
4450
{
4551
return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
@@ -88,8 +94,6 @@ static inline bool pi_test_sn(struct pi_desc *pi_desc)
8894

8995
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
9096
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
91-
int pi_pre_block(struct kvm_vcpu *vcpu);
92-
void pi_post_block(struct kvm_vcpu *vcpu);
9397
void pi_wakeup_handler(void);
9498
void __init pi_init_cpu(int cpu);
9599
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);

arch/x86/kvm/vmx/vmx.c

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7566,9 +7566,6 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
75667566

75677567
static int vmx_pre_block(struct kvm_vcpu *vcpu)
75687568
{
7569-
if (pi_pre_block(vcpu))
7570-
return 1;
7571-
75727569
if (kvm_lapic_hv_timer_in_use(vcpu))
75737570
kvm_lapic_switch_to_sw_timer(vcpu);
75747571

@@ -7579,8 +7576,6 @@ static void vmx_post_block(struct kvm_vcpu *vcpu)
75797576
{
75807577
if (kvm_x86_ops.set_hv_timer)
75817578
kvm_lapic_switch_to_hv_timer(vcpu);
7582-
7583-
pi_post_block(vcpu);
75847579
}
75857580

75867581
static void vmx_setup_mce(struct kvm_vcpu *vcpu)

0 commit comments

Comments
 (0)