Skip to content

Commit fd89499

Browse files
committed
KVM: x86/pmu: Track emulated counter events instead of previous counter
Explicitly track emulated counter events instead of using the common counter value that's shared with the hardware counter owned by perf. Bumping the common counter requires snapshotting the pre-increment value in order to detect overflow from emulation, and the snapshot approach is inherently flawed. Snapshotting the previous counter at every increment assumes that there is at most one emulated counter event per emulated instruction (or rather, between checks for KVM_REQ_PMU). That mostly holds true today because KVM only emulates (branch) instructions retired, but the approach will fall apart if KVM ever supports event types that don't have a 1:1 relationship with instructions. And KVM already has a relevant bug, as handle_invalid_guest_state() emulates multiple instructions without checking KVM_REQ_PMU, i.e. could miss an overflow event due to clobbering pmc->prev_counter. Not checking KVM_REQ_PMU is problematic in both cases, but at least with the emulated counter approach, the resulting behavior is delayed overflow detection, as opposed to completely lost detection. Tracking the emulated count fixes another bug where the snapshot approach can signal spurious overflow due to incorporating both the emulated count and perf's count in the check, i.e. if overflow is detected by perf, then KVM's emulation will also incorrectly signal overflow. Add a comment in the related code to call out the need to process emulated events *after* pausing the perf event (big kudos to Mingwei for figuring out that particular wrinkle). Cc: Mingwei Zhang <mizhang@google.com> Cc: Roman Kagan <rkagan@amazon.de> Cc: Jim Mattson <jmattson@google.com> Cc: Dapeng Mi <dapeng1.mi@linux.intel.com> Cc: Like Xu <like.xu.linux@gmail.com> Reviewed-by: Mingwei Zhang <mizhang@google.com> Link: https://lore.kernel.org/r/20231103230541.352265-7-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com>
1 parent 89acf12 commit fd89499

3 files changed

Lines changed: 53 additions & 15 deletions

File tree

arch/x86/include/asm/kvm_host.h

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -500,8 +500,23 @@ struct kvm_pmc {
500500
u8 idx;
501501
bool is_paused;
502502
bool intr;
503+
/*
504+
* Base value of the PMC counter, relative to the *consumed* count in
505+
* the associated perf_event. This value includes counter updates from
506+
* the perf_event and emulated_count since the last time the counter
507+
* was reprogrammed, but it is *not* the current value as seen by the
508+
* guest or userspace.
509+
*
510+
* The count is relative to the associated perf_event so that KVM
511+
* doesn't need to reprogram the perf_event every time the guest writes
512+
* to the counter.
513+
*/
503514
u64 counter;
504-
u64 prev_counter;
515+
/*
516+
* PMC events triggered by KVM emulation that haven't been fully
517+
* processed, i.e. haven't undergone overflow detection.
518+
*/
519+
u64 emulated_counter;
505520
u64 eventsel;
506521
struct perf_event *perf_event;
507522
struct kvm_vcpu *vcpu;

arch/x86/kvm/pmu.c

Lines changed: 35 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -127,9 +127,9 @@ static void kvm_perf_overflow(struct perf_event *perf_event,
127127
struct kvm_pmc *pmc = perf_event->overflow_handler_context;
128128

129129
/*
130-
* Ignore overflow events for counters that are scheduled to be
131-
* reprogrammed, e.g. if a PMI for the previous event races with KVM's
132-
* handling of a related guest WRMSR.
130+
* Ignore asynchronous overflow events for counters that are scheduled
131+
* to be reprogrammed, e.g. if a PMI for the previous event races with
132+
* KVM's handling of a related guest WRMSR.
133133
*/
134134
if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi))
135135
return;
@@ -224,17 +224,30 @@ static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
224224
return 0;
225225
}
226226

227-
static void pmc_pause_counter(struct kvm_pmc *pmc)
227+
static bool pmc_pause_counter(struct kvm_pmc *pmc)
228228
{
229229
u64 counter = pmc->counter;
230-
231-
if (!pmc->perf_event || pmc->is_paused)
232-
return;
230+
u64 prev_counter;
233231

234232
/* update counter, reset event value to avoid redundant accumulation */
235-
counter += perf_event_pause(pmc->perf_event, true);
233+
if (pmc->perf_event && !pmc->is_paused)
234+
counter += perf_event_pause(pmc->perf_event, true);
235+
236+
/*
237+
* Snapshot the previous counter *after* accumulating state from perf.
238+
* If overflow already happened, hardware (via perf) is responsible for
239+
* generating a PMI. KVM just needs to detect overflow on emulated
240+
* counter events that haven't yet been processed.
241+
*/
242+
prev_counter = counter & pmc_bitmask(pmc);
243+
244+
counter += pmc->emulated_counter;
236245
pmc->counter = counter & pmc_bitmask(pmc);
246+
247+
pmc->emulated_counter = 0;
237248
pmc->is_paused = true;
249+
250+
return pmc->counter < prev_counter;
238251
}
239252

240253
static bool pmc_resume_counter(struct kvm_pmc *pmc)
@@ -289,6 +302,15 @@ static void pmc_update_sample_period(struct kvm_pmc *pmc)
289302

290303
void pmc_write_counter(struct kvm_pmc *pmc, u64 val)
291304
{
305+
/*
306+
* Drop any unconsumed accumulated counts, the WRMSR is a write, not a
307+
* read-modify-write. Adjust the counter value so that its value is
308+
* relative to the current count, as reading the current count from
309+
* perf is faster than pausing and repgrogramming the event in order to
310+
* reset it to '0'. Note, this very sneakily offsets the accumulated
311+
* emulated count too, by using pmc_read_counter()!
312+
*/
313+
pmc->emulated_counter = 0;
292314
pmc->counter += val - pmc_read_counter(pmc);
293315
pmc->counter &= pmc_bitmask(pmc);
294316
pmc_update_sample_period(pmc);
@@ -428,14 +450,15 @@ static void reprogram_counter(struct kvm_pmc *pmc)
428450
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
429451
u64 eventsel = pmc->eventsel;
430452
u64 new_config = eventsel;
453+
bool emulate_overflow;
431454
u8 fixed_ctr_ctrl;
432455

433-
pmc_pause_counter(pmc);
456+
emulate_overflow = pmc_pause_counter(pmc);
434457

435458
if (!pmc_event_is_allowed(pmc))
436459
goto reprogram_complete;
437460

438-
if (pmc->counter < pmc->prev_counter)
461+
if (emulate_overflow)
439462
__kvm_perf_overflow(pmc, false);
440463

441464
if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
@@ -475,7 +498,6 @@ static void reprogram_counter(struct kvm_pmc *pmc)
475498

476499
reprogram_complete:
477500
clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
478-
pmc->prev_counter = 0;
479501
}
480502

481503
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
@@ -701,6 +723,7 @@ static void kvm_pmu_reset(struct kvm_vcpu *vcpu)
701723

702724
pmc_stop_counter(pmc);
703725
pmc->counter = 0;
726+
pmc->emulated_counter = 0;
704727

705728
if (pmc_is_gp(pmc))
706729
pmc->eventsel = 0;
@@ -772,8 +795,7 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
772795

773796
static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
774797
{
775-
pmc->prev_counter = pmc->counter;
776-
pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc);
798+
pmc->emulated_counter++;
777799
kvm_pmu_request_counter_reprogram(pmc);
778800
}
779801

arch/x86/kvm/pmu.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,8 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc)
6666
{
6767
u64 counter, enabled, running;
6868

69-
counter = pmc->counter;
69+
counter = pmc->counter + pmc->emulated_counter;
70+
7071
if (pmc->perf_event && !pmc->is_paused)
7172
counter += perf_event_read_value(pmc->perf_event,
7273
&enabled, &running);

0 commit comments

Comments
 (0)