Commit 56bb273

Dapeng Mi authored and sean-jc committed
KVM: x86/pmu: Load/put mediated PMU context when entering/exiting guest
Implement the PMU "world switch" between host perf and the guest's mediated PMU. When loading guest state, call into perf to switch from host to guest, then load guest state into hardware; reverse those actions when putting guest state.

On the KVM side, when loading guest state, zero PERF_GLOBAL_CTRL to ensure all counters are disabled, then load selectors and counters, and finally call into vendor code to load control/status information. While VMX and SVM use different mechanisms to avoid counting host activity while guest controls are loaded, both implementations require PERF_GLOBAL_CTRL to be zeroed when the event selectors are in flux.

When putting guest state, reverse the order: save and zero controls and status prior to saving and zeroing selectors and counters. Defer clearing PERF_GLOBAL_CTRL to vendor code, as only SVM needs to manually clear the MSR; VMX configures PERF_GLOBAL_CTRL to be atomically cleared by the CPU on VM-Exit.

Handle the difference in MSR layouts between Intel and AMD by communicating the bases and stride via kvm_pmu_ops. Because KVM requires Intel v4 (and full-width writes) and AMD v2, the MSRs to load/save are constant for a given vendor, i.e. do not vary based on the guest PMU, and do not vary based on the host PMU (because KVM will simply disable mediated PMU support if the necessary MSRs are unsupported).

Except for retrieving the guest's PERF_GLOBAL_CTRL, which needs to be read before invoking any fastpath handler (spoiler alert), perform the context switch around KVM's inner run loop; state only needs to be synchronized from hardware before KVM can access the software "caches". Note, VMX already grabs the guest's PERF_GLOBAL_CTRL immediately after VM-Exit, as hardware saves the value into the VMCS.

Co-developed-by: Mingwei Zhang <mizhang@google.com>
Signed-off-by: Mingwei Zhang <mizhang@google.com>
Co-developed-by: Sandipan Das <sandipan.das@amd.com>
Signed-off-by: Sandipan Das <sandipan.das@amd.com>
Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Tested-by: Xudong Hao <xudong.hao@intel.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Tested-by: Manali Shukla <manali.shukla@amd.com>
Link: https://patch.msgid.link/20251206001720.468579-28-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
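In outline, the world switch pairs up as follows (a condensed sketch of kvm_mediated_pmu_load() and kvm_mediated_pmu_put() from the arch/x86/kvm/pmu.c hunk below; guard clauses and the full comments are omitted):

    /* Load, entering the inner run loop (IRQs disabled): */
    perf_load_guest_context();                 /* perf swaps host state out */
    wrmsrq(kvm_pmu_ops.PERF_GLOBAL_CTRL, 0);   /* disable all counters while selectors are in flux */
    perf_load_guest_lvtpc(...);                /* guest's LVTPC, for PMI delivery */
    kvm_pmu_load_guest_pmcs(vcpu);             /* guest selectors and counters */
    kvm_pmu_call(mediated_load)(vcpu);         /* vendor: control/status last */

    /* Put, exiting the inner run loop: strict reverse order. */
    kvm_pmu_call(mediated_put)(vcpu);          /* vendor: save/zero control+status first */
    kvm_pmu_put_guest_pmcs(vcpu);              /* save, then zero, selectors and counters */
    perf_put_guest_lvtpc();
    perf_put_guest_context();                  /* perf swaps host state back in */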
1 parent a2f4ba5 commit 56bb273

8 files changed

Lines changed: 225 additions & 3 deletions


arch/x86/include/asm/kvm-x86-pmu-ops.h

Lines changed: 2 additions & 0 deletions
@@ -24,6 +24,8 @@ KVM_X86_PMU_OP_OPTIONAL(deliver_pmi)
 KVM_X86_PMU_OP_OPTIONAL(cleanup)
 
 KVM_X86_PMU_OP_OPTIONAL(write_global_ctrl)
+KVM_X86_PMU_OP(mediated_load)
+KVM_X86_PMU_OP(mediated_put)
 
 #undef KVM_X86_PMU_OP
 #undef KVM_X86_PMU_OP_OPTIONAL
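Note that mediated_load and mediated_put use KVM_X86_PMU_OP() rather than KVM_X86_PMU_OP_OPTIONAL(), i.e. they are mandatory: both vendor modules must implement them. Roughly, the markers are consumed like this (a sketch patterned on KVM's kvm-x86-ops.h machinery; see arch/x86/kvm/pmu.{c,h} in the tree for the exact definitions):

    /* Declare one static call per op (arch/x86/kvm/pmu.h, sketch)... */
    #define KVM_X86_PMU_OP(func) \
        DECLARE_STATIC_CALL(kvm_x86_pmu_##func, *(((struct kvm_pmu_ops *)0)->func))
    #define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP
    #include <asm/kvm-x86-pmu-ops.h>

    /* ...and when wiring up kvm_pmu_ops, warn if a mandatory op is missing
     * (arch/x86/kvm/pmu.c, sketch); optional ops skip the sanity check.
     */
    #define __KVM_X86_PMU_OP(func) \
        static_call_update(kvm_x86_pmu_##func, kvm_pmu_ops.func);
    #define KVM_X86_PMU_OP(func) \
        WARN_ON(!kvm_pmu_ops.func); __KVM_X86_PMU_OP(func)
    #define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP
    #include <asm/kvm-x86-pmu-ops.h>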

arch/x86/include/asm/msr-index.h

Lines changed: 1 addition & 0 deletions
@@ -1219,6 +1219,7 @@
 #define MSR_CORE_PERF_GLOBAL_STATUS     0x0000038e
 #define MSR_CORE_PERF_GLOBAL_CTRL       0x0000038f
 #define MSR_CORE_PERF_GLOBAL_OVF_CTRL   0x00000390
+#define MSR_CORE_PERF_GLOBAL_STATUS_SET 0x00000391
 
 #define MSR_PERF_METRICS                0x00000329

arch/x86/kvm/pmu.c

Lines changed: 127 additions & 3 deletions
@@ -880,10 +880,13 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
             diff = pmu->global_ctrl ^ data;
             pmu->global_ctrl = data;
             reprogram_counters(pmu, diff);
-
-            if (kvm_vcpu_has_mediated_pmu(vcpu))
-                kvm_pmu_call(write_global_ctrl)(data);
         }
+        /*
+         * Unconditionally forward writes to vendor code, i.e. to the
+         * VMC{B,S}, as pmu->global_ctrl is per-VCPU, not per-VMC{B,S}.
+         */
+        if (kvm_vcpu_has_mediated_pmu(vcpu))
+            kvm_pmu_call(write_global_ctrl)(data);
         break;
     case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
         /*
@@ -1244,3 +1247,124 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
     kfree(filter);
     return r;
 }
+
+static __always_inline u32 fixed_counter_msr(u32 idx)
+{
+    return kvm_pmu_ops.FIXED_COUNTER_BASE + idx * kvm_pmu_ops.MSR_STRIDE;
+}
+
+static __always_inline u32 gp_counter_msr(u32 idx)
+{
+    return kvm_pmu_ops.GP_COUNTER_BASE + idx * kvm_pmu_ops.MSR_STRIDE;
+}
+
+static __always_inline u32 gp_eventsel_msr(u32 idx)
+{
+    return kvm_pmu_ops.GP_EVENTSEL_BASE + idx * kvm_pmu_ops.MSR_STRIDE;
+}
+
+static void kvm_pmu_load_guest_pmcs(struct kvm_vcpu *vcpu)
+{
+    struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+    struct kvm_pmc *pmc;
+    u32 i;
+
+    /*
+     * No need to zero out unexposed GP/fixed counters/selectors since RDPMC
+     * is intercepted if hardware has counters that aren't visible to the
+     * guest (KVM will inject #GP as appropriate).
+     */
+    for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
+        pmc = &pmu->gp_counters[i];
+
+        wrmsrq(gp_counter_msr(i), pmc->counter);
+        wrmsrq(gp_eventsel_msr(i), pmc->eventsel_hw);
+    }
+    for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+        pmc = &pmu->fixed_counters[i];
+
+        wrmsrq(fixed_counter_msr(i), pmc->counter);
+    }
+}
+
+void kvm_mediated_pmu_load(struct kvm_vcpu *vcpu)
+{
+    if (!kvm_vcpu_has_mediated_pmu(vcpu) ||
+        KVM_BUG_ON(!lapic_in_kernel(vcpu), vcpu->kvm))
+        return;
+
+    lockdep_assert_irqs_disabled();
+
+    perf_load_guest_context();
+
+    /*
+     * Explicitly clear PERF_GLOBAL_CTRL, as "loading" the guest's context
+     * disables all individual counters (if any were enabled), but doesn't
+     * globally disable the entire PMU. Loading event selectors and PMCs
+     * with guest values while PERF_GLOBAL_CTRL is non-zero will generate
+     * unexpected events and PMIs.
+     *
+     * VMX will enable/disable counters at VM-Enter/VM-Exit by atomically
+     * loading PERF_GLOBAL_CONTROL. SVM effectively performs the switch by
+     * configuring all events to be GUEST_ONLY. Clear PERF_GLOBAL_CONTROL
+     * even for SVM to minimize the damage if a perf event is left enabled,
+     * and to ensure a consistent starting state.
+     */
+    wrmsrq(kvm_pmu_ops.PERF_GLOBAL_CTRL, 0);
+
+    perf_load_guest_lvtpc(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTPC));
+
+    kvm_pmu_load_guest_pmcs(vcpu);
+
+    kvm_pmu_call(mediated_load)(vcpu);
+}
+
+static void kvm_pmu_put_guest_pmcs(struct kvm_vcpu *vcpu)
+{
+    struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+    struct kvm_pmc *pmc;
+    u32 i;
+
+    /*
+     * Clear selectors and counters to ensure hardware doesn't count using
+     * guest controls when the host (perf) restores its state.
+     */
+    for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
+        pmc = &pmu->gp_counters[i];
+
+        pmc->counter = rdpmc(i);
+        if (pmc->counter)
+            wrmsrq(gp_counter_msr(i), 0);
+        if (pmc->eventsel_hw)
+            wrmsrq(gp_eventsel_msr(i), 0);
+    }
+
+    for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+        pmc = &pmu->fixed_counters[i];
+
+        pmc->counter = rdpmc(INTEL_PMC_FIXED_RDPMC_BASE | i);
+        if (pmc->counter)
+            wrmsrq(fixed_counter_msr(i), 0);
+    }
+}
+
+void kvm_mediated_pmu_put(struct kvm_vcpu *vcpu)
+{
+    if (!kvm_vcpu_has_mediated_pmu(vcpu) ||
+        KVM_BUG_ON(!lapic_in_kernel(vcpu), vcpu->kvm))
+        return;
+
+    lockdep_assert_irqs_disabled();
+
+    /*
+     * Defer handling of PERF_GLOBAL_CTRL to vendor code. On Intel, it's
+     * atomically cleared on VM-Exit, i.e. doesn't need to be cleared here.
+     */
+    kvm_pmu_call(mediated_put)(vcpu);
+
+    kvm_pmu_put_guest_pmcs(vcpu);
+
+    perf_put_guest_lvtpc();
+
+    perf_put_guest_context();
+}
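A side note on kvm_pmu_put_guest_pmcs(): counter values are read back with RDPMC rather than RDMSR, and fixed counters are addressed by setting bit 30 of the RDPMC index (INTEL_PMC_FIXED_RDPMC_BASE, i.e. 1 << 30). A standalone illustration of that index encoding (hypothetical userspace C, not part of the patch; AMD exposes no fixed counters, so on AMD the fixed-counter loop never executes):

    #include <stdint.h>
    #include <stdio.h>

    #define INTEL_PMC_FIXED_RDPMC_BASE (1u << 30) /* value matches the kernel's perf_event.h */

    int main(void)
    {
        /* GP counter N is read as RDPMC(N); fixed counter N sets bit 30. */
        for (uint32_t i = 0; i < 3; i++)
            printf("fixed ctr %u -> RDPMC index %#x\n", i,
                   INTEL_PMC_FIXED_RDPMC_BASE | i);
        return 0;
    }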

arch/x86/kvm/pmu.h

Lines changed: 10 additions & 0 deletions
@@ -38,11 +38,19 @@ struct kvm_pmu_ops {
     void (*cleanup)(struct kvm_vcpu *vcpu);
 
     bool (*is_mediated_pmu_supported)(struct x86_pmu_capability *host_pmu);
+    void (*mediated_load)(struct kvm_vcpu *vcpu);
+    void (*mediated_put)(struct kvm_vcpu *vcpu);
     void (*write_global_ctrl)(u64 global_ctrl);
 
     const u64 EVENTSEL_EVENT;
     const int MAX_NR_GP_COUNTERS;
     const int MIN_NR_GP_COUNTERS;
+
+    const u32 PERF_GLOBAL_CTRL;
+    const u32 GP_EVENTSEL_BASE;
+    const u32 GP_COUNTER_BASE;
+    const u32 FIXED_COUNTER_BASE;
+    const u32 MSR_STRIDE;
 };
 
 void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops);
@@ -240,6 +248,8 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
 int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp);
 void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu);
 void kvm_pmu_branch_retired(struct kvm_vcpu *vcpu);
+void kvm_mediated_pmu_load(struct kvm_vcpu *vcpu);
+void kvm_mediated_pmu_put(struct kvm_vcpu *vcpu);
 
 bool is_vmware_backdoor_pmc(u32 pmc_idx);
 bool kvm_need_perf_global_ctrl_intercept(struct kvm_vcpu *vcpu);

arch/x86/kvm/svm/pmu.c

Lines changed: 34 additions & 0 deletions
@@ -234,6 +234,32 @@ static bool amd_pmu_is_mediated_pmu_supported(struct x86_pmu_capability *host_pmu)
     return host_pmu->version >= 2;
 }
 
+static void amd_mediated_pmu_load(struct kvm_vcpu *vcpu)
+{
+    struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+    u64 global_status;
+
+    rdmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, global_status);
+    /* Clear host global_status MSR if non-zero. */
+    if (global_status)
+        wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, global_status);
+
+    wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET, pmu->global_status);
+    wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, pmu->global_ctrl);
+}
+
+static void amd_mediated_pmu_put(struct kvm_vcpu *vcpu)
+{
+    struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+    wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0);
+    rdmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, pmu->global_status);
+
+    /* Clear global status bits if non-zero. */
+    if (pmu->global_status)
+        wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, pmu->global_status);
+}
+
 struct kvm_pmu_ops amd_pmu_ops __initdata = {
     .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc,
     .msr_idx_to_pmc = amd_msr_idx_to_pmc,
@@ -245,8 +271,16 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = {
     .init = amd_pmu_init,
 
     .is_mediated_pmu_supported = amd_pmu_is_mediated_pmu_supported,
+    .mediated_load = amd_mediated_pmu_load,
+    .mediated_put = amd_mediated_pmu_put,
 
     .EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT,
     .MAX_NR_GP_COUNTERS = KVM_MAX_NR_AMD_GP_COUNTERS,
     .MIN_NR_GP_COUNTERS = AMD64_NUM_COUNTERS,
+
+    .PERF_GLOBAL_CTRL = MSR_AMD64_PERF_CNTR_GLOBAL_CTL,
+    .GP_EVENTSEL_BASE = MSR_F15H_PERF_CTL0,
+    .GP_COUNTER_BASE = MSR_F15H_PERF_CTR0,
+    .FIXED_COUNTER_BASE = 0,
+    .MSR_STRIDE = 2,
 };
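The base/stride constants encode AMD's v2 register layout, where event-select (CTL) and counter (CTR) MSRs interleave, hence .MSR_STRIDE = 2; and because AMD has no fixed counters, .FIXED_COUNTER_BASE is simply 0. A standalone illustration of the address math performed by gp_eventsel_msr()/gp_counter_msr() (hypothetical userspace C; the MSR constants match the kernel's msr-index.h):

    #include <stdint.h>
    #include <stdio.h>

    #define MSR_F15H_PERF_CTL0 0xc0010200u /* AMD event select 0 */
    #define MSR_F15H_PERF_CTR0 0xc0010201u /* AMD counter 0 */

    static uint32_t msr_for(uint32_t base, uint32_t stride, uint32_t idx)
    {
        return base + idx * stride; /* same math as the pmu.c helpers */
    }

    int main(void)
    {
        /* GP0: CTL 0xc0010200, CTR 0xc0010201; GP1: 0xc0010202/0xc0010203; ... */
        for (uint32_t i = 0; i < 3; i++)
            printf("GP%u: CTL=%#x CTR=%#x\n", i,
                   msr_for(MSR_F15H_PERF_CTL0, 2, i),
                   msr_for(MSR_F15H_PERF_CTR0, 2, i));
        return 0;
    }

Intel, by contrast, uses contiguous MSR ranges starting at MSR_P6_EVNTSEL0 and MSR_IA32_PMC0, so its ops table (below) sets .MSR_STRIDE = 1.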

arch/x86/kvm/svm/svm.c

Lines changed: 3 additions & 0 deletions
@@ -4367,6 +4367,9 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
 
     vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET;
 
+    if (!msr_write_intercepted(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_CTL))
+        rdmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, vcpu_to_pmu(vcpu)->global_ctrl);
+
     trace_kvm_exit(vcpu, KVM_ISA_SVM);
 
     svm_complete_interrupts(vcpu);

arch/x86/kvm/vmx/pmu_intel.c

Lines changed: 44 additions & 0 deletions
@@ -792,6 +792,42 @@ static void intel_pmu_write_global_ctrl(u64 global_ctrl)
     vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, global_ctrl);
 }
 
+
+static void intel_mediated_pmu_load(struct kvm_vcpu *vcpu)
+{
+    struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+    u64 global_status, toggle;
+
+    rdmsrq(MSR_CORE_PERF_GLOBAL_STATUS, global_status);
+    toggle = pmu->global_status ^ global_status;
+    if (global_status & toggle)
+        wrmsrq(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status & toggle);
+    if (pmu->global_status & toggle)
+        wrmsrq(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status & toggle);
+
+    wrmsrq(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl_hw);
+}
+
+static void intel_mediated_pmu_put(struct kvm_vcpu *vcpu)
+{
+    struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+    /* MSR_CORE_PERF_GLOBAL_CTRL is already saved at VM-Exit. */
+    rdmsrq(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status);
+
+    /* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS, if non-zero. */
+    if (pmu->global_status)
+        wrmsrq(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status);
+
+    /*
+     * Clear hardware FIXED_CTR_CTRL MSR to avoid information leakage and
+     * also to avoid accidentally enabling fixed counters (based on guest
+     * state) while running in the host, e.g. when setting global ctrl.
+     */
+    if (pmu->fixed_ctr_ctrl_hw)
+        wrmsrq(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
+}
+
 struct kvm_pmu_ops intel_pmu_ops __initdata = {
     .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc,
     .msr_idx_to_pmc = intel_msr_idx_to_pmc,
@@ -805,9 +841,17 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = {
     .cleanup = intel_pmu_cleanup,
 
     .is_mediated_pmu_supported = intel_pmu_is_mediated_pmu_supported,
+    .mediated_load = intel_mediated_pmu_load,
+    .mediated_put = intel_mediated_pmu_put,
     .write_global_ctrl = intel_pmu_write_global_ctrl,
 
     .EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT,
     .MAX_NR_GP_COUNTERS = KVM_MAX_NR_INTEL_GP_COUNTERS,
     .MIN_NR_GP_COUNTERS = 1,
+
+    .PERF_GLOBAL_CTRL = MSR_CORE_PERF_GLOBAL_CTRL,
+    .GP_EVENTSEL_BASE = MSR_P6_EVNTSEL0,
+    .GP_COUNTER_BASE = MSR_IA32_PMC0,
+    .FIXED_COUNTER_BASE = MSR_CORE_PERF_FIXED_CTR0,
+    .MSR_STRIDE = 1,
 };
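The toggle logic in intel_mediated_pmu_load() reconciles the hardware GLOBAL_STATUS with the guest's saved view, using the newly defined MSR_CORE_PERF_GLOBAL_STATUS_SET: bits pending in hardware but not for the guest are cleared via GLOBAL_OVF_CTRL, and bits the guest expects but hardware lacks are set via GLOBAL_STATUS_SET. A worked example of the bit math (hypothetical values, runnable as userspace C):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t hw_status    = 0x1; /* as read from MSR_CORE_PERF_GLOBAL_STATUS */
        uint64_t guest_status = 0x2; /* pmu->global_status */
        uint64_t toggle = guest_status ^ hw_status; /* 0x3: the bits that disagree */

        /* Pending in hardware, not for the guest -> clear via GLOBAL_OVF_CTRL. */
        printf("clear: %#llx\n", (unsigned long long)(hw_status & toggle));    /* 0x1 */
        /* Pending for the guest, not in hardware -> set via GLOBAL_STATUS_SET. */
        printf("set:   %#llx\n", (unsigned long long)(guest_status & toggle)); /* 0x2 */
        return 0;
    }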

arch/x86/kvm/x86.c

Lines changed: 4 additions & 0 deletions
@@ -11334,6 +11334,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
         run_flags |= KVM_RUN_LOAD_DEBUGCTL;
     vcpu->arch.host_debugctl = debug_ctl;
 
+    kvm_mediated_pmu_load(vcpu);
+
     guest_timing_enter_irqoff();
 
     /*
@@ -11372,6 +11374,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
     kvm_load_host_pkru(vcpu);
 
+    kvm_mediated_pmu_put(vcpu);
+
     /*
      * Do this here before restoring debug registers on the host. And
      * since we do this before handling the vmexit, a DR access vmexit
