
Commit 1a14928

Merge tag 'kvm-x86-misc-6.17' of https://github.com/kvm-x86/linux into HEAD
KVM x86 misc changes for 6.17

 - Preserve the host's DEBUGCTL.FREEZE_IN_SMM (Intel only) when running the
   guest.  Failure to honor FREEZE_IN_SMM can bleed host state into the guest.

 - Explicitly check vmcs12.GUEST_DEBUGCTL on nested VM-Enter (Intel only) to
   prevent L1 from running L2 with features that KVM doesn't support, e.g. BTF.

 - Intercept SPEC_CTRL on AMD if the MSR shouldn't exist according to the
   vCPU's CPUID model.

 - Rework the MSR interception code so that the SVM and VMX APIs are more or
   less identical.

 - Recalculate all MSR intercepts from the "source" on MSR filter changes, and
   drop the dedicated "shadow" bitmaps (and their awful "max" size defines).

 - WARN and reject loading kvm-amd.ko instead of panicking the kernel if the
   nested SVM MSRPM offsets tracker can't handle an MSR.

 - Advertise support for LKGS (Load Kernel GS base), a new instruction that's
   loosely related to FRED, but is supported and enumerated independently.

 - Fix a user-triggerable WARN that syzkaller found by stuffing INIT_RECEIVED,
   a.k.a. WFS, and then putting the vCPU into VMX Root Mode (post-VMXON).  Use
   the same approach KVM uses for dealing with "impossible" emulation when
   running a !URG guest, and simply wait until KVM_RUN to detect that the vCPU
   has architecturally impossible state.

 - Add KVM_X86_DISABLE_EXITS_APERFMPERF to allow disabling interception of
   APERF/MPERF reads, so that a "properly" configured VM can "virtualize"
   APERF/MPERF (with many caveats).

 - Reject KVM_SET_TSC_KHZ if vCPUs have been created, as changing the "default"
   frequency is unsupported for VMs with a "secure" TSC, and there's no known
   use case for changing the default frequency for other VM types.
2 parents 9de1395 + dcbe5a4 commit 1a14928

30 files changed

Lines changed: 928 additions & 748 deletions


Documentation/virt/kvm/api.rst

Lines changed: 24 additions & 1 deletion
@@ -2006,7 +2006,7 @@ frequency is KHz.
 
 If the KVM_CAP_VM_TSC_CONTROL capability is advertised, this can also
 be used as a vm ioctl to set the initial tsc frequency of subsequently
-created vCPUs.
+created vCPUs. Note, the vm ioctl is only allowed prior to creating vCPUs.
 
 For TSC protected Confidential Computing (CoCo) VMs where TSC frequency
 is configured once at VM scope and remains unchanged during VM's
@@ -7851,6 +7851,7 @@ Valid bits in args[0] are::
   #define KVM_X86_DISABLE_EXITS_HLT        (1 << 1)
   #define KVM_X86_DISABLE_EXITS_PAUSE      (1 << 2)
   #define KVM_X86_DISABLE_EXITS_CSTATE     (1 << 3)
+  #define KVM_X86_DISABLE_EXITS_APERFMPERF (1 << 4)
 
 Enabling this capability on a VM provides userspace with a way to no
 longer intercept some instructions for improved latency in some
@@ -7861,6 +7862,28 @@ all such vmexits.
 
 Do not enable KVM_FEATURE_PV_UNHALT if you disable HLT exits.
 
+Virtualizing the ``IA32_APERF`` and ``IA32_MPERF`` MSRs requires more
+than just disabling APERF/MPERF exits. While both Intel and AMD
+document strict usage conditions for these MSRs--emphasizing that only
+the ratio of their deltas over a time interval (T0 to T1) is
+architecturally defined--simply passing through the MSRs can still
+produce an incorrect ratio.
+
+This erroneous ratio can occur if, between T0 and T1:
+
+1. The vCPU thread migrates between logical processors.
+2. Live migration or suspend/resume operations take place.
+3. Another task shares the vCPU's logical processor.
+4. C-states lower than C0 are emulated (e.g., via HLT interception).
+5. The guest TSC frequency doesn't match the host TSC frequency.
+
+Due to these complexities, KVM does not automatically associate this
+passthrough capability with the guest CPUID bit,
+``CPUID.6:ECX.APERFMPERF[bit 0]``. Userspace VMMs that deem this
+mechanism adequate for virtualizing the ``IA32_APERF`` and
+``IA32_MPERF`` MSRs must set the guest CPUID bit explicitly.
+
 7.14 KVM_CAP_S390_HPAGE_1M
 --------------------------
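
The documented recipe is thus two userspace steps: disable the exits at VM
scope, then advertise the CPUID bit yourself. A minimal C sketch, assuming
vm_fd/vcpu_fd come from KVM_CREATE_VM/KVM_CREATE_VCPU, the cpuid array was
populated via KVM_GET_SUPPORTED_CPUID, and the kernel headers are new enough
to define KVM_X86_DISABLE_EXITS_APERFMPERF; the helper name is illustrative
and error handling is trimmed:

#include <linux/kvm.h>
#include <sys/ioctl.h>

/*
 * Disable APERF/MPERF read exits at VM scope, then explicitly set
 * CPUID.6:ECX.APERFMPERF[bit 0]; KVM will not set the bit automatically.
 */
static int virtualize_aperfmperf(int vm_fd, int vcpu_fd, struct kvm_cpuid2 *cpuid)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_X86_DISABLE_EXITS,
		.args = { KVM_X86_DISABLE_EXITS_APERFMPERF },
	};
	__u32 i;

	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap))
		return -1;

	for (i = 0; i < cpuid->nent; i++) {
		if (cpuid->entries[i].function == 6)
			cpuid->entries[i].ecx |= 1;	/* APERFMPERF */
	}

	return ioctl(vcpu_fd, KVM_SET_CPUID2, cpuid);
}

Whether passthrough is "adequate" is the VMM's call, per the caveat list above.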

arch/x86/include/asm/kvm-x86-ops.h

Lines changed: 1 addition & 2 deletions
@@ -49,7 +49,6 @@ KVM_X86_OP(set_idt)
 KVM_X86_OP(get_gdt)
 KVM_X86_OP(set_gdt)
 KVM_X86_OP(sync_dirty_debug_regs)
-KVM_X86_OP(set_dr6)
 KVM_X86_OP(set_dr7)
 KVM_X86_OP(cache_reg)
 KVM_X86_OP(get_rflags)
@@ -139,7 +138,7 @@ KVM_X86_OP(check_emulate_instruction)
 KVM_X86_OP(apic_init_signal_blocked)
 KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush)
 KVM_X86_OP_OPTIONAL(migrate_timers)
-KVM_X86_OP(msr_filter_changed)
+KVM_X86_OP(recalc_msr_intercepts)
 KVM_X86_OP(complete_emulated_msr)
 KVM_X86_OP(vcpu_deliver_sipi_vector)
 KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);

arch/x86/include/asm/kvm_host.h

Lines changed: 15 additions & 7 deletions
@@ -1408,10 +1408,7 @@ struct kvm_arch {
 
 	gpa_t wall_clock;
 
-	bool mwait_in_guest;
-	bool hlt_in_guest;
-	bool pause_in_guest;
-	bool cstate_in_guest;
+	u64 disabled_exits;
 
 	s64 kvmclock_offset;
 
@@ -1687,6 +1684,12 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
 	return dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL;
 }
 
+enum kvm_x86_run_flags {
+	KVM_RUN_FORCE_IMMEDIATE_EXIT	= BIT(0),
+	KVM_RUN_LOAD_GUEST_DR6		= BIT(1),
+	KVM_RUN_LOAD_DEBUGCTL		= BIT(2),
+};
+
 struct kvm_x86_ops {
 	const char *name;
 
@@ -1715,6 +1718,12 @@ struct kvm_x86_ops {
 	void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
 	void (*vcpu_put)(struct kvm_vcpu *vcpu);
 
+	/*
+	 * Mask of DEBUGCTL bits that are owned by the host, i.e. that need to
+	 * match the host's value even while the guest is active.
+	 */
+	const u64 HOST_OWNED_DEBUGCTL;
+
 	void (*update_exception_bitmap)(struct kvm_vcpu *vcpu);
 	int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
 	int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
@@ -1737,7 +1746,6 @@ struct kvm_x86_ops {
 	void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
 	void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
 	void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu);
-	void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value);
 	void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
 	void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
 	unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
@@ -1768,7 +1776,7 @@ struct kvm_x86_ops {
 
 	int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
 	enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu,
-						  bool force_immediate_exit);
+						  u64 run_flags);
 	int (*handle_exit)(struct kvm_vcpu *vcpu,
 			   enum exit_fastpath_completion exit_fastpath);
 	int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
@@ -1900,7 +1908,7 @@ struct kvm_x86_ops {
 	int (*enable_l2_tlb_flush)(struct kvm_vcpu *vcpu);
 
 	void (*migrate_timers)(struct kvm_vcpu *vcpu);
-	void (*msr_filter_changed)(struct kvm_vcpu *vcpu);
+	void (*recalc_msr_intercepts)(struct kvm_vcpu *vcpu);
 	int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err);
 
 	void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector);
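
For context on the vcpu_run() signature change above: the lone
force_immediate_exit bool becomes a bitmask, which is also why the set_dr6
hook can be dropped; a DR6 load is now requested via a run flag and handled
by the vendor code inside vcpu_run(). A hedged sketch of how common x86 code
might assemble the mask; the boolean parameters stand in for the real
conditions checked in vcpu_enter_guest(), which this diff doesn't show:

/*
 * Illustrative only: fold the per-entry requests into the u64 run_flags
 * that is handed to the vendor's vcpu_run() implementation.
 */
static u64 kvm_assemble_run_flags(bool force_immediate_exit,
				  bool load_guest_dr6, bool load_debugctl)
{
	u64 run_flags = 0;

	if (force_immediate_exit)
		run_flags |= KVM_RUN_FORCE_IMMEDIATE_EXIT;
	if (load_guest_dr6)
		run_flags |= KVM_RUN_LOAD_GUEST_DR6;
	if (load_debugctl)
		run_flags |= KVM_RUN_LOAD_DEBUGCTL;

	return run_flags;
}

The vendor implementation then consumes the mask once per entry, e.g. writing
DR6 to hardware only when KVM_RUN_LOAD_GUEST_DR6 is set.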

arch/x86/include/asm/msr-index.h

Lines changed: 1 addition & 0 deletions
@@ -419,6 +419,7 @@
 #define DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI	(1UL << 12)
 #define DEBUGCTLMSR_FREEZE_IN_SMM_BIT		14
 #define DEBUGCTLMSR_FREEZE_IN_SMM		(1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT)
+#define DEBUGCTLMSR_RTM_DEBUG			BIT(15)
 
 #define MSR_PEBS_FRONTEND	0x000003f7
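
FREEZE_IN_SMM is the motivating case for the HOST_OWNED_DEBUGCTL field added
to kvm_x86_ops above: host-owned bits must track the host's value even while
the guest's DEBUGCTL is loaded. A minimal sketch of the masking; the helper
name is illustrative and the actual upstream plumbing may differ:

/* Keep host-owned DEBUGCTL bits in sync with the host while in the guest. */
static inline u64 kvm_mask_host_owned_debugctl(u64 guest_debugctl,
					       u64 host_debugctl,
					       u64 host_owned)
{
	return (guest_debugctl & ~host_owned) | (host_debugctl & host_owned);
}

On Intel, the result would be written to the GUEST_IA32_DEBUGCTL VMCS field,
with host_owned equal to DEBUGCTLMSR_FREEZE_IN_SMM.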

arch/x86/kvm/cpuid.c

Lines changed: 1 addition & 0 deletions
@@ -979,6 +979,7 @@ void kvm_set_cpu_caps(void)
 		F(FSRS),
 		F(FSRC),
 		F(WRMSRNS),
+		X86_64_F(LKGS),
 		F(AMX_FP16),
 		F(AVX_IFMA),
 		F(LAM),
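
LKGS is advertised via X86_64_F() rather than F() because loading the kernel
GS base is only meaningful on 64-bit kernels. A plausible reconstruction of
the scoping macro; the real definition lives elsewhere in
arch/x86/kvm/cpuid.c and may differ:

/* Assumed: advertise the feature only on CONFIG_X86_64 builds. */
#ifdef CONFIG_X86_64
#define X86_64_F(name)	F(name)
#else
#define X86_64_F(name)	0
#endif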

arch/x86/kvm/lapic.h

Lines changed: 2 additions & 0 deletions
@@ -21,6 +21,8 @@
 #define APIC_BROADCAST			0xFF
 #define X2APIC_BROADCAST		0xFFFFFFFFul
 
+#define X2APIC_MSR(r)	(APIC_BASE_MSR + ((r) >> 4))
+
 enum lapic_mode {
 	LAPIC_MODE_DISABLED = 0,
 	LAPIC_MODE_INVALID = X2APIC_ENABLE,
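
The new X2APIC_MSR() helper maps a legacy xAPIC MMIO register offset to its
x2APIC MSR index: APIC_BASE_MSR is 0x800, and each 16-byte xAPIC register
slot collapses to a single MSR. A few worked examples using the standard
register offsets (illustrative usage, not part of the diff):

X2APIC_MSR(APIC_ID);       /* 0x800 + (0x020 >> 4) == 0x802 (x2APIC ID) */
X2APIC_MSR(APIC_TASKPRI);  /* 0x800 + (0x080 >> 4) == 0x808 (TPR)       */
X2APIC_MSR(APIC_ICR);      /* 0x800 + (0x300 >> 4) == 0x830 (ICR)       */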

arch/x86/kvm/svm/nested.c

Lines changed: 96 additions & 32 deletions
@@ -184,13 +184,88 @@ void recalc_intercepts(struct vcpu_svm *svm)
 	}
 }
 
+/*
+ * This array (and its actual size) holds the set of offsets (indexing by chunk
+ * size) to process when merging vmcb12's MSRPM with vmcb01's MSRPM. Note, the
+ * set of MSRs for which interception is disabled in vmcb01 is per-vCPU, e.g.
+ * based on CPUID features. This array only tracks MSRs that *might* be passed
+ * through to the guest.
+ *
+ * Hardcode the capacity of the array based on the maximum number of _offsets_.
+ * MSRs are batched together, so there are fewer offsets than MSRs.
+ */
+static int nested_svm_msrpm_merge_offsets[7] __ro_after_init;
+static int nested_svm_nr_msrpm_merge_offsets __ro_after_init;
+typedef unsigned long nsvm_msrpm_merge_t;
+
+int __init nested_svm_init_msrpm_merge_offsets(void)
+{
+	static const u32 merge_msrs[] __initconst = {
+		MSR_STAR,
+		MSR_IA32_SYSENTER_CS,
+		MSR_IA32_SYSENTER_EIP,
+		MSR_IA32_SYSENTER_ESP,
+#ifdef CONFIG_X86_64
+		MSR_GS_BASE,
+		MSR_FS_BASE,
+		MSR_KERNEL_GS_BASE,
+		MSR_LSTAR,
+		MSR_CSTAR,
+		MSR_SYSCALL_MASK,
+#endif
+		MSR_IA32_SPEC_CTRL,
+		MSR_IA32_PRED_CMD,
+		MSR_IA32_FLUSH_CMD,
+		MSR_IA32_APERF,
+		MSR_IA32_MPERF,
+		MSR_IA32_LASTBRANCHFROMIP,
+		MSR_IA32_LASTBRANCHTOIP,
+		MSR_IA32_LASTINTFROMIP,
+		MSR_IA32_LASTINTTOIP,
+	};
+	int i, j;
+
+	for (i = 0; i < ARRAY_SIZE(merge_msrs); i++) {
+		int bit_nr = svm_msrpm_bit_nr(merge_msrs[i]);
+		u32 offset;
+
+		if (WARN_ON(bit_nr < 0))
+			return -EIO;
+
+		/*
+		 * Merging is done in chunks to reduce the number of accesses
+		 * to L1's bitmap.
+		 */
+		offset = bit_nr / BITS_PER_BYTE / sizeof(nsvm_msrpm_merge_t);
+
+		for (j = 0; j < nested_svm_nr_msrpm_merge_offsets; j++) {
+			if (nested_svm_msrpm_merge_offsets[j] == offset)
+				break;
+		}
+
+		if (j < nested_svm_nr_msrpm_merge_offsets)
+			continue;
+
+		if (WARN_ON(j >= ARRAY_SIZE(nested_svm_msrpm_merge_offsets)))
+			return -EIO;
+
+		nested_svm_msrpm_merge_offsets[j] = offset;
+		nested_svm_nr_msrpm_merge_offsets++;
+	}
+
+	return 0;
+}
+
 /*
  * Merge L0's (KVM) and L1's (Nested VMCB) MSR permission bitmaps. The function
  * is optimized in that it only merges the parts where KVM MSR permission bitmap
  * may contain zero bits.
  */
-static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
+static bool nested_svm_merge_msrpm(struct kvm_vcpu *vcpu)
 {
+	struct vcpu_svm *svm = to_svm(vcpu);
+	nsvm_msrpm_merge_t *msrpm02 = svm->nested.msrpm;
+	nsvm_msrpm_merge_t *msrpm01 = svm->msrpm;
 	int i;
 
 	/*
@@ -205,7 +280,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
 	if (!svm->nested.force_msr_bitmap_recalc) {
 		struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments;
 
-		if (kvm_hv_hypercall_enabled(&svm->vcpu) &&
+		if (kvm_hv_hypercall_enabled(vcpu) &&
 		    hve->hv_enlightenments_control.msr_bitmap &&
 		    (svm->nested.ctl.clean & BIT(HV_VMCB_NESTED_ENLIGHTENMENTS)))
 			goto set_msrpm_base_pa;
@@ -215,25 +290,17 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
 	if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
 		return true;
 
-	for (i = 0; i < MSRPM_OFFSETS; i++) {
-		u32 value, p;
-		u64 offset;
+	for (i = 0; i < nested_svm_nr_msrpm_merge_offsets; i++) {
+		const int p = nested_svm_msrpm_merge_offsets[i];
+		nsvm_msrpm_merge_t l1_val;
+		gpa_t gpa;
 
-		if (msrpm_offsets[i] == 0xffffffff)
-			break;
+		gpa = svm->nested.ctl.msrpm_base_pa + (p * sizeof(l1_val));
 
-		p = msrpm_offsets[i];
-
-		/* x2apic msrs are intercepted always for the nested guest */
-		if (is_x2apic_msrpm_offset(p))
-			continue;
-
-		offset = svm->nested.ctl.msrpm_base_pa + (p * 4);
-
-		if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
+		if (kvm_vcpu_read_guest(vcpu, gpa, &l1_val, sizeof(l1_val)))
 			return false;
 
-		svm->nested.msrpm[p] = svm->msrpm[p] | value;
+		msrpm02[p] = msrpm01[p] | l1_val;
 	}
 
 	svm->nested.force_msr_bitmap_recalc = false;
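
Both the offset initialization and the merge loop above lean on
svm_msrpm_bit_nr(), which this diff doesn't show. Below is a hedged
reconstruction based on the AMD APM's MSRPM layout: an 8KB bitmap in which
each of three MSR ranges gets 2KB, with two bits per MSR (read intercept,
then write intercept). The real helper may differ in form:

#define SVM_MSRPM_BYTES_PER_RANGE	2048
#define SVM_MSRPM_BITS_PER_RANGE	(SVM_MSRPM_BYTES_PER_RANGE * BITS_PER_BYTE)

static inline int svm_msrpm_bit_nr(u32 msr)
{
	int range_nr;

	switch (msr & ~0x1fff) {
	case 0x00000000:	/* MSRs 0x0 - 0x1fff */
		range_nr = 0;
		break;
	case 0xc0000000:	/* MSRs 0xc0000000 - 0xc0001fff */
		range_nr = 1;
		break;
	case 0xc0010000:	/* MSRs 0xc0010000 - 0xc0011fff */
		range_nr = 2;
		break;
	default:		/* MSR not covered by the MSRPM */
		return -EINVAL;
	}

	/* Two intercept bits (read, write) per MSR within the range. */
	return range_nr * SVM_MSRPM_BITS_PER_RANGE + (msr & 0x1fff) * 2;
}
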
@@ -937,7 +1004,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
 	if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, true))
 		goto out_exit_err;
 
-	if (nested_svm_vmrun_msrpm(svm))
+	if (nested_svm_merge_msrpm(vcpu))
 		goto out;
 
 out_exit_err:
@@ -1230,7 +1297,6 @@ int svm_allocate_nested(struct vcpu_svm *svm)
 	svm->nested.msrpm = svm_vcpu_alloc_msrpm();
 	if (!svm->nested.msrpm)
 		goto err_free_vmcb02;
-	svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm);
 
 	svm->nested.initialized = true;
 	return 0;
@@ -1290,26 +1356,26 @@ void svm_leave_nested(struct kvm_vcpu *vcpu)
 
 static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 {
-	u32 offset, msr, value;
-	int write, mask;
+	gpa_t base = svm->nested.ctl.msrpm_base_pa;
+	int write, bit_nr;
+	u8 value, mask;
+	u32 msr;
 
 	if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
 		return NESTED_EXIT_HOST;
 
 	msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
-	offset = svm_msrpm_offset(msr);
+	bit_nr = svm_msrpm_bit_nr(msr);
 	write = svm->vmcb->control.exit_info_1 & 1;
-	mask = 1 << ((2 * (msr & 0xf)) + write);
 
-	if (offset == MSR_INVALID)
+	if (bit_nr < 0)
 		return NESTED_EXIT_DONE;
 
-	/* Offset is in 32 bit units but need in 8 bit units */
-	offset *= 4;
-
-	if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.ctl.msrpm_base_pa + offset, &value, 4))
+	if (kvm_vcpu_read_guest(&svm->vcpu, base + bit_nr / BITS_PER_BYTE,
+				&value, sizeof(value)))
 		return NESTED_EXIT_DONE;
 
+	mask = BIT(write) << (bit_nr & (BITS_PER_BYTE - 1));
 	return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
 }
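
To make the new bit arithmetic concrete, consider L2 reading MSR_STAR
(0xc0000081) under the layout sketched above:

/*
 * bit_nr = 1 * 16384 + 0x81 * 2 = 16642   (second MSRPM range)
 * byte   = 16642 / 8  = 2080              (offset read from L1's bitmap)
 * write  = 0                              (exit_info_1 bit 0 clear: a read)
 * mask   = BIT(0) << (16642 % 8) = 0x04
 *
 * If bit 2 of byte 2080 is set in vmcb12's MSRPM, L1 wants the intercept
 * (NESTED_EXIT_DONE); otherwise the exit is handled by KVM as L0
 * (NESTED_EXIT_HOST).
 */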

@@ -1819,13 +1885,11 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
 
 static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
 {
-	struct vcpu_svm *svm = to_svm(vcpu);
-
 	if (WARN_ON(!is_guest_mode(vcpu)))
 		return true;
 
 	if (!vcpu->arch.pdptrs_from_userspace &&
-	    !nested_npt_enabled(svm) && is_pae_paging(vcpu))
+	    !nested_npt_enabled(to_svm(vcpu)) && is_pae_paging(vcpu))
 		/*
 		 * Reload the guest's PDPTRs since after a migration
 		 * the guest CR3 might be restored prior to setting the nested
@@ -1834,7 +1898,7 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
 		if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
 			return false;
 
-	if (!nested_svm_vmrun_msrpm(svm)) {
+	if (!nested_svm_merge_msrpm(vcpu)) {
 		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 		vcpu->run->internal.suberror =
 			KVM_INTERNAL_ERROR_EMULATION;