Skip to content

Commit 50ac57c

Browse files
committed
Merge tag 'x86_tdx_for_6.18-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 TDX updates from Dave Hansen:
 "The biggest change here is making TDX and kexec play nicely together.
  Before this, the memory encryption hardware (which doesn't respect
  cache coherency) could write back old cachelines on top of data in
  the new kernel, so kexec and TDX were made mutually exclusive. This
  removes the limitation.

  There is also some work to tighten up a hardware bug workaround and
  some MAINTAINERS updates.

   - Make TDX and kexec work together

   - Skip TDX bug workaround when the bug is not present

   - Update maintainers entries"

* tag 'x86_tdx_for_6.18-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/virt/tdx: Use precalculated TDVPR page physical address
  KVM/TDX: Explicitly do WBINVD when no more TDX SEAMCALLs
  x86/virt/tdx: Update the kexec section in the TDX documentation
  x86/virt/tdx: Remove the !KEXEC_CORE dependency
  x86/kexec: Disable kexec/kdump on platforms with TDX partial write erratum
  x86/virt/tdx: Mark memory cache state incoherent when making SEAMCALL
  x86/sme: Use percpu boolean to control WBINVD during kexec
  x86/kexec: Consolidate relocate_kernel() function parameters
  x86/tdx: Skip clearing reclaimed pages unless X86_BUG_TDX_PW_MCE is present
  x86/tdx: Tidy reset_pamt functions
  x86/tdx: Eliminate duplicate code in tdx_clear_page()
  MAINTAINERS: Add KVM mail list to the TDX entry
  MAINTAINERS: Add Rick Edgecombe as a TDX reviewer
  MAINTAINERS: Update the file list in the TDX entry.
2 parents 5b7ce93 + e414b10 commit 50ac57c

12 files changed

Lines changed: 214 additions & 106 deletions

File tree

Documentation/arch/x86/tdx.rst

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -142,13 +142,6 @@ but depends on the BIOS to behave correctly.
142142
Note TDX works with CPU logical online/offline, thus the kernel still
143143
allows a logical CPU to be taken offline and brought online again.
144144

145-
Kexec()
146-
~~~~~~~
147-
148-
TDX host support currently lacks the ability to handle kexec. For
149-
simplicity only one of them can be enabled in the Kconfig. This will be
150-
fixed in the future.
151-
152145
Erratum
153146
~~~~~~~
154147

@@ -171,6 +164,13 @@ If the platform has such erratum, the kernel prints additional message in
171164
machine check handler to tell user the machine check may be caused by
172165
kernel bug on TDX private memory.
173166

167+
Kexec
168+
~~~~~~~
169+
170+
Currently kexec doesn't work on the TDX platforms with the aforementioned
171+
erratum. It fails when loading the kexec kernel image. Otherwise it
172+
works normally.
173+
174174
Interaction vs S3 and deeper states
175175
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
176176

MAINTAINERS

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27723,17 +27723,14 @@ F: arch/x86/kernel/unwind_*.c
2772327723
X86 TRUST DOMAIN EXTENSIONS (TDX)
2772427724
M: Kirill A. Shutemov <kas@kernel.org>
2772527725
R: Dave Hansen <dave.hansen@linux.intel.com>
27726+
R: Rick Edgecombe <rick.p.edgecombe@intel.com>
2772627727
L: x86@kernel.org
2772727728
L: linux-coco@lists.linux.dev
27729+
L: kvm@vger.kernel.org
2772827730
S: Supported
2772927731
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/tdx
27730-
F: Documentation/ABI/testing/sysfs-devices-virtual-misc-tdx_guest
27731-
F: arch/x86/boot/compressed/tdx*
27732-
F: arch/x86/coco/tdx/
27733-
F: arch/x86/include/asm/shared/tdx.h
27734-
F: arch/x86/include/asm/tdx.h
27735-
F: arch/x86/virt/vmx/tdx/
27736-
F: drivers/virt/coco/tdx-guest
27732+
N: tdx
27733+
K: \b(tdx)
2773727734

2773827735
X86 VDSO
2773927736
M: Andy Lutomirski <luto@kernel.org>

arch/x86/Kconfig

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1902,7 +1902,6 @@ config INTEL_TDX_HOST
19021902
depends on X86_X2APIC
19031903
select ARCH_KEEP_MEMBLOCK
19041904
depends on CONTIG_ALLOC
1905-
depends on !KEXEC_CORE
19061905
depends on X86_MCE
19071906
help
19081907
Intel Trust Domain Extensions (TDX) protects guest VMs from malicious

arch/x86/include/asm/kexec.h

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,15 @@
1313
# define KEXEC_DEBUG_EXC_HANDLER_SIZE 6 /* PUSHI, PUSHI, 2-byte JMP */
1414
#endif
1515

16+
#ifdef CONFIG_X86_64
17+
18+
#include <linux/bits.h>
19+
20+
#define RELOC_KERNEL_PRESERVE_CONTEXT BIT(0)
21+
#define RELOC_KERNEL_CACHE_INCOHERENT BIT(1)
22+
23+
#endif
24+
1625
# define KEXEC_CONTROL_PAGE_SIZE 4096
1726
# define KEXEC_CONTROL_CODE_MAX_SIZE 2048
1827

@@ -121,8 +130,7 @@ typedef unsigned long
121130
relocate_kernel_fn(unsigned long indirection_page,
122131
unsigned long pa_control_page,
123132
unsigned long start_address,
124-
unsigned int preserve_context,
125-
unsigned int host_mem_enc_active);
133+
unsigned int flags);
126134
#endif
127135
extern relocate_kernel_fn relocate_kernel;
128136
#define ARCH_HAS_KIMAGE_ARCH

arch/x86/include/asm/processor.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -731,6 +731,8 @@ void __noreturn stop_this_cpu(void *dummy);
731731
void microcode_check(struct cpuinfo_x86 *prev_info);
732732
void store_cpu_caps(struct cpuinfo_x86 *info);
733733

734+
DECLARE_PER_CPU(bool, cache_state_incoherent);
735+
734736
enum l1tf_mitigations {
735737
L1TF_MITIGATION_OFF,
736738
L1TF_MITIGATION_AUTO,

arch/x86/include/asm/tdx.h

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,18 +102,41 @@ u64 __seamcall_ret(u64 fn, struct tdx_module_args *args);
102102
u64 __seamcall_saved_ret(u64 fn, struct tdx_module_args *args);
103103
void tdx_init(void);
104104

105+
#include <linux/preempt.h>
105106
#include <asm/archrandom.h>
107+
#include <asm/processor.h>
106108

107109
typedef u64 (*sc_func_t)(u64 fn, struct tdx_module_args *args);
108110

111+
static __always_inline u64 __seamcall_dirty_cache(sc_func_t func, u64 fn,
112+
struct tdx_module_args *args)
113+
{
114+
lockdep_assert_preemption_disabled();
115+
116+
/*
117+
* SEAMCALLs are made to the TDX module and can generate dirty
118+
* cachelines of TDX private memory. Mark cache state incoherent
119+
* so that the cache can be flushed during kexec.
120+
*
121+
* This needs to be done before actually making the SEAMCALL,
122+
* because the kexec-ing CPU could send an NMI to stop remote CPUs,
123+
* in which case even disabling IRQs won't help here.
124+
*/
125+
this_cpu_write(cache_state_incoherent, true);
126+
127+
return func(fn, args);
128+
}
129+
109130
static __always_inline u64 sc_retry(sc_func_t func, u64 fn,
110131
struct tdx_module_args *args)
111132
{
112133
int retry = RDRAND_RETRY_LOOPS;
113134
u64 ret;
114135

115136
do {
116-
ret = func(fn, args);
137+
preempt_disable();
138+
ret = __seamcall_dirty_cache(func, fn, args);
139+
preempt_enable();
117140
} while (ret == TDX_RND_NO_ENTROPY && --retry);
118141

119142
return ret;
@@ -131,6 +154,8 @@ int tdx_guest_keyid_alloc(void);
131154
u32 tdx_get_nr_guest_keyids(void);
132155
void tdx_guest_keyid_free(unsigned int keyid);
133156

157+
void tdx_quirk_reset_page(struct page *page);
158+
134159
struct tdx_td {
135160
/* TD root structure: */
136161
struct page *tdr_page;
@@ -146,6 +171,8 @@ struct tdx_td {
146171
struct tdx_vp {
147172
/* TDVP root page */
148173
struct page *tdvpr_page;
174+
/* precalculated page_to_phys(tdvpr_page) for use in noinstr code */
175+
phys_addr_t tdvpr_pa;
149176

150177
/* TD vCPU control structure: */
151178
struct page **tdcx_pages;
@@ -203,5 +230,11 @@ static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; }
203230
static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; }
204231
#endif /* CONFIG_INTEL_TDX_HOST */
205232

233+
#ifdef CONFIG_KEXEC_CORE
234+
void tdx_cpu_flush_cache_for_kexec(void);
235+
#else
236+
static inline void tdx_cpu_flush_cache_for_kexec(void) { }
237+
#endif
238+
206239
#endif /* !__ASSEMBLER__ */
207240
#endif /* _ASM_X86_TDX_H */

arch/x86/kernel/cpu/amd.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -545,6 +545,23 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
545545
{
546546
u64 msr;
547547

548+
/*
549+
* Mark that WBINVD is needed during kexec on processors that
550+
* support SME. This provides support for performing a successful
551+
* kexec when going from SME inactive to SME active (or vice-versa).
552+
*
553+
* The cache must be cleared so that if there are entries with the
554+
* same physical address, both with and without the encryption bit,
555+
* they don't race each other when flushed and potentially end up
556+
* with the wrong entry being committed to memory.
557+
*
558+
* Test the CPUID bit directly because with mem_encrypt=off the
559+
* BSP will clear the X86_FEATURE_SME bit and the APs will not
560+
* see it set after that.
561+
*/
562+
if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0)))
563+
__this_cpu_write(cache_state_incoherent, true);
564+
548565
/*
549566
* BIOS support is required for SME and SEV.
550567
* For SME: If BIOS has enabled SME then adjust x86_phys_bits by

arch/x86/kernel/machine_kexec_64.c

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include <asm/set_memory.h>
3030
#include <asm/cpu.h>
3131
#include <asm/efi.h>
32+
#include <asm/processor.h>
3233

3334
#ifdef CONFIG_ACPI
3435
/*
@@ -346,6 +347,22 @@ int machine_kexec_prepare(struct kimage *image)
346347
unsigned long reloc_end = (unsigned long)__relocate_kernel_end;
347348
int result;
348349

350+
/*
351+
* Some early TDX-capable platforms have an erratum. A kernel
352+
* partial write (a write transaction of less than a cacheline that
353+
* lands at the memory controller) to TDX private memory poisons that
354+
* memory, and a subsequent read triggers a machine check.
355+
*
356+
* On those platforms the old kernel must reset TDX private
357+
* memory before jumping to the new kernel, otherwise the new
358+
* kernel may see an unexpected machine check. For simplicity
359+
* just fail kexec/kdump on those platforms.
360+
*/
361+
if (boot_cpu_has_bug(X86_BUG_TDX_PW_MCE)) {
362+
pr_info_once("Not allowed on platform with tdx_pw_mce bug\n");
363+
return -EOPNOTSUPP;
364+
}
365+
349366
/* Setup the identity mapped 64bit page table */
350367
result = init_pgtable(image, __pa(control_page));
351368
if (result)
@@ -384,16 +401,10 @@ void __nocfi machine_kexec(struct kimage *image)
384401
{
385402
unsigned long reloc_start = (unsigned long)__relocate_kernel_start;
386403
relocate_kernel_fn *relocate_kernel_ptr;
387-
unsigned int host_mem_enc_active;
404+
unsigned int relocate_kernel_flags;
388405
int save_ftrace_enabled;
389406
void *control_page;
390407

391-
/*
392-
* This must be done before load_segments() since if call depth tracking
393-
* is used then GS must be valid to make any function calls.
394-
*/
395-
host_mem_enc_active = cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT);
396-
397408
#ifdef CONFIG_KEXEC_JUMP
398409
if (image->preserve_context)
399410
save_processor_state();
@@ -427,6 +438,17 @@ void __nocfi machine_kexec(struct kimage *image)
427438
*/
428439
relocate_kernel_ptr = control_page + (unsigned long)relocate_kernel - reloc_start;
429440

441+
relocate_kernel_flags = 0;
442+
if (image->preserve_context)
443+
relocate_kernel_flags |= RELOC_KERNEL_PRESERVE_CONTEXT;
444+
445+
/*
446+
* This must be done before load_segments() since it resets
447+
* GS to 0 and percpu data needs the correct GS to work.
448+
*/
449+
if (this_cpu_read(cache_state_incoherent))
450+
relocate_kernel_flags |= RELOC_KERNEL_CACHE_INCOHERENT;
451+
430452
/*
431453
* The segment registers are funny things, they have both a
432454
* visible and an invisible part. Whenever the visible part is
@@ -436,15 +458,19 @@ void __nocfi machine_kexec(struct kimage *image)
436458
*
437459
* Take advantage of this here by force loading the segments,
438460
* before the GDT is zapped with an invalid value.
461+
*
462+
* load_segments() resets GS to 0. Don't make any function calls
463+
* after this point since call depth tracking uses percpu variables to
464+
* operate (relocate_kernel() is explicitly ignored by call depth
465+
* tracking).
439466
*/
440467
load_segments();
441468

442469
/* now call it */
443470
image->start = relocate_kernel_ptr((unsigned long)image->head,
444471
virt_to_phys(control_page),
445472
image->start,
446-
image->preserve_context,
447-
host_mem_enc_active);
473+
relocate_kernel_flags);
448474

449475
#ifdef CONFIG_KEXEC_JUMP
450476
if (image->preserve_context)

arch/x86/kernel/process.c

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,16 @@ EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);
8888
DEFINE_PER_CPU(bool, __tss_limit_invalid);
8989
EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
9090

91+
/*
92+
* The cache may be in an incoherent state and needs flushing during kexec.
93+
* E.g., on SME/TDX platforms, dirty cacheline aliases with and without
94+
* encryption bit(s) can coexist and the cache needs to be flushed before
95+
* booting into the new kernel to avoid silent memory corruption due to
96+
* dirty cachelines with different encryption properties being written back
97+
* to memory.
98+
*/
99+
DEFINE_PER_CPU(bool, cache_state_incoherent);
100+
91101
/*
92102
* this gets called so that we can store lazy state into memory and copy the
93103
* current task into the new thread.
@@ -827,19 +837,7 @@ void __noreturn stop_this_cpu(void *dummy)
827837
disable_local_APIC();
828838
mcheck_cpu_clear(c);
829839

830-
/*
831-
* Use wbinvd on processors that support SME. This provides support
832-
* for performing a successful kexec when going from SME inactive
833-
* to SME active (or vice-versa). The cache must be cleared so that
834-
* if there are entries with the same physical address, both with and
835-
* without the encryption bit, they don't race each other when flushed
836-
* and potentially end up with the wrong entry being committed to
837-
* memory.
838-
*
839-
* Test the CPUID bit directly because the machine might've cleared
840-
* X86_FEATURE_SME due to cmdline options.
841-
*/
842-
if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0)))
840+
if (this_cpu_read(cache_state_incoherent))
843841
wbinvd();
844842

845843
/*

0 commit comments

Comments
 (0)