Skip to content

Commit da23ea1

Browse files
committed
Merge tag 'mm-stable-2025-08-03-12-35' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull more MM updates from Andrew Morton: "Significant patch series in this pull request: - "mseal cleanups" (Lorenzo Stoakes) Some mseal cleaning with no intended functional change. - "Optimizations for khugepaged" (David Hildenbrand) Improve khugepaged throughput by batching PTE operations for large folios. This gain is mainly for arm64. - "x86: enable EXECMEM_ROX_CACHE for ftrace and kprobes" (Mike Rapoport) A bugfix, additional debug code and cleanups to the execmem code. - "mm/shmem, swap: bugfix and improvement of mTHP swap in" (Kairui Song) Bugfixes, cleanups and performance improvements to the mTHP swapin code" * tag 'mm-stable-2025-08-03-12-35' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (38 commits) mm: mempool: fix crash in mempool_free() for zero-minimum pools mm: correct type for vmalloc vm_flags fields mm/shmem, swap: fix major fault counting mm/shmem, swap: rework swap entry and index calculation for large swapin mm/shmem, swap: simplify swapin path and result handling mm/shmem, swap: never use swap cache and readahead for SWP_SYNCHRONOUS_IO mm/shmem, swap: tidy up swap entry splitting mm/shmem, swap: tidy up THP swapin checks mm/shmem, swap: avoid redundant Xarray lookup during swapin x86/ftrace: enable EXECMEM_ROX_CACHE for ftrace allocations x86/kprobes: enable EXECMEM_ROX_CACHE for kprobes allocations execmem: drop writable parameter from execmem_fill_trapping_insns() execmem: add fallback for failures in vmalloc(VM_ALLOW_HUGE_VMAP) execmem: move execmem_force_rw() and execmem_restore_rox() before use execmem: rework execmem_cache_free() execmem: introduce execmem_alloc_rw() execmem: drop unused execmem_update_copy() mm: fix a UAF when vma->mm is freed after vma->vm_refcnt got dropped mm/rmap: add anon_vma lifetime debug check mm: remove mm/io-mapping.c ...
2 parents 7e161a9 + a2152fe commit da23ea1

42 files changed

Lines changed: 1080 additions & 511 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

Documentation/core-api/mm-api.rst

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,4 +133,3 @@ More Memory Management Functions
133133
.. kernel-doc:: mm/mmu_notifier.c
134134
.. kernel-doc:: mm/balloon_compaction.c
135135
.. kernel-doc:: mm/huge_memory.c
136-
.. kernel-doc:: mm/io-mapping.c

arch/arm64/mm/mmu.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -721,7 +721,7 @@ void mark_rodata_ro(void)
721721

722722
static void __init declare_vma(struct vm_struct *vma,
723723
void *va_start, void *va_end,
724-
vm_flags_t vm_flags)
724+
unsigned long vm_flags)
725725
{
726726
phys_addr_t pa_start = __pa_symbol(va_start);
727727
unsigned long size = va_end - va_start;
@@ -1528,7 +1528,7 @@ early_initcall(prevent_bootmem_remove_init);
15281528
pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr,
15291529
pte_t *ptep, unsigned int nr)
15301530
{
1531-
pte_t pte = get_and_clear_full_ptes(vma->vm_mm, addr, ptep, nr, /* full = */ 0);
1531+
pte_t pte = get_and_clear_ptes(vma->vm_mm, addr, ptep, nr);
15321532

15331533
if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198)) {
15341534
/*

arch/x86/kernel/alternative.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ struct its_array its_pages;
120120

121121
static void *__its_alloc(struct its_array *pages)
122122
{
123-
void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE);
123+
void *page __free(execmem) = execmem_alloc_rw(EXECMEM_MODULE_TEXT, PAGE_SIZE);
124124
if (!page)
125125
return NULL;
126126

@@ -237,7 +237,6 @@ static void *its_alloc(void)
237237
if (!page)
238238
return NULL;
239239

240-
execmem_make_temp_rw(page, PAGE_SIZE);
241240
if (pages == &its_pages)
242241
set_memory_x((unsigned long)page, 1);
243242

arch/x86/kernel/ftrace.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ void arch_ftrace_update_code(int command)
263263

264264
static inline void *alloc_tramp(unsigned long size)
265265
{
266-
return execmem_alloc(EXECMEM_FTRACE, size);
266+
return execmem_alloc_rw(EXECMEM_FTRACE, size);
267267
}
268268
static inline void tramp_free(void *tramp)
269269
{

arch/x86/kernel/kprobes/core.c

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -481,24 +481,6 @@ static int prepare_singlestep(kprobe_opcode_t *buf, struct kprobe *p,
481481
return len;
482482
}
483483

484-
/* Make page to RO mode when allocate it */
485-
void *alloc_insn_page(void)
486-
{
487-
void *page;
488-
489-
page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
490-
if (!page)
491-
return NULL;
492-
493-
/*
494-
* TODO: Once additional kernel code protection mechanisms are set, ensure
495-
* that the page was not maliciously altered and it is still zeroed.
496-
*/
497-
set_memory_rox((unsigned long)page, 1);
498-
499-
return page;
500-
}
501-
502484
/* Kprobe x86 instruction emulation - only regs->ip or IF flag modifiers */
503485

504486
static void kprobe_emulate_ifmodifiers(struct kprobe *p, struct pt_regs *regs)

arch/x86/mm/init.c

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1063,13 +1063,9 @@ unsigned long arch_max_swapfile_size(void)
10631063
static struct execmem_info execmem_info __ro_after_init;
10641064

10651065
#ifdef CONFIG_ARCH_HAS_EXECMEM_ROX
1066-
void execmem_fill_trapping_insns(void *ptr, size_t size, bool writeable)
1066+
void execmem_fill_trapping_insns(void *ptr, size_t size)
10671067
{
1068-
/* fill memory with INT3 instructions */
1069-
if (writeable)
1070-
memset(ptr, INT3_INSN_OPCODE, size);
1071-
else
1072-
text_poke_set(ptr, INT3_INSN_OPCODE, size);
1068+
memset(ptr, INT3_INSN_OPCODE, size);
10731069
}
10741070
#endif
10751071

@@ -1102,7 +1098,21 @@ struct execmem_info __init *execmem_arch_setup(void)
11021098
.pgprot = pgprot,
11031099
.alignment = MODULE_ALIGN,
11041100
},
1105-
[EXECMEM_KPROBES ... EXECMEM_BPF] = {
1101+
[EXECMEM_KPROBES] = {
1102+
.flags = flags,
1103+
.start = start,
1104+
.end = MODULES_END,
1105+
.pgprot = PAGE_KERNEL_ROX,
1106+
.alignment = MODULE_ALIGN,
1107+
},
1108+
[EXECMEM_FTRACE] = {
1109+
.flags = flags,
1110+
.start = start,
1111+
.end = MODULES_END,
1112+
.pgprot = pgprot,
1113+
.alignment = MODULE_ALIGN,
1114+
},
1115+
[EXECMEM_BPF] = {
11061116
.flags = EXECMEM_KASAN_SHADOW,
11071117
.start = start,
11081118
.end = MODULES_END,

include/linux/execmem.h

Lines changed: 23 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -60,27 +60,11 @@ enum execmem_range_flags {
6060
* will trap
6161
* @ptr: pointer to memory to fill
6262
* @size: size of the range to fill
63-
* @writable: whether the memory pointed to by @ptr is writable (otherwise ROX)
6463
*
6564
* A hook for architectures to fill execmem ranges with invalid instructions.
6665
* Architectures that use EXECMEM_ROX_CACHE must implement this.
6766
*/
68-
void execmem_fill_trapping_insns(void *ptr, size_t size, bool writable);
69-
70-
/**
71-
* execmem_make_temp_rw - temporarily remap region with read-write
72-
* permissions
73-
* @ptr: address of the region to remap
74-
* @size: size of the region to remap
75-
*
76-
* Remaps a part of the cached large page in the ROX cache in the range
77-
* [@ptr, @ptr + @size) as writable and not executable. The caller must
78-
* have exclusive ownership of this range and ensure nothing will try to
79-
* execute code in this range.
80-
*
81-
* Return: 0 on success or negative error code on failure.
82-
*/
83-
int execmem_make_temp_rw(void *ptr, size_t size);
67+
void execmem_fill_trapping_insns(void *ptr, size_t size);
8468

8569
/**
8670
* execmem_restore_rox - restore read-only-execute permissions
@@ -95,7 +79,6 @@ int execmem_make_temp_rw(void *ptr, size_t size);
9579
*/
9680
int execmem_restore_rox(void *ptr, size_t size);
9781
#else
98-
static inline int execmem_make_temp_rw(void *ptr, size_t size) { return 0; }
9982
static inline int execmem_restore_rox(void *ptr, size_t size) { return 0; }
10083
#endif
10184

@@ -165,6 +148,28 @@ struct execmem_info *execmem_arch_setup(void);
165148
*/
166149
void *execmem_alloc(enum execmem_type type, size_t size);
167150

151+
/**
152+
* execmem_alloc_rw - allocate writable executable memory
153+
* @type: type of the allocation
154+
* @size: how many bytes of memory are required
155+
*
156+
* Allocates memory that will contain executable code, either generated or
157+
* loaded from kernel modules.
158+
*
159+
* Allocates memory that will contain data coupled with executable code,
160+
* like data sections in kernel modules.
161+
*
162+
* Forces writable permissions on the allocated memory and the caller is
163+
* responsible to manage the permissions afterwards.
164+
*
165+
* For architectures that use ROX cache the permissions will be set to R+W.
166+
* For architectures that don't use ROX cache the default permissions for @type
167+
* will be used as they must be writable.
168+
*
169+
* Return: a pointer to the allocated memory or %NULL
170+
*/
171+
void *execmem_alloc_rw(enum execmem_type type, size_t size);
172+
168173
/**
169174
* execmem_free - free executable memory
170175
* @ptr: pointer to the memory that should be freed
@@ -185,19 +190,6 @@ DEFINE_FREE(execmem, void *, if (_T) execmem_free(_T));
185190
struct vm_struct *execmem_vmap(size_t size);
186191
#endif
187192

188-
/**
189-
* execmem_update_copy - copy an update to executable memory
190-
* @dst: destination address to update
191-
* @src: source address containing the data
192-
* @size: how many bytes of memory shold be copied
193-
*
194-
* Copy @size bytes from @src to @dst using text poking if the memory at
195-
* @dst is read-only.
196-
*
197-
* Return: a pointer to @dst or NULL on error
198-
*/
199-
void *execmem_update_copy(void *dst, const void *src, size_t size);
200-
201193
/**
202194
* execmem_is_rox - check if execmem is read-only
203195
* @type - the execmem type to check

include/linux/io-mapping.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,4 @@ io_mapping_free(struct io_mapping *iomap)
225225
kfree(iomap);
226226
}
227227

228-
int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma,
229-
unsigned long addr, unsigned long pfn, unsigned long size);
230-
231228
#endif /* _LINUX_IO_MAPPING_H */

include/linux/mm.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -414,8 +414,10 @@ extern unsigned int kobjsize(const void *objp);
414414
#endif
415415

416416
#ifdef CONFIG_64BIT
417-
/* VM is sealed, in vm_flags */
418-
#define VM_SEALED _BITUL(63)
417+
#define VM_SEALED_BIT 42
418+
#define VM_SEALED BIT(VM_SEALED_BIT)
419+
#else
420+
#define VM_SEALED VM_NONE
419421
#endif
420422

421423
/* Bits set in the VMA until the stack is in its final location */

include/linux/mmap_lock.h

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ extern int rcuwait_wake_up(struct rcuwait *w);
1212
#include <linux/tracepoint-defs.h>
1313
#include <linux/types.h>
1414
#include <linux/cleanup.h>
15+
#include <linux/sched/mm.h>
1516

1617
#define MMAP_LOCK_INITIALIZER(name) \
1718
.mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock),
@@ -154,6 +155,10 @@ static inline void vma_refcount_put(struct vm_area_struct *vma)
154155
* reused and attached to a different mm before we lock it.
155156
* Returns the vma on success, NULL on failure to lock and EAGAIN if vma got
156157
* detached.
158+
*
159+
* WARNING! The vma passed to this function cannot be used if the function
160+
* fails to lock it because in certain cases RCU lock is dropped and then
161+
* reacquired. Once RCU lock is dropped the vma can be concurrently freed.
157162
*/
158163
static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
159164
struct vm_area_struct *vma)
@@ -183,6 +188,31 @@ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
183188
}
184189

185190
rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
191+
192+
/*
193+
* If vma got attached to another mm from under us, that mm is not
194+
* stable and can be freed in the narrow window after vma->vm_refcnt
195+
* is dropped and before rcuwait_wake_up(mm) is called. Grab it before
196+
* releasing vma->vm_refcnt.
197+
*/
198+
if (unlikely(vma->vm_mm != mm)) {
199+
/* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */
200+
struct mm_struct *other_mm = vma->vm_mm;
201+
202+
/*
203+
* __mmdrop() is a heavy operation and we don't need RCU
204+
* protection here. Release RCU lock during these operations.
205+
* We reinstate the RCU read lock as the caller expects it to
206+
* be held when this function returns even on error.
207+
*/
208+
rcu_read_unlock();
209+
mmgrab(other_mm);
210+
vma_refcount_put(vma);
211+
mmdrop(other_mm);
212+
rcu_read_lock();
213+
return NULL;
214+
}
215+
186216
/*
187217
* Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
188218
* False unlocked result is impossible because we modify and check

0 commit comments

Comments
 (0)