Skip to content

Commit b3039c5

Browse files
hansendc authored and gregkh committed
mm: introduce deferred freeing for kernel page tables
commit 5ba2f0a upstream. This introduces a conditional asynchronous mechanism, enabled by CONFIG_ASYNC_KERNEL_PGTABLE_FREE. When enabled, this mechanism defers the freeing of pages that are used as page tables for kernel address mappings. These pages are now queued to a work struct instead of being freed immediately. This deferred freeing allows for batch-freeing of page tables, providing a safe context for performing a single expensive operation (TLB flush) for a batch of kernel page tables instead of performing that expensive operation for each page table. Link: https://lkml.kernel.org/r/20251022082635.2462433-8-baolu.lu@linux.intel.com Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Acked-by: David Hildenbrand <david@redhat.com> Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org> Cc: Alistair Popple <apopple@nvidia.com> Cc: Andy Lutomirski <luto@kernel.org> Cc: Borislav Petkov <bp@alien8.de> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jann Horn <jannh@google.com> Cc: Jean-Philippe Brucker <jean-philippe@linaro.org> Cc: Joerg Roedel <joro@8bytes.org> Cc: Liam Howlett <liam.howlett@oracle.com> Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Michal Hocko <mhocko@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Robin Murphy <robin.murphy@arm.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com> Cc: Vasant Hegde <vasant.hegde@amd.com> Cc: Vinicius Costa Gomes <vinicius.gomes@intel.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Will Deacon <will@kernel.org> Cc: Yi Lai <yi1.lai@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
1 parent a1593c9 commit b3039c5

3 files changed

Lines changed: 53 additions & 3 deletions

File tree

include/linux/mm.h

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3038,6 +3038,14 @@ static inline void __pagetable_free(struct ptdesc *pt)
30383038
__free_pages(page, compound_order(page));
30393039
}
30403040

3041+
/*
 * pagetable_free_kernel - free a page table page used for kernel mappings
 * @pt: the page table descriptor to free
 *
 * With CONFIG_ASYNC_KERNEL_PGTABLE_FREE the page is queued to a work
 * struct and batch-freed later (see mm/pgtable-generic.c), providing a
 * safe context for one expensive operation (e.g. a TLB flush) per batch.
 * Without it, the page is freed immediately.
 */
#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE
void pagetable_free_kernel(struct ptdesc *pt);
#else
static inline void pagetable_free_kernel(struct ptdesc *pt)
{
	/* No deferral configured: release the page right away. */
	__pagetable_free(pt);
}
#endif
30413049
/**
30423050
* pagetable_free - Free pagetables
30433051
* @pt: The page table descriptor
@@ -3047,10 +3055,12 @@ static inline void __pagetable_free(struct ptdesc *pt)
30473055
*/
30483056
static inline void pagetable_free(struct ptdesc *pt)
{
	if (ptdesc_test_kernel(pt)) {
		/*
		 * Kernel page tables take the (possibly deferred)
		 * freeing path. Clear the marker first so the ptdesc
		 * is no longer identified as a kernel page table.
		 */
		ptdesc_clear_kernel(pt);
		pagetable_free_kernel(pt);
	} else {
		/* User page tables are freed immediately. */
		__pagetable_free(pt);
	}
}
30553065

30563066
#if defined(CONFIG_SPLIT_PTE_PTLOCKS)

mm/Kconfig

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -915,6 +915,9 @@ config HAVE_GIGANTIC_FOLIOS
915915
def_bool (HUGETLB_PAGE && ARCH_HAS_GIGANTIC_PAGE) || \
916916
(ZONE_DEVICE && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
917917

918+
# When selected by an architecture, pages used as kernel page tables are
# freed asynchronously from a work struct, allowing them to be batch-freed
# so one expensive operation (e.g. a TLB flush) covers the whole batch.
config ASYNC_KERNEL_PGTABLE_FREE
	def_bool n
920+
918921
# TODO: Allow to be enabled without THP
919922
config ARCH_SUPPORTS_HUGE_PFNMAP
920923
def_bool n

mm/pgtable-generic.c

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -406,3 +406,40 @@ pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
406406
pte_unmap_unlock(pte, ptl);
407407
goto again;
408408
}
409+
410+
#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE
411+
static void kernel_pgtable_work_func(struct work_struct *work);
412+
413+
static struct {
414+
struct list_head list;
415+
/* protect above ptdesc lists */
416+
spinlock_t lock;
417+
struct work_struct work;
418+
} kernel_pgtable_work = {
419+
.list = LIST_HEAD_INIT(kernel_pgtable_work.list),
420+
.lock = __SPIN_LOCK_UNLOCKED(kernel_pgtable_work.lock),
421+
.work = __WORK_INITIALIZER(kernel_pgtable_work.work, kernel_pgtable_work_func),
422+
};
423+
424+
static void kernel_pgtable_work_func(struct work_struct *work)
425+
{
426+
struct ptdesc *pt, *next;
427+
LIST_HEAD(page_list);
428+
429+
spin_lock(&kernel_pgtable_work.lock);
430+
list_splice_tail_init(&kernel_pgtable_work.list, &page_list);
431+
spin_unlock(&kernel_pgtable_work.lock);
432+
433+
list_for_each_entry_safe(pt, next, &page_list, pt_list)
434+
__pagetable_free(pt);
435+
}
436+
437+
void pagetable_free_kernel(struct ptdesc *pt)
438+
{
439+
spin_lock(&kernel_pgtable_work.lock);
440+
list_add(&pt->pt_list, &kernel_pgtable_work.list);
441+
spin_unlock(&kernel_pgtable_work.lock);
442+
443+
schedule_work(&kernel_pgtable_work.work);
444+
}
445+
#endif

0 commit comments

Comments
 (0)