Skip to content

Commit 52e054f

Browse files
Baolin Wang and akpm00
authored and committed
mm: rmap: support batched checks of the references for large folios
Patch series "support batch checking of references and unmapping for large folios", v6. Currently, folio_referenced_one() always checks the young flag for each PTE sequentially, which is inefficient for large folios. This inefficiency is especially noticeable when reclaiming clean file-backed large folios, where folio_referenced() is observed as a significant performance hotspot. Moreover, on the Arm architecture, which supports contiguous PTEs, there is already an optimization to clear the young flags for PTEs within a contiguous range. However, this is not sufficient. We can extend this to perform batched operations for the entire large folio (which might exceed the contiguous range: CONT_PTE_SIZE). Similar to folio_referenced_one(), we can also apply batched unmapping for large file folios to optimize the performance of file folio reclamation. By supporting batched checking of the young flags, flushing TLB entries, and unmapping, I observed significant performance improvements in my performance tests for file folio reclamation. Please check the performance data in the commit message of each patch. This patch (of 5): Currently, folio_referenced_one() always checks the young flag for each PTE sequentially, which is inefficient for large folios. This inefficiency is especially noticeable when reclaiming clean file-backed large folios, where folio_referenced() is observed as a significant performance hotspot. Moreover, on the Arm64 architecture, which supports contiguous PTEs, there is already an optimization to clear the young flags for PTEs within a contiguous range. However, this is not sufficient. We can extend this to perform batched operations for the entire large folio (which might exceed the contiguous range: CONT_PTE_SIZE). Introduce a new API: clear_flush_young_ptes() to facilitate batched checking of the young flags and flushing TLB entries, thereby improving performance during large folio reclamation. 
It will be overridden in the following patches by architectures that implement a more efficient batch operation. While we are at it, rename ptep_clear_flush_young_notify() to clear_flush_young_ptes_notify() to indicate that this is a batch operation. Link: https://lkml.kernel.org/r/cover.1770645603.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/12132694536834262062d1fb304f8f8a064b6750.1770645603.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com> Reviewed-by: Harry Yoo <harry.yoo@oracle.com> Reviewed-by: Ryan Roberts <ryan.roberts@arm.com> Acked-by: David Hildenbrand (Arm) <david@kernel.org> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Jann Horn <jannh@google.com> Cc: Liam Howlett <liam.howlett@oracle.com> Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Rapoport <rppt@kernel.org> Cc: Rik van Riel <riel@surriel.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Will Deacon <will@kernel.org> Cc: Barry Song <baohua@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
1 parent f615cc9 commit 52e054f

3 files changed

Lines changed: 65 additions & 7 deletions

File tree

include/linux/mmu_notifier.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -515,16 +515,17 @@ static inline void mmu_notifier_range_init_owner(
515515
range->owner = owner;
516516
}
517517

518-
#define ptep_clear_flush_young_notify(__vma, __address, __ptep) \
518+
#define clear_flush_young_ptes_notify(__vma, __address, __ptep, __nr) \
519519
({ \
520520
int __young; \
521521
struct vm_area_struct *___vma = __vma; \
522522
unsigned long ___address = __address; \
523-
__young = ptep_clear_flush_young(___vma, ___address, __ptep); \
523+
unsigned int ___nr = __nr; \
524+
__young = clear_flush_young_ptes(___vma, ___address, __ptep, ___nr); \
524525
__young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \
525526
___address, \
526527
___address + \
527-
PAGE_SIZE); \
528+
___nr * PAGE_SIZE); \
528529
__young; \
529530
})
530531

@@ -650,7 +651,7 @@ static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
650651

651652
#define mmu_notifier_range_update_to_read_only(r) false
652653

653-
#define ptep_clear_flush_young_notify ptep_clear_flush_young
654+
#define clear_flush_young_ptes_notify clear_flush_young_ptes
654655
#define pmdp_clear_flush_young_notify pmdp_clear_flush_young
655656
#define ptep_clear_young_notify ptep_test_and_clear_young
656657
#define pmdp_clear_young_notify pmdp_test_and_clear_young

include/linux/pgtable.h

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1068,6 +1068,41 @@ static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
10681068
}
10691069
#endif
10701070

1071+
#ifndef clear_flush_young_ptes
1072+
/**
1073+
* clear_flush_young_ptes - Mark PTEs that map consecutive pages of the same
1074+
* folio as old and flush the TLB.
1075+
* @vma: The virtual memory area the pages are mapped into.
1076+
* @addr: Address the first page is mapped at.
1077+
* @ptep: Page table pointer for the first entry.
1078+
* @nr: Number of entries to clear access bit.
1079+
*
1080+
* May be overridden by the architecture; otherwise, implemented as a simple
1081+
* loop over ptep_clear_flush_young().
1082+
*
1083+
* Note that PTE bits in the PTE range besides the PFN can differ. For example,
1084+
* some PTEs might be write-protected.
1085+
*
1086+
* Context: The caller holds the page table lock. The PTEs map consecutive
1087+
* pages that belong to the same folio. The PTEs are all in the same PMD.
1088+
*/
1089+
static inline int clear_flush_young_ptes(struct vm_area_struct *vma,
1090+
unsigned long addr, pte_t *ptep, unsigned int nr)
1091+
{
1092+
int young = 0;
1093+
1094+
for (;;) {
1095+
young |= ptep_clear_flush_young(vma, addr, ptep);
1096+
if (--nr == 0)
1097+
break;
1098+
ptep++;
1099+
addr += PAGE_SIZE;
1100+
}
1101+
1102+
return young;
1103+
}
1104+
#endif
1105+
10711106
/*
10721107
* On some architectures hardware does not set page access bit when accessing
10731108
* memory page, it is responsibility of software setting this bit. It brings

mm/rmap.c

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -913,9 +913,11 @@ static bool folio_referenced_one(struct folio *folio,
913913
struct folio_referenced_arg *pra = arg;
914914
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
915915
int ptes = 0, referenced = 0;
916+
unsigned int nr;
916917

917918
while (page_vma_mapped_walk(&pvmw)) {
918919
address = pvmw.address;
920+
nr = 1;
919921

920922
if (vma->vm_flags & VM_LOCKED) {
921923
ptes++;
@@ -960,9 +962,21 @@ static bool folio_referenced_one(struct folio *folio,
960962
if (lru_gen_look_around(&pvmw))
961963
referenced++;
962964
} else if (pvmw.pte) {
963-
if (ptep_clear_flush_young_notify(vma, address,
964-
pvmw.pte))
965+
if (folio_test_large(folio)) {
966+
unsigned long end_addr = pmd_addr_end(address, vma->vm_end);
967+
unsigned int max_nr = (end_addr - address) >> PAGE_SHIFT;
968+
pte_t pteval = ptep_get(pvmw.pte);
969+
970+
nr = folio_pte_batch(folio, pvmw.pte,
971+
pteval, max_nr);
972+
}
973+
974+
ptes += nr;
975+
if (clear_flush_young_ptes_notify(vma, address, pvmw.pte, nr))
965976
referenced++;
977+
/* Skip the batched PTEs */
978+
pvmw.pte += nr - 1;
979+
pvmw.address += (nr - 1) * PAGE_SIZE;
966980
} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
967981
if (pmdp_clear_flush_young_notify(vma, address,
968982
pvmw.pmd))
@@ -972,7 +986,15 @@ static bool folio_referenced_one(struct folio *folio,
972986
WARN_ON_ONCE(1);
973987
}
974988

975-
pra->mapcount--;
989+
pra->mapcount -= nr;
990+
/*
991+
* If we are sure that we batched the entire folio,
992+
* we can just optimize and stop right here.
993+
*/
994+
if (ptes == pvmw.nr_pages) {
995+
page_vma_mapped_walk_done(&pvmw);
996+
break;
997+
}
976998
}
977999

9781000
if (referenced)

0 commit comments

Comments
 (0)