Skip to content

Commit 79301c7

Browse files
hying-caritas authored and ctmarinas committed
mm: add spurious fault fixing support for huge pmd
The page faults may be spurious because of the racy access to the page table. For example, a non-populated virtual page is accessed on 2 CPUs simultaneously, thus the page faults are triggered on both CPUs. However, it's possible that one CPU (say CPU A) cannot find the reason for the page fault if the other CPU (say CPU B) has changed the page table before the PTE is checked on CPU A. Most of the time, the spurious page faults can be ignored safely. However, if the page fault is for the write access, it's possible that a stale read-only TLB entry exists in the local CPU and needs to be flushed on some architectures. This is called the spurious page fault fixing. In the current kernel, there is spurious fault fixing support for pte, but not for huge pmd because no architectures need it. But in the next patch in the series, we will change the write protection fault handling logic on arm64, so that some stale huge pmd entries may remain in the TLB. These entries need to be flushed via the huge pmd spurious fault fixing mechanism. Signed-off-by: Huang Ying <ying.huang@linux.alibaba.com> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Acked-by: David Hildenbrand <david@redhat.com> Acked-by: Zi Yan <ziy@nvidia.com> Cc: Will Deacon <will@kernel.org> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Baolin Wang <baolin.wang@linux.alibaba.com> Cc: Ryan Roberts <ryan.roberts@arm.com> Cc: Yang Shi <yang@os.amperecomputing.com> Cc: Christoph Lameter (Ampere) <cl@gentwo.org> Cc: Dev Jain <dev.jain@arm.com> Cc: Barry Song <baohua@kernel.org> Cc: Anshuman Khandual <anshuman.khandual@arm.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Kevin Brodsky <kevin.brodsky@arm.com> Cc: Yin Fengwei <fengwei_yin@linux.alibaba.com> Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
1 parent 3a86608 commit 79301c7

5 files changed

Lines changed: 73 additions & 30 deletions

File tree

include/linux/huge_mm.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
1111
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1212
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
1313
struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
14-
void huge_pmd_set_accessed(struct vm_fault *vmf);
14+
bool huge_pmd_set_accessed(struct vm_fault *vmf);
1515
int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1616
pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
1717
struct vm_area_struct *vma);

include/linux/pgtable.h

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1232,6 +1232,10 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
12321232
#define flush_tlb_fix_spurious_fault(vma, address, ptep) flush_tlb_page(vma, address)
12331233
#endif
12341234

1235+
#ifndef flush_tlb_fix_spurious_fault_pmd
1236+
#define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp) do { } while (0)
1237+
#endif
1238+
12351239
/*
12361240
* When walking page tables, get the address of the next boundary,
12371241
* or the end address of the range if that comes earlier. Although no

mm/huge_memory.c

Lines changed: 21 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1641,17 +1641,30 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio,
16411641
EXPORT_SYMBOL_GPL(vmf_insert_folio_pud);
16421642
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
16431643

1644-
void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
1644+
/**
1645+
* touch_pmd - Mark page table pmd entry as accessed and dirty (for write)
1646+
* @vma: The VMA covering @addr
1647+
* @addr: The virtual address
1648+
* @pmd: pmd pointer into the page table mapping @addr
1649+
* @write: Whether it's a write access
1650+
*
1651+
* Return: whether the pmd entry is changed
1652+
*/
1653+
bool touch_pmd(struct vm_area_struct *vma, unsigned long addr,
16451654
pmd_t *pmd, bool write)
16461655
{
1647-
pmd_t _pmd;
1656+
pmd_t entry;
16481657

1649-
_pmd = pmd_mkyoung(*pmd);
1658+
entry = pmd_mkyoung(*pmd);
16501659
if (write)
1651-
_pmd = pmd_mkdirty(_pmd);
1660+
entry = pmd_mkdirty(entry);
16521661
if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
1653-
pmd, _pmd, write))
1662+
pmd, entry, write)) {
16541663
update_mmu_cache_pmd(vma, addr, pmd);
1664+
return true;
1665+
}
1666+
1667+
return false;
16551668
}
16561669

16571670
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -1841,18 +1854,14 @@ void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
18411854
}
18421855
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
18431856

1844-
void huge_pmd_set_accessed(struct vm_fault *vmf)
1857+
bool huge_pmd_set_accessed(struct vm_fault *vmf)
18451858
{
18461859
bool write = vmf->flags & FAULT_FLAG_WRITE;
18471860

1848-
vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
18491861
if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
1850-
goto unlock;
1851-
1852-
touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);
1862+
return false;
18531863

1854-
unlock:
1855-
spin_unlock(vmf->ptl);
1864+
return touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);
18561865
}
18571866

18581867
static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf)

mm/internal.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1402,7 +1402,7 @@ int __must_check try_grab_folio(struct folio *folio, int refs,
14021402
*/
14031403
void touch_pud(struct vm_area_struct *vma, unsigned long addr,
14041404
pud_t *pud, bool write);
1405-
void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
1405+
bool touch_pmd(struct vm_area_struct *vma, unsigned long addr,
14061406
pmd_t *pmd, bool write);
14071407

14081408
/*

mm/memory.c

Lines changed: 46 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -6115,6 +6115,45 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
61156115
return VM_FAULT_FALLBACK;
61166116
}
61176117

6118+
/*
6119+
* The page faults may be spurious because of the racy access to the
6120+
* page table. For example, a non-populated virtual page is accessed
6121+
* on 2 CPUs simultaneously, thus the page faults are triggered on
6122+
* both CPUs. However, it's possible that one CPU (say CPU A) cannot
6123+
* find the reason for the page fault if the other CPU (say CPU B) has
6124+
* changed the page table before the PTE is checked on CPU A. Most of
6125+
* the time, the spurious page faults can be ignored safely. However,
6126+
* if the page fault is for the write access, it's possible that a
6127+
* stale read-only TLB entry exists in the local CPU and needs to be
6128+
* flushed on some architectures. This is called the spurious page
6129+
* fault fixing.
6130+
*
6131+
* Note: flush_tlb_fix_spurious_fault() is defined as flush_tlb_page()
6132+
* by default and used as such on most architectures, while
6133+
* flush_tlb_fix_spurious_fault_pmd() is defined as NOP by default and
6134+
* used as such on most architectures.
6135+
*/
6136+
static void fix_spurious_fault(struct vm_fault *vmf,
6137+
enum pgtable_level ptlevel)
6138+
{
6139+
/* Skip spurious TLB flush for retried page fault */
6140+
if (vmf->flags & FAULT_FLAG_TRIED)
6141+
return;
6142+
/*
6143+
* This is needed only for protection faults but the arch code
6144+
* is not yet telling us if this is a protection fault or not.
6145+
* This still avoids useless tlb flushes for .text page faults
6146+
* with threads.
6147+
*/
6148+
if (vmf->flags & FAULT_FLAG_WRITE) {
6149+
if (ptlevel == PGTABLE_LEVEL_PTE)
6150+
flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
6151+
vmf->pte);
6152+
else
6153+
flush_tlb_fix_spurious_fault_pmd(vmf->vma, vmf->address,
6154+
vmf->pmd);
6155+
}
6156+
}
61186157
/*
61196158
* These routines also need to handle stuff like marking pages dirty
61206159
* and/or accessed for architectures that don't do it in hardware (most
@@ -6196,23 +6235,11 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
61966235
}
61976236
entry = pte_mkyoung(entry);
61986237
if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
6199-
vmf->flags & FAULT_FLAG_WRITE)) {
6238+
vmf->flags & FAULT_FLAG_WRITE))
62006239
update_mmu_cache_range(vmf, vmf->vma, vmf->address,
62016240
vmf->pte, 1);
6202-
} else {
6203-
/* Skip spurious TLB flush for retried page fault */
6204-
if (vmf->flags & FAULT_FLAG_TRIED)
6205-
goto unlock;
6206-
/*
6207-
* This is needed only for protection faults but the arch code
6208-
* is not yet telling us if this is a protection fault or not.
6209-
* This still avoids useless tlb flushes for .text page faults
6210-
* with threads.
6211-
*/
6212-
if (vmf->flags & FAULT_FLAG_WRITE)
6213-
flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
6214-
vmf->pte);
6215-
}
6241+
else
6242+
fix_spurious_fault(vmf, PGTABLE_LEVEL_PTE);
62166243
unlock:
62176244
pte_unmap_unlock(vmf->pte, vmf->ptl);
62186245
return 0;
@@ -6309,7 +6336,10 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
63096336
if (!(ret & VM_FAULT_FALLBACK))
63106337
return ret;
63116338
} else {
6312-
huge_pmd_set_accessed(&vmf);
6339+
vmf.ptl = pmd_lock(mm, vmf.pmd);
6340+
if (!huge_pmd_set_accessed(&vmf))
6341+
fix_spurious_fault(&vmf, PGTABLE_LEVEL_PMD);
6342+
spin_unlock(vmf.ptl);
63136343
return 0;
63146344
}
63156345
}

0 commit comments

Comments (0)