Skip to content

Commit 1d8d146

Browse files
davidhildenbrand authored and akpm00 committed
mm/hugetlb: support write-faults in shared mappings
If we ever get a write-fault on a write-protected page in a shared mapping, we'd be in trouble (again). Instead, we can simply map the page writable. And in fact, there is even a way right now to trigger that code via uffd-wp ever since we started to support it for shmem in 5.19: -------------------------------------------------------------------------- #include <stdio.h> #include <stdlib.h> #include <string.h> #include <fcntl.h> #include <unistd.h> #include <errno.h> #include <sys/mman.h> #include <sys/syscall.h> #include <sys/ioctl.h> #include <linux/userfaultfd.h> #define HUGETLB_SIZE (2 * 1024 * 1024u) static char *map; int uffd; static int temp_setup_uffd(void) { struct uffdio_api uffdio_api; struct uffdio_register uffdio_register; struct uffdio_writeprotect uffd_writeprotect; struct uffdio_range uffd_range; uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY); if (uffd < 0) { fprintf(stderr, "syscall() failed: %d\n", errno); return -errno; } uffdio_api.api = UFFD_API; uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP; if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) { fprintf(stderr, "UFFDIO_API failed: %d\n", errno); return -errno; } if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) { fprintf(stderr, "UFFD_FEATURE_WRITEPROTECT missing\n"); return -ENOSYS; } /* Register UFFD-WP */ uffdio_register.range.start = (unsigned long) map; uffdio_register.range.len = HUGETLB_SIZE; uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) { fprintf(stderr, "UFFDIO_REGISTER failed: %d\n", errno); return -errno; } /* Writeprotect a single page. */ uffd_writeprotect.range.start = (unsigned long) map; uffd_writeprotect.range.len = HUGETLB_SIZE; uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP; if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) { fprintf(stderr, "UFFDIO_WRITEPROTECT failed: %d\n", errno); return -errno; } /* Unregister UFFD-WP without prior writeunprotection. 
*/ uffd_range.start = (unsigned long) map; uffd_range.len = HUGETLB_SIZE; if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_range)) { fprintf(stderr, "UFFDIO_UNREGISTER failed: %d\n", errno); return -errno; } return 0; } int main(int argc, char **argv) { int fd; fd = open("/dev/hugepages/tmp", O_RDWR | O_CREAT); if (!fd) { fprintf(stderr, "open() failed\n"); return -errno; } if (ftruncate(fd, HUGETLB_SIZE)) { fprintf(stderr, "ftruncate() failed\n"); return -errno; } map = mmap(NULL, HUGETLB_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); if (map == MAP_FAILED) { fprintf(stderr, "mmap() failed\n"); return -errno; } *map = 0; if (temp_setup_uffd()) return 1; *map = 0; return 0; } -------------------------------------------------------------------------- Above test fails with SIGBUS when there is only a single free hugetlb page. # echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages # ./test Bus error (core dumped) And worse, with sufficient free hugetlb pages it will map an anonymous page into a shared mapping, for example, messing up accounting during unmap and breaking MAP_SHARED semantics: # echo 2 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages # ./test # cat /proc/meminfo | grep HugePages_ HugePages_Total: 2 HugePages_Free: 1 HugePages_Rsvd: 18446744073709551615 HugePages_Surp: 0 Reason is that uffd-wp doesn't clear the uffd-wp PTE bit when unregistering and consequently keeps the PTE writeprotected. Reason for this is to avoid the additional overhead when unregistering. Note that this is the case also for !hugetlb and that we will end up with writable PTEs that still have the uffd-wp PTE bit set once we return from hugetlb_wp(). I'm not touching the uffd-wp PTE bit for now, because it seems to be a generic thing -- wp_page_reuse() also doesn't clear it. VM_MAYSHARE handling in hugetlb_fault() for FAULT_FLAG_WRITE indicates that MAP_SHARED handling was at least envisioned, but could never have worked as expected. 
While at it, make sure that we never end up in hugetlb_wp() on write faults without VM_WRITE, because we don't support maybe_mkwrite() semantics as commonly used in the !hugetlb case -- for example, in wp_page_reuse(). Note that there is no need to do any kind of reservation in hugetlb_fault() in this case ... because we already have a hugetlb page mapped R/O that we will simply map writable and we are not dealing with COW/unsharing. Link: https://lkml.kernel.org/r/20220811103435.188481-3-david@redhat.com Fixes: b1f9e87 ("mm/uffd: enable write protection for shmem & hugetlbfs") Signed-off-by: David Hildenbrand <david@redhat.com> Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com> Cc: Bjorn Helgaas <bhelgaas@google.com> Cc: Cyrill Gorcunov <gorcunov@openvz.org> Cc: Hugh Dickins <hughd@google.com> Cc: Jamie Liu <jamieliu@google.com> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Cc: Muchun Song <songmuchun@bytedance.com> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: Pavel Emelyanov <xemul@parallels.com> Cc: Peter Feiner <pfeiner@google.com> Cc: Peter Xu <peterx@redhat.com> Cc: <stable@vger.kernel.org> [5.19] Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
1 parent f96f7a4 commit 1d8d146

1 file changed

Lines changed: 19 additions & 7 deletions

File tree

mm/hugetlb.c

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5241,6 +5241,21 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
52415241
VM_BUG_ON(unshare && (flags & FOLL_WRITE));
52425242
VM_BUG_ON(!unshare && !(flags & FOLL_WRITE));
52435243

5244+
/*
5245+
* hugetlb does not support FOLL_FORCE-style write faults that keep the
5246+
* PTE mapped R/O such as maybe_mkwrite() would do.
5247+
*/
5248+
if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE)))
5249+
return VM_FAULT_SIGSEGV;
5250+
5251+
/* Let's take out MAP_SHARED mappings first. */
5252+
if (vma->vm_flags & VM_MAYSHARE) {
5253+
if (unlikely(unshare))
5254+
return 0;
5255+
set_huge_ptep_writable(vma, haddr, ptep);
5256+
return 0;
5257+
}
5258+
52445259
pte = huge_ptep_get(ptep);
52455260
old_page = pte_page(pte);
52465261

@@ -5781,22 +5796,19 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
57815796
* If we are going to COW/unshare the mapping later, we examine the
57825797
* pending reservations for this page now. This will ensure that any
57835798
* allocations necessary to record that reservation occur outside the
5784-
* spinlock. For private mappings, we also lookup the pagecache
5785-
* page now as it is used to determine if a reservation has been
5786-
* consumed.
5799+
* spinlock. Also lookup the pagecache page now as it is used to
5800+
* determine if a reservation has been consumed.
57875801
*/
57885802
if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
5789-
!huge_pte_write(entry)) {
5803+
!(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) {
57905804
if (vma_needs_reservation(h, vma, haddr) < 0) {
57915805
ret = VM_FAULT_OOM;
57925806
goto out_mutex;
57935807
}
57945808
/* Just decrements count, does not deallocate */
57955809
vma_end_reservation(h, vma, haddr);
57965810

5797-
if (!(vma->vm_flags & VM_MAYSHARE))
5798-
pagecache_page = hugetlbfs_pagecache_page(h,
5799-
vma, haddr);
5811+
pagecache_page = hugetlbfs_pagecache_page(h, vma, haddr);
58005812
}
58015813

58025814
ptl = huge_pte_lock(h, mm, ptep);

0 commit comments

Comments
 (0)