
Commit c2508ec

mm: introduce new 'lock_mm_and_find_vma()' page fault helper
.. and make x86 use it.

This basically extracts the existing x86 "find and expand faulting
vma" code, but extends it to also take the mmap lock for writing in
case we actually do need to expand the vma.

We've historically short-circuited that case, and have some rather ugly
special logic to serialize the stack segment expansion (since we only
hold the mmap lock for reading) that doesn't match the normal VM
locking.

That slight violation of locking worked well, right up until it didn't:
the maple tree code really does want proper locking even for simple
extension of an existing vma.

So extract the code for "look up the vma of the fault" from x86, fix it
up to do the necessary write locking, and make it available as a helper
function for other architectures that can use the common helper.

Note: I say "common helper", but it really only handles the normal
stack-grows-down case.  Which is all architectures except for PA-RISC
and IA64.  So some rare architectures can't use the helper, but if they
care they'll just need to open-code this logic.

It's also worth pointing out that this code really would like to have an
optimistic "mmap_upgrade_trylock()" to make it quicker to go from a
read-lock (for the common case) to taking the write lock (for having to
extend the vma) in the normal single-threaded situation where there is
no other locking activity.

But that _is_ all the very uncommon special case, so while it would be
nice to have such an operation, it probably doesn't matter in reality.
I did put in the skeleton code for such a possible future expansion,
even if it only acts as pseudo-documentation for what we're doing.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
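[Editor's note] The fallback the commit describes, where the helper cannot upgrade the lock in place and must drop it, re-take it for writing, and redo the lookup, is a general rwlock pattern. A rough user-space sketch of that pattern, using POSIX pthread_rwlock rather than the kernel's mmap_lock/rwsem, with hypothetical lookup() and still_valid() helpers standing in for find_vma() and the vma re-checks:

/*
 * Sketch only: "drop read lock, take write lock, re-lookup, re-check".
 * lookup() and still_valid() are made-up stand-ins, not kernel APIs.
 */
#include <pthread.h>
#include <stddef.h>

extern void *lookup(unsigned long addr);                /* hypothetical */
extern int still_valid(void *obj, unsigned long addr);  /* hypothetical */

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;

void *find_locked(unsigned long addr)
{
	void *obj;

	pthread_rwlock_rdlock(&lock);
	obj = lookup(addr);
	if (obj && still_valid(obj, addr))
		return obj;	/* common case: read lock suffices; caller unlocks */

	/*
	 * POSIX rwlocks cannot be upgraded in place, so drop the read
	 * lock and take the write lock.  Anything learned before the
	 * unlocked window may now be stale, so look up again.
	 */
	pthread_rwlock_unlock(&lock);
	pthread_rwlock_wrlock(&lock);

	obj = lookup(addr);
	if (!obj || !still_valid(obj, addr)) {
		pthread_rwlock_unlock(&lock);
		return NULL;
	}
	return obj;	/* caller now holds the write lock and must unlock */
}

This mirrors the shape of lock_mm_and_find_vma() below: return early under the read lock in the common case, and re-validate everything after any window where no lock was held.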
1 parent 45a3e24 commit c2508ec

5 files changed: 130 additions & 50 deletions

arch/x86/Kconfig

Lines changed: 1 addition & 0 deletions

@@ -276,6 +276,7 @@ config X86
 	select HAVE_GENERIC_VDSO
 	select HOTPLUG_SMT			if SMP
 	select IRQ_FORCED_THREADING
+	select LOCK_MM_AND_FIND_VMA
 	select NEED_PER_CPU_EMBED_FIRST_CHUNK
 	select NEED_PER_CPU_PAGE_FIRST_CHUNK
 	select NEED_SG_DMA_LENGTH

arch/x86/mm/fault.c

Lines changed: 2 additions & 50 deletions

@@ -880,12 +880,6 @@ __bad_area(struct pt_regs *regs, unsigned long error_code,
 	__bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
 }
 
-static noinline void
-bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
-{
-	__bad_area(regs, error_code, address, 0, SEGV_MAPERR);
-}
-
 static inline bool bad_area_access_from_pkeys(unsigned long error_code,
 		struct vm_area_struct *vma)
 {
@@ -1366,59 +1360,17 @@ void do_user_addr_fault(struct pt_regs *regs,
 lock_mmap:
 #endif /* CONFIG_PER_VMA_LOCK */
 
-	/*
-	 * Kernel-mode access to the user address space should only occur
-	 * on well-defined single instructions listed in the exception
-	 * tables.  But, an erroneous kernel fault occurring outside one of
-	 * those areas which also holds mmap_lock might deadlock attempting
-	 * to validate the fault against the address space.
-	 *
-	 * Only do the expensive exception table search when we might be at
-	 * risk of a deadlock.  This happens if we
-	 * 1. Failed to acquire mmap_lock, and
-	 * 2. The access did not originate in userspace.
-	 */
-	if (unlikely(!mmap_read_trylock(mm))) {
-		if (!user_mode(regs) && !search_exception_tables(regs->ip)) {
-			/*
-			 * Fault from code in kernel from
-			 * which we do not expect faults.
-			 */
-			bad_area_nosemaphore(regs, error_code, address);
-			return;
-		}
 retry:
-		mmap_read_lock(mm);
-	} else {
-		/*
-		 * The above down_read_trylock() might have succeeded in
-		 * which case we'll have missed the might_sleep() from
-		 * down_read():
-		 */
-		might_sleep();
-	}
-
-	vma = find_vma(mm, address);
+	vma = lock_mm_and_find_vma(mm, address, regs);
 	if (unlikely(!vma)) {
-		bad_area(regs, error_code, address);
-		return;
-	}
-	if (likely(vma->vm_start <= address))
-		goto good_area;
-	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
-		bad_area(regs, error_code, address);
-		return;
-	}
-	if (unlikely(expand_stack(vma, address))) {
-		bad_area(regs, error_code, address);
+		bad_area_nosemaphore(regs, error_code, address);
 		return;
 	}
 
 	/*
 	 * Ok, we have a good vm_area for this memory access, so
 	 * we can handle it..
 	 */
-good_area:
 	if (unlikely(access_error(error_code, vma))) {
 		bad_area_access_error(regs, error_code, address, vma);
 		return;

include/linux/mm.h

Lines changed: 2 additions & 0 deletions

@@ -2325,6 +2325,8 @@ void unmap_mapping_pages(struct address_space *mapping,
 		pgoff_t start, pgoff_t nr, bool even_cows);
 void unmap_mapping_range(struct address_space *mapping,
 		loff_t const holebegin, loff_t const holelen, int even_cows);
+struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
+			unsigned long address, struct pt_regs *regs);
 #else
 static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
 		unsigned long address, unsigned int flags,

mm/Kconfig

Lines changed: 4 additions & 0 deletions

@@ -1206,6 +1206,10 @@ config PER_VMA_LOCK
 	  This feature allows locking each virtual memory area separately when
 	  handling page faults instead of taking mmap_lock.
 
+config LOCK_MM_AND_FIND_VMA
+	bool
+	depends on !STACK_GROWSUP
+
 source "mm/damon/Kconfig"
 
 endmenu

mm/memory.c

Lines changed: 121 additions & 0 deletions

@@ -5262,6 +5262,127 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 }
 EXPORT_SYMBOL_GPL(handle_mm_fault);
 
+#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
+#include <linux/extable.h>
+
+static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
+{
+	/* Even if this succeeds, make it clear we *might* have slept */
+	if (likely(mmap_read_trylock(mm))) {
+		might_sleep();
+		return true;
+	}
+
+	if (regs && !user_mode(regs)) {
+		unsigned long ip = instruction_pointer(regs);
+		if (!search_exception_tables(ip))
+			return false;
+	}
+
+	mmap_read_lock(mm);
+	return true;
+}
+
+static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
+{
+	/*
+	 * We don't have this operation yet.
+	 *
+	 * It should be easy enough to do: it's basically an
+	 *	atomic_long_try_cmpxchg_acquire()
+	 * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
+	 * it also needs the proper lockdep magic etc.
+	 */
+	return false;
+}
+
+static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
+{
+	mmap_read_unlock(mm);
+	if (regs && !user_mode(regs)) {
+		unsigned long ip = instruction_pointer(regs);
+		if (!search_exception_tables(ip))
+			return false;
+	}
+	mmap_write_lock(mm);
+	return true;
+}
+
+/*
+ * Helper for page fault handling.
+ *
+ * This is kind of equivalent to "mmap_read_lock()" followed
+ * by "find_extend_vma()", except it's a lot more careful about
+ * the locking (and will drop the lock on failure).
+ *
+ * For example, if we have a kernel bug that causes a page
+ * fault, we don't want to just use mmap_read_lock() to get
+ * the mm lock, because that would deadlock if the bug were
+ * to happen while we're holding the mm lock for writing.
+ *
+ * So this checks the exception tables on kernel faults in
+ * order to only do this all for instructions that are actually
+ * expected to fault.
+ *
+ * We can also actually take the mm lock for writing if we
+ * need to extend the vma, which helps the VM layer a lot.
+ */
+struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
+			unsigned long addr, struct pt_regs *regs)
+{
+	struct vm_area_struct *vma;
+
+	if (!get_mmap_lock_carefully(mm, regs))
+		return NULL;
+
+	vma = find_vma(mm, addr);
+	if (likely(vma && (vma->vm_start <= addr)))
+		return vma;
+
+	/*
+	 * Well, dang. We might still be successful, but only
+	 * if we can extend a vma to do so.
+	 */
+	if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
+		mmap_read_unlock(mm);
+		return NULL;
+	}
+
+	/*
+	 * We can try to upgrade the mmap lock atomically,
+	 * in which case we can continue to use the vma
+	 * we already looked up.
+	 *
+	 * Otherwise we'll have to drop the mmap lock and
+	 * re-take it, and also look up the vma again,
+	 * re-checking it.
+	 */
+	if (!mmap_upgrade_trylock(mm)) {
+		if (!upgrade_mmap_lock_carefully(mm, regs))
+			return NULL;
+
+		vma = find_vma(mm, addr);
+		if (!vma)
+			goto fail;
+		if (vma->vm_start <= addr)
+			goto success;
+		if (!(vma->vm_flags & VM_GROWSDOWN))
+			goto fail;
+	}
+
+	if (expand_stack(vma, addr))
+		goto fail;
+
+success:
+	mmap_write_downgrade(mm);
+	return vma;
+
+fail:
+	mmap_write_unlock(mm);
+	return NULL;
+}
+#endif
+
 #ifdef CONFIG_PER_VMA_LOCK
 /*
  * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
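[Editor's note] The mmap_upgrade_trylock() skeleton above describes the intended operation: a single compare-and-exchange from "one reader" to "writer locked". A minimal user-space illustration of that idea with C11 atomics follows; this is a toy lock word, not the kernel's rwsem, and TOY_WRITER and the reader bias of +1 are made-up values for the example (assumes 64-bit long):

/*
 * Toy illustration only: a lock word that counts readers (+1 each)
 * and uses a high bit for "writer locked".
 */
#include <stdatomic.h>
#include <stdbool.h>

#define TOY_WRITER	(1L << 62)	/* made-up writer bit */

struct toy_rwsem {
	atomic_long count;
};

static bool toy_read_trylock(struct toy_rwsem *s)
{
	long c = atomic_load_explicit(&s->count, memory_order_relaxed);

	while (!(c & TOY_WRITER)) {
		/* Add our reader bias unless a writer appeared meanwhile. */
		if (atomic_compare_exchange_weak_explicit(&s->count, &c, c + 1,
				memory_order_acquire, memory_order_relaxed))
			return true;
	}
	return false;
}

/*
 * The upgrade: succeeds only when the lock word is exactly our own
 * reader bias (no other readers, no writer), flipping it to "writer
 * locked" in one atomic step, so the lock is never dropped in between.
 */
static bool toy_upgrade_trylock(struct toy_rwsem *s)
{
	long expected = 1;	/* exactly one reader: us */

	return atomic_compare_exchange_strong_explicit(&s->count,
			&expected, TOY_WRITER,
			memory_order_acquire, memory_order_relaxed);
}

When such an upgrade fails (another reader or writer is present), the caller falls back to the drop-and-relock path, which is exactly what upgrade_mmap_lock_carefully() and the re-lookup in lock_mm_and_find_vma() do above.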
