Skip to content

Commit 3098f8f

Browse files
tehcaster authored and gregkh committed
mm/page_alloc: prevent pcp corruption with SMP=n
commit 038a102 upstream. The kernel test robot has reported: BUG: spinlock trylock failure on UP on CPU#0, kcompactd0/28 lock: 0xffff888807e35ef0, .magic: dead4ead, .owner: kcompactd0/28, .owner_cpu: 0 CPU: 0 UID: 0 PID: 28 Comm: kcompactd0 Not tainted 6.18.0-rc5-00127-ga06157804399 #1 PREEMPT 8cc09ef94dcec767faa911515ce9e609c45db470 Call Trace: <IRQ> __dump_stack (lib/dump_stack.c:95) dump_stack_lvl (lib/dump_stack.c:123) dump_stack (lib/dump_stack.c:130) spin_dump (kernel/locking/spinlock_debug.c:71) do_raw_spin_trylock (kernel/locking/spinlock_debug.c:?) _raw_spin_trylock (include/linux/spinlock_api_smp.h:89 kernel/locking/spinlock.c:138) __free_frozen_pages (mm/page_alloc.c:2973) ___free_pages (mm/page_alloc.c:5295) __free_pages (mm/page_alloc.c:5334) tlb_remove_table_rcu (include/linux/mm.h:? include/linux/mm.h:3122 include/asm-generic/tlb.h:220 mm/mmu_gather.c:227 mm/mmu_gather.c:290) ? __cfi_tlb_remove_table_rcu (mm/mmu_gather.c:289) ? rcu_core (kernel/rcu/tree.c:?) rcu_core (include/linux/rcupdate.h:341 kernel/rcu/tree.c:2607 kernel/rcu/tree.c:2861) rcu_core_si (kernel/rcu/tree.c:2879) handle_softirqs (arch/x86/include/asm/jump_label.h:36 include/trace/events/irq.h:142 kernel/softirq.c:623) __irq_exit_rcu (arch/x86/include/asm/jump_label.h:36 kernel/softirq.c:725) irq_exit_rcu (kernel/softirq.c:741) sysvec_apic_timer_interrupt (arch/x86/kernel/apic/apic.c:1052) </IRQ> <TASK> RIP: 0010:_raw_spin_unlock_irqrestore (arch/x86/include/asm/preempt.h:95 include/linux/spinlock_api_smp.h:152 kernel/locking/spinlock.c:194) free_pcppages_bulk (mm/page_alloc.c:1494) drain_pages_zone (include/linux/spinlock.h:391 mm/page_alloc.c:2632) __drain_all_pages (mm/page_alloc.c:2731) drain_all_pages (mm/page_alloc.c:2747) kcompactd (mm/compaction.c:3115) kthread (kernel/kthread.c:465) ? __cfi_kcompactd (mm/compaction.c:3166) ? __cfi_kthread (kernel/kthread.c:412) ret_from_fork (arch/x86/kernel/process.c:164) ? 
__cfi_kthread (kernel/kthread.c:412) ret_from_fork_asm (arch/x86/entry/entry_64.S:255) </TASK> Matthew has analyzed the report and identified that in drain_pages_zone() we are in a section protected by spin_lock(&pcp->lock) and then get an interrupt that attempts spin_trylock() on the same lock. The code is designed to work this way without disabling IRQs and occasionally fail the trylock with a fallback. However, the SMP=n spinlock implementation assumes spin_trylock() will always succeed, and thus it's normally a no-op. Here the enabled lock debugging catches the problem, but otherwise it could cause a corruption of the pcp structure. The problem has been introduced by commit 5749077 ("mm/page_alloc: leave IRQs enabled for per-cpu page allocations"). The pcp locking scheme recognizes the need for disabling IRQs to prevent nesting spin_trylock() sections on SMP=n, but the need to prevent the nesting in spin_lock() has not been recognized. Fix it by introducing local wrappers that change the spin_lock() to spin_lock_irqsave() with SMP=n and use them in all places that do spin_lock(&pcp->lock). 
[vbabka@suse.cz: add pcp_ prefix to the spin_lock_irqsave wrappers, per Steven] Link: https://lkml.kernel.org/r/20260105-fix-pcp-up-v1-1-5579662d2071@suse.cz Fixes: 5749077 ("mm/page_alloc: leave IRQs enabled for per-cpu page allocations") Signed-off-by: Vlastimil Babka <vbabka@suse.cz> Reported-by: kernel test robot <oliver.sang@intel.com> Closes: https://lore.kernel.org/oe-lkp/202512101320.e2f2dd6f-lkp@intel.com Analyzed-by: Matthew Wilcox <willy@infradead.org> Link: https://lore.kernel.org/all/aUW05pyc9nZkvY-1@casper.infradead.org/ Acked-by: Mel Gorman <mgorman@techsingularity.net> Cc: Brendan Jackman <jackmanb@google.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Michal Hocko <mhocko@suse.com> Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Zi Yan <ziy@nvidia.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Sasha Levin <sashal@kernel.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
1 parent baea249 commit 3098f8f

1 file changed

Lines changed: 39 additions & 8 deletions

File tree

mm/page_alloc.c

Lines changed: 39 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,33 @@ static DEFINE_MUTEX(pcp_batch_high_lock);
166166
#define pcp_spin_unlock(ptr) \
167167
pcpu_spin_unlock(lock, ptr)
168168

169+
/*
170+
* With the UP spinlock implementation, when we spin_lock(&pcp->lock) (e.g. for
171+
* a potentially remote cpu drain) and get interrupted by an operation that
172+
* attempts pcp_spin_trylock(), we can't rely on the trylock failure due to UP
173+
* spinlock assumptions making the trylock a no-op. So we have to turn that
174+
* spin_lock() to a spin_lock_irqsave(). This works because on UP there are no
175+
* remote CPUs so we can only be locking the only existing local one.
176+
*/
177+
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
178+
static inline void __flags_noop(unsigned long *flags) { }
179+
#define pcp_spin_lock_maybe_irqsave(ptr, flags) \
180+
({ \
181+
__flags_noop(&(flags)); \
182+
spin_lock(&(ptr)->lock); \
183+
})
184+
#define pcp_spin_unlock_maybe_irqrestore(ptr, flags) \
185+
({ \
186+
spin_unlock(&(ptr)->lock); \
187+
__flags_noop(&(flags)); \
188+
})
189+
#else
190+
#define pcp_spin_lock_maybe_irqsave(ptr, flags) \
191+
spin_lock_irqsave(&(ptr)->lock, flags)
192+
#define pcp_spin_unlock_maybe_irqrestore(ptr, flags) \
193+
spin_unlock_irqrestore(&(ptr)->lock, flags)
194+
#endif
195+
169196
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
170197
DEFINE_PER_CPU(int, numa_node);
171198
EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -2555,6 +2582,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
25552582
bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
25562583
{
25572584
int high_min, to_drain, to_drain_batched, batch;
2585+
unsigned long UP_flags;
25582586
bool todo = false;
25592587

25602588
high_min = READ_ONCE(pcp->high_min);
@@ -2574,9 +2602,9 @@ bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
25742602
to_drain = pcp->count - pcp->high;
25752603
while (to_drain > 0) {
25762604
to_drain_batched = min(to_drain, batch);
2577-
spin_lock(&pcp->lock);
2605+
pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
25782606
free_pcppages_bulk(zone, to_drain_batched, pcp, 0);
2579-
spin_unlock(&pcp->lock);
2607+
pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
25802608
todo = true;
25812609

25822610
to_drain -= to_drain_batched;
@@ -2593,14 +2621,15 @@ bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
25932621
*/
25942622
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
25952623
{
2624+
unsigned long UP_flags;
25962625
int to_drain, batch;
25972626

25982627
batch = READ_ONCE(pcp->batch);
25992628
to_drain = min(pcp->count, batch);
26002629
if (to_drain > 0) {
2601-
spin_lock(&pcp->lock);
2630+
pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
26022631
free_pcppages_bulk(zone, to_drain, pcp, 0);
2603-
spin_unlock(&pcp->lock);
2632+
pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
26042633
}
26052634
}
26062635
#endif
@@ -2611,10 +2640,11 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
26112640
static void drain_pages_zone(unsigned int cpu, struct zone *zone)
26122641
{
26132642
struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
2643+
unsigned long UP_flags;
26142644
int count;
26152645

26162646
do {
2617-
spin_lock(&pcp->lock);
2647+
pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
26182648
count = pcp->count;
26192649
if (count) {
26202650
int to_drain = min(count,
@@ -2623,7 +2653,7 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
26232653
free_pcppages_bulk(zone, to_drain, pcp, 0);
26242654
count -= to_drain;
26252655
}
2626-
spin_unlock(&pcp->lock);
2656+
pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
26272657
} while (count);
26282658
}
26292659

@@ -6081,6 +6111,7 @@ static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu)
60816111
{
60826112
struct per_cpu_pages *pcp;
60836113
struct cpu_cacheinfo *cci;
6114+
unsigned long UP_flags;
60846115

60856116
pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
60866117
cci = get_cpu_cacheinfo(cpu);
@@ -6091,12 +6122,12 @@ static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu)
60916122
* This can reduce zone lock contention without hurting
60926123
* cache-hot pages sharing.
60936124
*/
6094-
spin_lock(&pcp->lock);
6125+
pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
60956126
if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch)
60966127
pcp->flags |= PCPF_FREE_HIGH_BATCH;
60976128
else
60986129
pcp->flags &= ~PCPF_FREE_HIGH_BATCH;
6099-
spin_unlock(&pcp->lock);
6130+
pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
61006131
}
61016132

61026133
void setup_pcp_cacheinfo(unsigned int cpu)

0 commit comments

Comments
 (0)