Skip to content

Commit 8c0d9e1

Browse files
author
Alexei Starovoitov
committed
Merge branch 'bpf-cpumap-devmap-fix-per-cpu-bulk-queue-races-on-preempt_rt'
Jiayuan Chen says: ==================== bpf: Fix per-CPU bulk queue races on PREEMPT_RT On PREEMPT_RT kernels, local_bh_disable() only calls migrate_disable() (when PREEMPT_RT_NEEDS_BH_LOCK is not set) and does not disable preemption. This means CFS scheduling can preempt a task inside the per-CPU bulk queue (bq) operations in cpumap and devmap, allowing another task on the same CPU to concurrently access the same bq, leading to use-after-free, list corruption, and kernel panics. Patch 1 fixes the cpumap race in bq_flush_to_queue(), originally reported by syzbot [1]. Patch 2 fixes the same class of race in devmap's bq_xmit_all(), identified by code inspection after Sebastian Andrzej Siewior pointed out that devmap has the same per-CPU bulk queue pattern [2]. Both patches use local_lock_nested_bh() to serialize access to the per-CPU bq. On non-RT this is a pure lockdep annotation with no overhead; on PREEMPT_RT it provides a per-CPU sleeping lock. To reproduce the devmap race, insert an mdelay(100) in bq_xmit_all() after "cnt = bq->count" and before the actual transmit loop. Then pin two threads to the same CPU, each running BPF_PROG_TEST_RUN with an XDP program that redirects to a DEVMAP entry (e.g. a veth pair). CFS timeslicing during the mdelay window causes interleaving. 
Without the fix, KASAN reports null-ptr-deref due to operating on freed frames: BUG: KASAN: null-ptr-deref in __build_skb_around+0x22d/0x340 Write of size 32 at addr 0000000000000d50 by task devmap_race_rep/449 CPU: 0 UID: 0 PID: 449 Comm: devmap_race_rep Not tainted 6.19.0+ #31 PREEMPT_RT Call Trace: <TASK> __build_skb_around+0x22d/0x340 build_skb_around+0x25/0x260 __xdp_build_skb_from_frame+0x103/0x860 veth_xdp_rcv_bulk_skb.isra.0+0x162/0x320 veth_xdp_rcv.constprop.0+0x61e/0xbb0 veth_poll+0x280/0xb50 __napi_poll.constprop.0+0xa5/0x590 net_rx_action+0x4b0/0xea0 handle_softirqs.isra.0+0x1b3/0x780 __local_bh_enable_ip+0x12a/0x240 xdp_test_run_batch.constprop.0+0xedd/0x1f60 bpf_test_run_xdp_live+0x304/0x640 bpf_prog_test_run_xdp+0xd24/0x1b70 __sys_bpf+0x61c/0x3e00 </TASK> Kernel panic - not syncing: Fatal exception in interrupt [1] https://lore.kernel.org/all/69369331.a70a0220.38f243.009d.GAE@google.com/T/ [2] https://lore.kernel.org/bpf/20260212023634.366343-1-jiayuan.chen@linux.dev/ v3 -> v4: https://lore.kernel.org/all/20260213034018.284146-1-jiayuan.chen@linux.dev/ - Move panic trace to cover letter. (Sebastian Andrzej Siewior) - Add Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> to both patches from cover letter. v2 -> v3: https://lore.kernel.org/bpf/20260212023634.366343-1-jiayuan.chen@linux.dev/ - Fix commit message: remove incorrect "spin_lock() becomes rt_mutex" claim, the per-CPU bq has no spin_lock at all. (Sebastian Andrzej Siewior) - Fix commit message: accurately describe local_lock_nested_bh() behavior instead of referencing local_lock(). (Sebastian Andrzej Siewior) - Remove incomplete discussion of snapshot alternative. (Sebastian Andrzej Siewior) - Remove panic trace from commit message. (Sebastian Andrzej Siewior) - Add patch 2/2 for devmap, same race pattern. 
(Sebastian Andrzej Siewior) v1 -> v2: https://lore.kernel.org/bpf/20260211064417.196401-1-jiayuan.chen@linux.dev/ - Use local_lock_nested_bh()/local_unlock_nested_bh() instead of local_lock()/local_unlock(), since these paths already run under local_bh_disable(). (Sebastian Andrzej Siewior) - Replace "Caller must hold bq->bq_lock" comment with lockdep_assert_held() in bq_flush_to_queue(). (Sebastian Andrzej Siewior) - Fix Fixes tag to 3253cb4 ("softirq: Allow to drop the softirq-BKL lock on PREEMPT_RT") which is the actual commit that makes the race possible. (Sebastian Andrzej Siewior) ==================== Link: https://patch.msgid.link/20260225121459.183121-1-jiayuan.chen@linux.dev Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2 parents 5263e30 + 1872e75 commit 8c0d9e1

2 files changed

Lines changed: 36 additions & 6 deletions

File tree

kernel/bpf/cpumap.c

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
 #include <linux/sched.h>
 #include <linux/workqueue.h>
 #include <linux/kthread.h>
+#include <linux/local_lock.h>
 #include <linux/completion.h>
 #include <trace/events/xdp.h>
 #include <linux/btf_ids.h>
@@ -52,6 +53,7 @@ struct xdp_bulk_queue {
 	struct list_head flush_node;
 	struct bpf_cpu_map_entry *obj;
 	unsigned int count;
+	local_lock_t bq_lock;
 };
 
 /* Struct for every remote "destination" CPU in map */
@@ -451,6 +453,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
 	for_each_possible_cpu(i) {
 		bq = per_cpu_ptr(rcpu->bulkq, i);
 		bq->obj = rcpu;
+		local_lock_init(&bq->bq_lock);
 	}
 
 	/* Alloc queue */
@@ -722,6 +725,8 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
 	struct ptr_ring *q;
 	int i;
 
+	lockdep_assert_held(&bq->bq_lock);
+
 	if (unlikely(!bq->count))
 		return;
 
@@ -749,11 +754,15 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
 }
 
 /* Runs under RCU-read-side, plus in softirq under NAPI protection.
- * Thus, safe percpu variable access.
+ * Thus, safe percpu variable access. PREEMPT_RT relies on
+ * local_lock_nested_bh() to serialise access to the per-CPU bq.
  */
 static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
 {
-	struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
+	struct xdp_bulk_queue *bq;
+
+	local_lock_nested_bh(&rcpu->bulkq->bq_lock);
+	bq = this_cpu_ptr(rcpu->bulkq);
 
 	if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
 		bq_flush_to_queue(bq);
@@ -774,6 +783,8 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
 
 		list_add(&bq->flush_node, flush_list);
 	}
+
+	local_unlock_nested_bh(&rcpu->bulkq->bq_lock);
 }
 
 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
@@ -810,7 +821,9 @@ void __cpu_map_flush(struct list_head *flush_list)
 	struct xdp_bulk_queue *bq, *tmp;
 
 	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
+		local_lock_nested_bh(&bq->obj->bulkq->bq_lock);
 		bq_flush_to_queue(bq);
+		local_unlock_nested_bh(&bq->obj->bulkq->bq_lock);
 
 		/* If already running, costs spin_lock_irqsave + smb_mb */
 		wake_up_process(bq->obj->kthread);

kernel/bpf/devmap.c

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
  * types of devmap; only the lookup and insertion is different.
  */
 #include <linux/bpf.h>
+#include <linux/local_lock.h>
 #include <net/xdp.h>
 #include <linux/filter.h>
 #include <trace/events/xdp.h>
@@ -60,6 +61,7 @@ struct xdp_dev_bulk_queue {
 	struct net_device *dev_rx;
 	struct bpf_prog *xdp_prog;
 	unsigned int count;
+	local_lock_t bq_lock;
 };
 
 struct bpf_dtab_netdev {
@@ -381,6 +383,8 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
 	int to_send = cnt;
 	int i;
 
+	lockdep_assert_held(&bq->bq_lock);
+
 	if (unlikely(!cnt))
 		return;
 
@@ -425,10 +429,12 @@ void __dev_flush(struct list_head *flush_list)
 	struct xdp_dev_bulk_queue *bq, *tmp;
 
 	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
+		local_lock_nested_bh(&bq->dev->xdp_bulkq->bq_lock);
 		bq_xmit_all(bq, XDP_XMIT_FLUSH);
 		bq->dev_rx = NULL;
 		bq->xdp_prog = NULL;
 		__list_del_clearprev(&bq->flush_node);
+		local_unlock_nested_bh(&bq->dev->xdp_bulkq->bq_lock);
 	}
 }
 
@@ -451,12 +457,16 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 
 /* Runs in NAPI, i.e., softirq under local_bh_disable(). Thus, safe percpu
  * variable access, and map elements stick around. See comment above
- * xdp_do_flush() in filter.c.
+ * xdp_do_flush() in filter.c. PREEMPT_RT relies on local_lock_nested_bh()
+ * to serialise access to the per-CPU bq.
  */
 static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
 		       struct net_device *dev_rx, struct bpf_prog *xdp_prog)
 {
-	struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
+	struct xdp_dev_bulk_queue *bq;
+
+	local_lock_nested_bh(&dev->xdp_bulkq->bq_lock);
+	bq = this_cpu_ptr(dev->xdp_bulkq);
 
 	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
 		bq_xmit_all(bq, 0);
@@ -477,6 +487,8 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
 	}
 
 	bq->q[bq->count++] = xdpf;
+
+	local_unlock_nested_bh(&dev->xdp_bulkq->bq_lock);
 }
 
 static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
@@ -1127,8 +1139,13 @@ static int dev_map_notification(struct notifier_block *notifier,
 		if (!netdev->xdp_bulkq)
 			return NOTIFY_BAD;
 
-		for_each_possible_cpu(cpu)
-			per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev;
+		for_each_possible_cpu(cpu) {
+			struct xdp_dev_bulk_queue *bq;
+
+			bq = per_cpu_ptr(netdev->xdp_bulkq, cpu);
+			bq->dev = netdev;
+			local_lock_init(&bq->bq_lock);
+		}
 		break;
 	case NETDEV_UNREGISTER:
 		/* This rcu_read_lock/unlock pair is needed because

0 commit comments

Comments
 (0)