
Commit 1872e75

mrpre authored and Alexei Starovoitov committed
bpf: Fix race in devmap on PREEMPT_RT
On PREEMPT_RT kernels, the per-CPU xdp_dev_bulk_queue (bq) can be
accessed concurrently by multiple preemptible tasks on the same CPU.

The original code assumes bq_enqueue() and __dev_flush() run atomically
with respect to each other on the same CPU, relying on
local_bh_disable() to prevent preemption. However, on PREEMPT_RT,
local_bh_disable() only calls migrate_disable() (when
PREEMPT_RT_NEEDS_BH_LOCK is not set) and does not disable preemption.
CFS scheduling can therefore preempt a task during bq_xmit_all(),
allowing another task on the same CPU to enter bq_enqueue() and operate
on the same per-CPU bq concurrently.

This leads to several races:

1. Double-free / use-after-free on bq->q[]: bq_xmit_all() snapshots
   cnt = bq->count, then iterates bq->q[0..cnt-1] to transmit frames.
   If preempted after the snapshot, a second task can call
   bq_enqueue() -> bq_xmit_all() on the same bq, transmitting (and
   freeing) the same frames. When the first task resumes, it operates
   on stale pointers in bq->q[], causing a use-after-free.

2. bq->count and bq->q[] corruption: a concurrent bq_enqueue() can
   modify bq->count and bq->q[] while bq_xmit_all() is reading them.

3. dev_rx/xdp_prog teardown race: __dev_flush() clears bq->dev_rx and
   bq->xdp_prog after bq_xmit_all(). If preempted between the
   bq_xmit_all() return and bq->dev_rx = NULL, a preempting
   bq_enqueue() sees dev_rx still set (non-NULL), skips adding bq to
   the flush_list, and enqueues a frame. When __dev_flush() resumes,
   it clears dev_rx and removes bq from the flush_list, orphaning the
   newly enqueued frame.

4. __list_del_clearprev() on flush_node: similar to the cpumap race,
   both tasks can call __list_del_clearprev() on the same flush_node;
   the second dereferences the prev pointer already set to NULL.

The race between task A (__dev_flush -> bq_xmit_all) and task B
(bq_enqueue -> bq_xmit_all) on the same CPU:

Task A (xdp_do_flush)              Task B (ndo_xdp_xmit redirect)
---------------------              ------------------------------
__dev_flush(flush_list)
  bq_xmit_all(bq)
    cnt = bq->count /* e.g. 16 */
    /* start iterating bq->q[] */
    <-- CFS preempts Task A -->
                                   bq_enqueue(dev, xdpf)
                                     bq->count == DEV_MAP_BULK_SIZE
                                     bq_xmit_all(bq, 0)
                                       cnt = bq->count /* same 16! */
                                       ndo_xdp_xmit(bq->q[])
                                       /* frames freed by driver */
                                       bq->count = 0
    <-- Task A resumes -->
    ndo_xdp_xmit(bq->q[])
    /* use-after-free: frames already freed! */

Fix this by adding a local_lock_t to xdp_dev_bulk_queue and acquiring
it in bq_enqueue() and __dev_flush(). These paths already run under
local_bh_disable(), so use local_lock_nested_bh(), which on non-RT is
a pure annotation with no overhead, and on PREEMPT_RT provides a
per-CPU sleeping lock that serializes access to the bq.

Fixes: 3253cb4 ("softirq: Allow to drop the softirq-BKL lock on PREEMPT_RT")
Reported-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Jiayuan Chen <jiayuan.chen@shopee.com>
Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
Link: https://lore.kernel.org/r/20260225121459.183121-3-jiayuan.chen@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
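For orientation, here is a condensed sketch of the locking pattern the
patch applies, distilled from the diff below. It is kernel-context code
rather than a standalone build, and the example_* helpers are
illustrative names, not functions from the patch:

#include <linux/local_lock.h>
#include <linux/netdevice.h>

/* The patch adds bq_lock; the other members are elided here. */
struct xdp_dev_bulk_queue {
	unsigned int count;
	local_lock_t bq_lock;	/* serializes access to this per-CPU bq */
};

/* Initialize the lock once per possible CPU, as the patch does at
 * NETDEV_REGISTER time.
 */
static void example_init(struct xdp_dev_bulk_queue __percpu *bulkq)
{
	int cpu;

	for_each_possible_cpu(cpu)
		local_lock_init(&per_cpu_ptr(bulkq, cpu)->bq_lock);
}

/* Critical-section pattern used by bq_enqueue() and __dev_flush().
 * Both paths already run under local_bh_disable(), so the nested_bh
 * variant is a pure annotation on !PREEMPT_RT and a per-CPU sleeping
 * lock on PREEMPT_RT.
 */
static void example_touch_bq(struct xdp_dev_bulk_queue __percpu *bulkq)
{
	struct xdp_dev_bulk_queue *bq;

	local_lock_nested_bh(&bulkq->bq_lock);
	bq = this_cpu_ptr(bulkq);
	bq->count++;	/* stands in for the real enqueue/flush work */
	local_unlock_nested_bh(&bulkq->bq_lock);
}

Note the design choice: no new lock ordering is introduced, since the
lock only nests inside sections that already run with BH disabled,
which is exactly the context local_lock_nested_bh() is meant for.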
1 parent 869c63d · commit 1872e75

1 file changed

kernel/bpf/devmap.c

Lines changed: 21 additions & 4 deletions
@@ -45,6 +45,7 @@
  * types of devmap; only the lookup and insertion is different.
  */
 #include <linux/bpf.h>
+#include <linux/local_lock.h>
 #include <net/xdp.h>
 #include <linux/filter.h>
 #include <trace/events/xdp.h>
@@ -60,6 +61,7 @@ struct xdp_dev_bulk_queue {
 	struct net_device *dev_rx;
 	struct bpf_prog *xdp_prog;
 	unsigned int count;
+	local_lock_t bq_lock;
 };
 
 struct bpf_dtab_netdev {
@@ -381,6 +383,8 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
 	int to_send = cnt;
 	int i;
 
+	lockdep_assert_held(&bq->bq_lock);
+
 	if (unlikely(!cnt))
 		return;
 
@@ -425,10 +429,12 @@ void __dev_flush(struct list_head *flush_list)
 	struct xdp_dev_bulk_queue *bq, *tmp;
 
 	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
+		local_lock_nested_bh(&bq->dev->xdp_bulkq->bq_lock);
 		bq_xmit_all(bq, XDP_XMIT_FLUSH);
 		bq->dev_rx = NULL;
 		bq->xdp_prog = NULL;
 		__list_del_clearprev(&bq->flush_node);
+		local_unlock_nested_bh(&bq->dev->xdp_bulkq->bq_lock);
 	}
 }
 
@@ -451,12 +457,16 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 
 /* Runs in NAPI, i.e., softirq under local_bh_disable(). Thus, safe percpu
  * variable access, and map elements stick around. See comment above
- * xdp_do_flush() in filter.c.
+ * xdp_do_flush() in filter.c. PREEMPT_RT relies on local_lock_nested_bh()
+ * to serialise access to the per-CPU bq.
  */
 static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
 		       struct net_device *dev_rx, struct bpf_prog *xdp_prog)
 {
-	struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
+	struct xdp_dev_bulk_queue *bq;
+
+	local_lock_nested_bh(&dev->xdp_bulkq->bq_lock);
+	bq = this_cpu_ptr(dev->xdp_bulkq);
 
 	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
 		bq_xmit_all(bq, 0);
@@ -477,6 +487,8 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
 	}
 
 	bq->q[bq->count++] = xdpf;
+
+	local_unlock_nested_bh(&dev->xdp_bulkq->bq_lock);
 }
 
 static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
@@ -1127,8 +1139,13 @@ static int dev_map_notification(struct notifier_block *notifier,
 		if (!netdev->xdp_bulkq)
 			return NOTIFY_BAD;
 
-		for_each_possible_cpu(cpu)
-			per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev;
+		for_each_possible_cpu(cpu) {
+			struct xdp_dev_bulk_queue *bq;
+
+			bq = per_cpu_ptr(netdev->xdp_bulkq, cpu);
+			bq->dev = netdev;
+			local_lock_init(&bq->bq_lock);
+		}
 		break;
 	case NETDEV_UNREGISTER:
 		/* This rcu_read_lock/unlock pair is needed because
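One detail worth calling out from the hunks above: the
lockdep_assert_held() added at the top of bq_xmit_all() encodes the new
contract that its callers in this file, bq_enqueue() and __dev_flush(),
already hold bq_lock. A minimal sketch of what that buys (illustrative
only, function body elided):

/* Contract: the caller holds bq->bq_lock. With CONFIG_LOCKDEP=y any
 * future call path that forgets the lock warns immediately; without
 * lockdep the assertion compiles away.
 */
static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
{
	lockdep_assert_held(&bq->bq_lock);
	/* ... transmit bq->q[0..bq->count-1] and reset bq->count ... */
}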
