
Commit 869c63d

mrpre authored and Alexei Starovoitov committed
bpf: Fix race in cpumap on PREEMPT_RT
On PREEMPT_RT kernels, the per-CPU xdp_bulk_queue (bq) can be accessed
concurrently by multiple preemptible tasks on the same CPU.

The original code assumes bq_enqueue() and __cpu_map_flush() run
atomically with respect to each other on the same CPU, relying on
local_bh_disable() to prevent preemption. However, on PREEMPT_RT,
local_bh_disable() only calls migrate_disable() (when
PREEMPT_RT_NEEDS_BH_LOCK is not set) and does not disable preemption.
CFS scheduling can therefore preempt a task during bq_flush_to_queue(),
allowing another task on the same CPU to enter bq_enqueue() and operate
on the same per-CPU bq concurrently.

This leads to several races:

1. Double __list_del_clearprev(): after bq->count is reset in
   bq_flush_to_queue(), a preempting task can call bq_enqueue() ->
   bq_flush_to_queue() on the same bq once bq->count reaches
   CPU_MAP_BULK_SIZE. Both tasks then call __list_del_clearprev() on
   the same bq->flush_node; the second call dereferences the prev
   pointer that was already set to NULL by the first.

2. bq->count and bq->q[] races: concurrent bq_enqueue() can corrupt the
   packet queue while bq_flush_to_queue() is processing it.

The race between task A (__cpu_map_flush -> bq_flush_to_queue) and
task B (bq_enqueue -> bq_flush_to_queue) on the same CPU:

Task A (xdp_do_flush)              Task B (cpu_map_enqueue)
----------------------             ------------------------
bq_flush_to_queue(bq)
  spin_lock(&q->producer_lock)
  /* flush bq->q[] to ptr_ring */
  bq->count = 0
  spin_unlock(&q->producer_lock)
     <-- CFS preempts Task A -->
                                   bq_enqueue(rcpu, xdpf)
                                     bq->q[bq->count++] = xdpf
                                     /* ... more enqueues until full ... */
                                     bq_flush_to_queue(bq)
                                       spin_lock(&q->producer_lock)
                                       /* flush to ptr_ring */
                                       spin_unlock(&q->producer_lock)
                                       __list_del_clearprev(flush_node)
                                       /* sets flush_node.prev = NULL */
     <-- Task A resumes -->
  __list_del_clearprev(flush_node)
    flush_node.prev->next = ...
    /* prev is NULL -> kernel oops */

Fix this by adding a local_lock_t to xdp_bulk_queue and acquiring it in
bq_enqueue() and __cpu_map_flush(). These paths already run under
local_bh_disable(), so use local_lock_nested_bh(), which on non-RT is a
pure annotation with no overhead and on PREEMPT_RT provides a per-CPU
sleeping lock that serializes access to the bq.

To reproduce, insert an mdelay(100) between bq->count = 0 and
__list_del_clearprev() in bq_flush_to_queue(), then run the reproducer
provided by syzkaller.

Fixes: 3253cb4 ("softirq: Allow to drop the softirq-BKL lock on PREEMPT_RT")
Reported-by: syzbot+2b3391f44313b3983e91@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/69369331.a70a0220.38f243.009d.GAE@google.com/T/
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Jiayuan Chen <jiayuan.chen@shopee.com>
Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
Link: https://lore.kernel.org/r/20260225121459.183121-2-jiayuan.chen@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
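For context, here is a minimal sketch of the serialisation pattern the patch applies. struct sketch_bulk_queue, bq_enqueue_sketch() and the eight-entry queue are hypothetical simplifications for illustration, not the kernel's actual cpumap code:

#include <linux/local_lock.h>
#include <linux/percpu.h>

#define SKETCH_BULK_SIZE 8	/* stands in for CPU_MAP_BULK_SIZE */

struct sketch_bulk_queue {
	void *q[SKETCH_BULK_SIZE];
	unsigned int count;
	local_lock_t bq_lock;	/* initialised per CPU with local_lock_init() */
};

static void bq_enqueue_sketch(struct sketch_bulk_queue __percpu *bulkq,
			      void *frame)
{
	struct sketch_bulk_queue *bq;

	/*
	 * On !PREEMPT_RT this is only a lockdep annotation (BH is already
	 * disabled, so there is no extra runtime cost). On PREEMPT_RT it
	 * takes a per-CPU sleeping lock, so a task preempted inside this
	 * section cannot race with another task on the same CPU using the
	 * same bulk queue.
	 */
	local_lock_nested_bh(&bulkq->bq_lock);
	bq = this_cpu_ptr(bulkq);

	if (bq->count == SKETCH_BULK_SIZE)
		bq->count = 0;	/* the real code flushes to the ptr_ring here */

	bq->q[bq->count++] = frame;

	local_unlock_nested_bh(&bulkq->bq_lock);
}

Because the same lock is taken around bq_flush_to_queue() in __cpu_map_flush(), the flush path and the enqueue path can no longer interleave on one CPU the way the race diagram above shows.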
1 parent 5263e30 commit 869c63d

1 file changed: kernel/bpf/cpumap.c (15 additions & 2 deletions)
@@ -29,6 +29,7 @@
 #include <linux/sched.h>
 #include <linux/workqueue.h>
 #include <linux/kthread.h>
+#include <linux/local_lock.h>
 #include <linux/completion.h>
 #include <trace/events/xdp.h>
 #include <linux/btf_ids.h>
@@ -52,6 +53,7 @@ struct xdp_bulk_queue {
 	struct list_head flush_node;
 	struct bpf_cpu_map_entry *obj;
 	unsigned int count;
+	local_lock_t bq_lock;
 };
 
 /* Struct for every remote "destination" CPU in map */
@@ -451,6 +453,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
 	for_each_possible_cpu(i) {
 		bq = per_cpu_ptr(rcpu->bulkq, i);
 		bq->obj = rcpu;
+		local_lock_init(&bq->bq_lock);
 	}
 
 	/* Alloc queue */
@@ -722,6 +725,8 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
 	struct ptr_ring *q;
 	int i;
 
+	lockdep_assert_held(&bq->bq_lock);
+
 	if (unlikely(!bq->count))
 		return;
 
@@ -749,11 +754,15 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
 }
 
 /* Runs under RCU-read-side, plus in softirq under NAPI protection.
- * Thus, safe percpu variable access.
+ * Thus, safe percpu variable access. PREEMPT_RT relies on
+ * local_lock_nested_bh() to serialise access to the per-CPU bq.
  */
 static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
 {
-	struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
+	struct xdp_bulk_queue *bq;
+
+	local_lock_nested_bh(&rcpu->bulkq->bq_lock);
+	bq = this_cpu_ptr(rcpu->bulkq);
 
 	if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
 		bq_flush_to_queue(bq);
@@ -774,6 +783,8 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
 
 		list_add(&bq->flush_node, flush_list);
 	}
+
+	local_unlock_nested_bh(&rcpu->bulkq->bq_lock);
 }
 
 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
@@ -810,7 +821,9 @@ void __cpu_map_flush(struct list_head *flush_list)
 	struct xdp_bulk_queue *bq, *tmp;
 
 	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
+		local_lock_nested_bh(&bq->obj->bulkq->bq_lock);
 		bq_flush_to_queue(bq);
+		local_unlock_nested_bh(&bq->obj->bulkq->bq_lock);
 
 		/* If already running, costs spin_lock_irqsave + smb_mb */
 		wake_up_process(bq->obj->kthread);
