
Commit 5263e30

Author: Alexei Starovoitov (committed)
Merge branch 'close-race-in-freeing-special-fields-and-map-value'
Kumar Kartikeya Dwivedi says:

====================
Close race in freeing special fields and map value

There exists a race across various map types where the freeing of special
fields (tw, timer, wq, kptr, etc.) can be done eagerly when a logical delete
operation is done on a map value, such that a program which continues to have
access to such a map value can recreate the fields and cause them to leak.
This set contains fixes for that case. It is a continuation of Mykyta's
previous attempt in [0], but applies to all fields. A test is included which
reproduces the bug reliably in the absence of the fixes.

Local Storage Benchmarks
------------------------

Evaluation Setup:
Benchmarked on a dual-socket Intel Xeon Gold 6348 (Ice Lake) @ 2.60GHz
(56 cores / 112 threads), with the CPU governor set to performance. Bench was
pinned to a single NUMA node throughout the test.

Benchmark comes from [1] using the following command:
./bench -p 1 local-storage-create --storage-type <socket,task> --batch-size <16,32,64>

Before the test, 10 runs of all cases ([socket|task] x 3 batch sizes x
7 iterations per batch size) are done to warm up and prime the machine.
Then, 3 runs of all cases are done (with and without the patch, across
reboots). For each comparison, we have 21 samples, i.e. per batch size
(e.g. socket 16) of a given local storage, we have 3 runs x 7 iterations.
The statistics (mean, median, stddev) and t-test are done for each scenario
(local storage and batch size pair) individually (21 samples for either
case). All values are for local storage creations in thousand creations /
sec (k/s).

            Baseline (without patch)       With patch                      Delta
Case        Median    Mean    Std. Dev.    Median    Mean    Std. Dev.    Median        %
---------------------------------------------------------------------------------------------------
socket 16   432.026   431.941     1.047    431.347   431.953     1.635    -0.679   -0.16%
socket 32   432.641   432.818     1.535    432.488   432.302     1.508    -0.153   -0.04%
socket 64   431.504   431.996     1.337    429.145   430.326     2.469    -2.359   -0.55%
task 16      38.816    39.382     1.456     39.657    39.337     1.831    +0.841   +2.17%
task 32      38.815    39.644     2.690     38.721    39.122     1.636    -0.094   -0.24%
task 64      37.562    38.080     1.701     39.554    38.563     1.689    +1.992   +5.30%

The cases for socket are within the range of noise, and improvements in task
local storage are due to high variance (CV ~4%-6% across batch sizes). The
only statistically significant case worth mentioning is socket with batch
size 64, with p-value from t-test < 0.05, but the absolute difference is
small (~2k/s). TL;DR: there doesn't appear to be any significant regression
or improvement.

[0]: https://lore.kernel.org/bpf/20260216131341.1285427-1-mykyta.yatsenko5@gmail.com
[1]: https://lore.kernel.org/bpf/20260205222916.1788211-1-ameryhung@gmail.com

Changelog:
----------
v2 -> v3
v2: https://lore.kernel.org/bpf/20260227052031.3988575-1-memxor@gmail.com
 * Add syzbot Tested-by.
 * Add Amery's Reviewed-by.
 * Fix missing rcu_dereference_check() in __bpf_selem_free_rcu. (BPF CI Bot)
 * Remove migrate_disable() in bpf_selem_free_rcu. (Alexei)

v1 -> v2
v1: https://lore.kernel.org/bpf/20260225185121.2057388-1-memxor@gmail.com
 * Add Paul's Reviewed-by.
 * Fix use-after-free in accessing bpf_mem_alloc embedded in map. (syzbot CI)
 * Add benchmark numbers for local storage.
 * Add extra test case for per-cpu hashmap coverage with up to 16 refcount leaks.
 * Target bpf tree.
====================

Link: https://patch.msgid.link/20260227224806.646888-1-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2 parents 6881af2 + 2939d7b, commit 5263e30

15 files changed: 603 additions & 57 deletions


include/linux/bpf.h

Lines changed: 2 additions & 2 deletions
@@ -124,7 +124,7 @@ struct bpf_map_ops {
 	u32 (*map_fd_sys_lookup_elem)(void *ptr);
 	void (*map_seq_show_elem)(struct bpf_map *map, void *key,
 				  struct seq_file *m);
-	int (*map_check_btf)(const struct bpf_map *map,
+	int (*map_check_btf)(struct bpf_map *map,
 			     const struct btf *btf,
 			     const struct btf_type *key_type,
 			     const struct btf_type *value_type);
@@ -656,7 +656,7 @@ static inline bool bpf_map_support_seq_show(const struct bpf_map *map)
 	       map->ops->map_seq_show_elem;
 }

-int map_check_no_btf(const struct bpf_map *map,
+int map_check_no_btf(struct bpf_map *map,
 		     const struct btf *btf,
 		     const struct btf_type *key_type,
 		     const struct btf_type *value_type);

include/linux/bpf_local_storage.h

Lines changed: 1 addition & 1 deletion
@@ -176,7 +176,7 @@ u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage);
 void bpf_local_storage_map_free(struct bpf_map *map,
 				struct bpf_local_storage_cache *cache);

-int bpf_local_storage_map_check_btf(const struct bpf_map *map,
+int bpf_local_storage_map_check_btf(struct bpf_map *map,
 				    const struct btf *btf,
 				    const struct btf_type *key_type,
 				    const struct btf_type *value_type);

include/linux/bpf_mem_alloc.h

Lines changed: 6 additions & 0 deletions
@@ -14,6 +14,8 @@ struct bpf_mem_alloc {
 	struct obj_cgroup *objcg;
 	bool percpu;
 	struct work_struct work;
+	void (*dtor_ctx_free)(void *ctx);
+	void *dtor_ctx;
 };

 /* 'size != 0' is for bpf_mem_alloc which manages fixed-size objects.
@@ -32,6 +34,10 @@ int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg
 /* The percpu allocation with a specific unit size. */
 int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size);
 void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
+void bpf_mem_alloc_set_dtor(struct bpf_mem_alloc *ma,
+			    void (*dtor)(void *obj, void *ctx),
+			    void (*dtor_ctx_free)(void *ctx),
+			    void *ctx);

 /* Check the allocation size for kmalloc equivalent allocator */
 int bpf_mem_alloc_check_size(bool percpu, size_t size);

kernel/bpf/arena.c

Lines changed: 1 addition & 1 deletion
@@ -303,7 +303,7 @@ static long arena_map_update_elem(struct bpf_map *map, void *key,
 	return -EOPNOTSUPP;
 }

-static int arena_map_check_btf(const struct bpf_map *map, const struct btf *btf,
+static int arena_map_check_btf(struct bpf_map *map, const struct btf *btf,
 			       const struct btf_type *key_type, const struct btf_type *value_type)
 {
 	return 0;

kernel/bpf/arraymap.c

Lines changed: 1 addition & 1 deletion
@@ -548,7 +548,7 @@ static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key,
 	rcu_read_unlock();
 }

-static int array_map_check_btf(const struct bpf_map *map,
+static int array_map_check_btf(struct bpf_map *map,
 			       const struct btf *btf,
 			       const struct btf_type *key_type,
 			       const struct btf_type *value_type)

kernel/bpf/bloom_filter.c

Lines changed: 1 addition & 1 deletion
@@ -180,7 +180,7 @@ static long bloom_map_update_elem(struct bpf_map *map, void *key,
 	return -EINVAL;
 }

-static int bloom_map_check_btf(const struct bpf_map *map,
+static int bloom_map_check_btf(struct bpf_map *map,
 			       const struct btf *btf,
 			       const struct btf_type *key_type,
 			       const struct btf_type *value_type)

kernel/bpf/bpf_insn_array.c

Lines changed: 1 addition & 1 deletion
@@ -98,7 +98,7 @@ static long insn_array_delete_elem(struct bpf_map *map, void *key)
 	return -EINVAL;
 }

-static int insn_array_check_btf(const struct bpf_map *map,
+static int insn_array_check_btf(struct bpf_map *map,
 				const struct btf *btf,
 				const struct btf_type *key_type,
 				const struct btf_type *value_type)

kernel/bpf/bpf_local_storage.c

Lines changed: 39 additions & 36 deletions
@@ -107,14 +107,12 @@ static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
 {
 	struct bpf_local_storage *local_storage;

-	/* If RCU Tasks Trace grace period implies RCU grace period, do
-	 * kfree(), else do kfree_rcu().
+	/*
+	 * RCU Tasks Trace grace period implies RCU grace period, do
+	 * kfree() directly.
 	 */
 	local_storage = container_of(rcu, struct bpf_local_storage, rcu);
-	if (rcu_trace_implies_rcu_gp())
-		kfree(local_storage);
-	else
-		kfree_rcu(local_storage, rcu);
+	kfree(local_storage);
 }

 /* Handle use_kmalloc_nolock == false */
@@ -138,10 +136,11 @@ static void bpf_local_storage_free_rcu(struct rcu_head *rcu)

 static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
 {
-	if (rcu_trace_implies_rcu_gp())
-		bpf_local_storage_free_rcu(rcu);
-	else
-		call_rcu(rcu, bpf_local_storage_free_rcu);
+	/*
+	 * RCU Tasks Trace grace period implies RCU grace period, do
+	 * kfree() directly.
+	 */
+	bpf_local_storage_free_rcu(rcu);
 }

 static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
@@ -164,24 +163,37 @@ static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
 			   bpf_local_storage_free_trace_rcu);
 }

-/* rcu tasks trace callback for use_kmalloc_nolock == false */
-static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
+/* rcu callback for use_kmalloc_nolock == false */
+static void __bpf_selem_free_rcu(struct rcu_head *rcu)
 {
 	struct bpf_local_storage_elem *selem;
+	struct bpf_local_storage_map *smap;

 	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
-	if (rcu_trace_implies_rcu_gp())
-		kfree(selem);
-	else
-		kfree_rcu(selem, rcu);
+	/* bpf_selem_unlink_nofail may have already cleared smap and freed fields. */
+	smap = rcu_dereference_check(SDATA(selem)->smap, 1);
+
+	if (smap)
+		bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+	kfree(selem);
+}
+
+/* rcu tasks trace callback for use_kmalloc_nolock == false */
+static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
+{
+	/*
+	 * RCU Tasks Trace grace period implies RCU grace period, do
+	 * kfree() directly.
+	 */
+	__bpf_selem_free_rcu(rcu);
 }

 /* Handle use_kmalloc_nolock == false */
 static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
 			     bool vanilla_rcu)
 {
 	if (vanilla_rcu)
-		kfree_rcu(selem, rcu);
+		call_rcu(&selem->rcu, __bpf_selem_free_rcu);
 	else
 		call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu);
 }
@@ -195,37 +207,29 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
 	/* The bpf_local_storage_map_free will wait for rcu_barrier */
 	smap = rcu_dereference_check(SDATA(selem)->smap, 1);

-	if (smap) {
-		migrate_disable();
+	if (smap)
 		bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
-		migrate_enable();
-	}
 	kfree_nolock(selem);
 }

 static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
 {
-	if (rcu_trace_implies_rcu_gp())
-		bpf_selem_free_rcu(rcu);
-	else
-		call_rcu(rcu, bpf_selem_free_rcu);
+	/*
+	 * RCU Tasks Trace grace period implies RCU grace period, do
+	 * kfree() directly.
+	 */
+	bpf_selem_free_rcu(rcu);
 }

 void bpf_selem_free(struct bpf_local_storage_elem *selem,
 		    bool reuse_now)
 {
-	struct bpf_local_storage_map *smap;
-
-	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
-
 	if (!selem->use_kmalloc_nolock) {
 		/*
 		 * No uptr will be unpin even when reuse_now == false since uptr
 		 * is only supported in task local storage, where
 		 * smap->use_kmalloc_nolock == true.
 		 */
-		if (smap)
-			bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
 		__bpf_selem_free(selem, reuse_now);
 		return;
 	}
@@ -797,7 +801,7 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
 	return 0;
 }

-int bpf_local_storage_map_check_btf(const struct bpf_map *map,
+int bpf_local_storage_map_check_btf(struct bpf_map *map,
 				    const struct btf *btf,
 				    const struct btf_type *key_type,
 				    const struct btf_type *value_type)
@@ -958,10 +962,9 @@ void bpf_local_storage_map_free(struct bpf_map *map,
 	 */
 	synchronize_rcu();

-	if (smap->use_kmalloc_nolock) {
-		rcu_barrier_tasks_trace();
-		rcu_barrier();
-	}
+	/* smap remains in use regardless of kmalloc_nolock, so wait unconditionally. */
+	rcu_barrier_tasks_trace();
+	rcu_barrier();
 	kvfree(smap->buckets);
 	bpf_map_area_free(smap);
 }

kernel/bpf/hashtab.c

Lines changed: 86 additions & 0 deletions
@@ -125,6 +125,11 @@ struct htab_elem {
 	char key[] __aligned(8);
 };

+struct htab_btf_record {
+	struct btf_record *record;
+	u32 key_size;
+};
+
 static inline bool htab_is_prealloc(const struct bpf_htab *htab)
 {
 	return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
@@ -457,6 +462,83 @@ static int htab_map_alloc_check(union bpf_attr *attr)
 	return 0;
 }

+static void htab_mem_dtor(void *obj, void *ctx)
+{
+	struct htab_btf_record *hrec = ctx;
+	struct htab_elem *elem = obj;
+	void *map_value;
+
+	if (IS_ERR_OR_NULL(hrec->record))
+		return;
+
+	map_value = htab_elem_value(elem, hrec->key_size);
+	bpf_obj_free_fields(hrec->record, map_value);
+}
+
+static void htab_pcpu_mem_dtor(void *obj, void *ctx)
+{
+	void __percpu *pptr = *(void __percpu **)obj;
+	struct htab_btf_record *hrec = ctx;
+	int cpu;
+
+	if (IS_ERR_OR_NULL(hrec->record))
+		return;
+
+	for_each_possible_cpu(cpu)
+		bpf_obj_free_fields(hrec->record, per_cpu_ptr(pptr, cpu));
+}
+
+static void htab_dtor_ctx_free(void *ctx)
+{
+	struct htab_btf_record *hrec = ctx;
+
+	btf_record_free(hrec->record);
+	kfree(ctx);
+}
+
+static int htab_set_dtor(struct bpf_htab *htab, void (*dtor)(void *, void *))
+{
+	u32 key_size = htab->map.key_size;
+	struct bpf_mem_alloc *ma;
+	struct htab_btf_record *hrec;
+	int err;
+
+	/* No need for dtors. */
+	if (IS_ERR_OR_NULL(htab->map.record))
+		return 0;
+
+	hrec = kzalloc(sizeof(*hrec), GFP_KERNEL);
+	if (!hrec)
+		return -ENOMEM;
+	hrec->key_size = key_size;
+	hrec->record = btf_record_dup(htab->map.record);
+	if (IS_ERR(hrec->record)) {
+		err = PTR_ERR(hrec->record);
+		kfree(hrec);
+		return err;
+	}
+	ma = htab_is_percpu(htab) ? &htab->pcpu_ma : &htab->ma;
+	bpf_mem_alloc_set_dtor(ma, dtor, htab_dtor_ctx_free, hrec);
+	return 0;
+}
+
+static int htab_map_check_btf(struct bpf_map *map, const struct btf *btf,
+			      const struct btf_type *key_type, const struct btf_type *value_type)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+	if (htab_is_prealloc(htab))
+		return 0;
+	/*
+	 * We must set the dtor using this callback, as map's BTF record is not
+	 * populated in htab_map_alloc(), so it will always appear as NULL.
+	 */
+	if (htab_is_percpu(htab))
+		return htab_set_dtor(htab, htab_pcpu_mem_dtor);
+	else
+		return htab_set_dtor(htab, htab_mem_dtor);
+}
+
 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 {
 	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
@@ -2281,6 +2363,7 @@ const struct bpf_map_ops htab_map_ops = {
 	.map_seq_show_elem = htab_map_seq_show_elem,
 	.map_set_for_each_callback_args = map_set_for_each_callback_args,
 	.map_for_each_callback = bpf_for_each_hash_elem,
+	.map_check_btf = htab_map_check_btf,
 	.map_mem_usage = htab_map_mem_usage,
 	BATCH_OPS(htab),
 	.map_btf_id = &htab_map_btf_ids[0],
@@ -2303,6 +2386,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
 	.map_seq_show_elem = htab_map_seq_show_elem,
 	.map_set_for_each_callback_args = map_set_for_each_callback_args,
 	.map_for_each_callback = bpf_for_each_hash_elem,
+	.map_check_btf = htab_map_check_btf,
 	.map_mem_usage = htab_map_mem_usage,
 	BATCH_OPS(htab_lru),
 	.map_btf_id = &htab_map_btf_ids[0],
@@ -2482,6 +2566,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
 	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
 	.map_set_for_each_callback_args = map_set_for_each_callback_args,
 	.map_for_each_callback = bpf_for_each_hash_elem,
+	.map_check_btf = htab_map_check_btf,
 	.map_mem_usage = htab_map_mem_usage,
 	BATCH_OPS(htab_percpu),
 	.map_btf_id = &htab_map_btf_ids[0],
@@ -2502,6 +2587,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
 	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
 	.map_set_for_each_callback_args = map_set_for_each_callback_args,
 	.map_for_each_callback = bpf_for_each_hash_elem,
+	.map_check_btf = htab_map_check_btf,
 	.map_mem_usage = htab_map_mem_usage,
 	BATCH_OPS(htab_lru_percpu),
 	.map_btf_id = &htab_map_btf_ids[0],

kernel/bpf/local_storage.c

Lines changed: 1 addition & 1 deletion
@@ -364,7 +364,7 @@ static long cgroup_storage_delete_elem(struct bpf_map *map, void *key)
 	return -EINVAL;
 }

-static int cgroup_storage_check_btf(const struct bpf_map *map,
+static int cgroup_storage_check_btf(struct bpf_map *map,
 				    const struct btf *btf,
 				    const struct btf_type *key_type,
 				    const struct btf_type *value_type)
