Skip to content

Commit 5d800f8

Browse files
ameryhung authored and Martin KaFai Lau committed
bpf: Support lockless unlink when freeing map or local storage
Introduce bpf_selem_unlink_nofail() to properly handle errors returned from rqspinlock in bpf_local_storage_map_free() and bpf_local_storage_destroy() where the operation must succeeds. The idea of bpf_selem_unlink_nofail() is to allow an selem to be partially linked and use atomic operation on a bit field, selem->state, to determine when and who can free the selem if any unlink under lock fails. An selem initially is fully linked to a map and a local storage. Under normal circumstances, bpf_selem_unlink_nofail() will be able to grab locks and unlink a selem from map and local storage in sequeunce, just like bpf_selem_unlink(), and then free it after an RCU grace period. However, if any of the lock attempts fails, it will only clear SDATA(selem)->smap or selem->local_storage depending on the caller and set SELEM_MAP_UNLINKED or SELEM_STORAGE_UNLINKED according to the caller. Then, after both map_free() and destroy() see the selem and the state becomes SELEM_UNLINKED, one of two racing caller can succeed in cmpxchg the state from SELEM_UNLINKED to SELEM_TOFREE, ensuring no double free or memory leak. To make sure bpf_obj_free_fields() is done only once and when map is still present, it is called when unlinking an selem from b->list under b->lock. To make sure uncharging memory is done only when the owner is still present in map_free(), block destroy() from returning until there is no pending map_free(). Since smap may not be valid in destroy(), bpf_selem_unlink_nofail() skips bpf_selem_unlink_storage_nolock_misc() when called from destroy(). This is okay as bpf_local_storage_destroy() will return the remaining amount of memory charge tracked by mem_charge to the owner to uncharge. It is also safe to skip clearing local_storage->owner and owner_storage as the owner is being freed and no users or bpf programs should be able to reference the owner and using local_storage. Finally, access of selem, SDATA(selem)->smap and selem->local_storage are racy. 
Callers will protect these fields with RCU. Acked-by: Alexei Starovoitov <ast@kernel.org> Co-developed-by: Martin KaFai Lau <martin.lau@kernel.org> Signed-off-by: Amery Hung <ameryhung@gmail.com> Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org> Link: https://patch.msgid.link/20260205222916.1788211-11-ameryhung@gmail.com
1 parent c8be3da commit 5d800f8

2 files changed

Lines changed: 118 additions & 7 deletions

File tree

include/linux/bpf_local_storage.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,11 @@ struct bpf_local_storage_data {
6868
u8 data[] __aligned(8);
6969
};
7070

71+
#define SELEM_MAP_UNLINKED (1 << 0)
72+
#define SELEM_STORAGE_UNLINKED (1 << 1)
73+
#define SELEM_UNLINKED (SELEM_MAP_UNLINKED | SELEM_STORAGE_UNLINKED)
74+
#define SELEM_TOFREE (1 << 2)
75+
7176
/* Linked to bpf_local_storage and bpf_local_storage_map */
7277
struct bpf_local_storage_elem {
7378
struct hlist_node map_node; /* Linked to bpf_local_storage_map */
@@ -80,8 +85,9 @@ struct bpf_local_storage_elem {
8085
* after raw_spin_unlock
8186
*/
8287
};
88+
atomic_t state;
8389
bool use_kmalloc_nolock;
84-
/* 7 bytes hole */
90+
/* 3 bytes hole */
8591
/* The data is stored in another cacheline to minimize
8692
* the number of cachelines access during a cache hit.
8793
*/
@@ -97,6 +103,7 @@ struct bpf_local_storage {
97103
struct rcu_head rcu;
98104
rqspinlock_t lock; /* Protect adding/removing from the "list" */
99105
u64 mem_charge; /* Copy of mem charged to owner. Protected by "lock" */
106+
refcount_t owner_refcnt;/* Used to pin owner when map_free is uncharging */
100107
bool use_kmalloc_nolock;
101108
};
102109

kernel/bpf/bpf_local_storage.c

Lines changed: 110 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
8585

8686
if (selem) {
8787
RCU_INIT_POINTER(SDATA(selem)->smap, smap);
88+
atomic_set(&selem->state, 0);
8889
selem->use_kmalloc_nolock = smap->use_kmalloc_nolock;
8990

9091
if (value) {
@@ -194,9 +195,11 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
194195
/* The bpf_local_storage_map_free will wait for rcu_barrier */
195196
smap = rcu_dereference_check(SDATA(selem)->smap, 1);
196197

197-
migrate_disable();
198-
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
199-
migrate_enable();
198+
if (smap) {
199+
migrate_disable();
200+
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
201+
migrate_enable();
202+
}
200203
kfree_nolock(selem);
201204
}
202205

@@ -221,7 +224,8 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
221224
* is only supported in task local storage, where
222225
* smap->use_kmalloc_nolock == true.
223226
*/
224-
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
227+
if (smap)
228+
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
225229
__bpf_selem_free(selem, reuse_now);
226230
return;
227231
}
@@ -255,7 +259,7 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
255259
static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem *selem,
256260
struct bpf_local_storage_map *smap,
257261
struct bpf_local_storage *local_storage,
258-
bool free_local_storage)
262+
bool free_local_storage, bool pin_owner)
259263
{
260264
void *owner = local_storage->owner;
261265
u32 uncharge = smap->elem_size;
@@ -264,6 +268,9 @@ static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem *
264268
SDATA(selem))
265269
RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
266270

271+
if (pin_owner && !refcount_inc_not_zero(&local_storage->owner_refcnt))
272+
return;
273+
267274
uncharge += free_local_storage ? sizeof(*local_storage) : 0;
268275
mem_uncharge(smap, local_storage->owner, uncharge);
269276
local_storage->mem_charge -= uncharge;
@@ -274,6 +281,9 @@ static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem *
274281
/* After this RCU_INIT, owner may be freed and cannot be used */
275282
RCU_INIT_POINTER(*owner_storage(smap, owner), NULL);
276283
}
284+
285+
if (pin_owner)
286+
refcount_dec(&local_storage->owner_refcnt);
277287
}
278288

279289
/* local_storage->lock must be held and selem->local_storage == local_storage.
@@ -293,7 +303,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
293303
&local_storage->list);
294304

295305
bpf_selem_unlink_storage_nolock_misc(selem, smap, local_storage,
296-
free_local_storage);
306+
free_local_storage, false);
297307

298308
hlist_del_init_rcu(&selem->snode);
299309

@@ -409,6 +419,94 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now)
409419
return err;
410420
}
411421

422+
/*
423+
* Unlink an selem from map and local storage with lockless fallback if callers
424+
* are racing or rqspinlock returns error. It should only be called by
425+
* bpf_local_storage_destroy() or bpf_local_storage_map_free().
426+
*/
427+
static void bpf_selem_unlink_nofail(struct bpf_local_storage_elem *selem,
428+
struct bpf_local_storage_map_bucket *b)
429+
{
430+
bool in_map_free = !!b, free_storage = false;
431+
struct bpf_local_storage *local_storage;
432+
struct bpf_local_storage_map *smap;
433+
unsigned long flags;
434+
int err, unlink = 0;
435+
436+
local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held());
437+
smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
438+
439+
if (smap) {
440+
b = b ? : select_bucket(smap, local_storage);
441+
err = raw_res_spin_lock_irqsave(&b->lock, flags);
442+
if (!err) {
443+
/*
444+
* Call bpf_obj_free_fields() under b->lock to make sure it is done
445+
* exactly once for an selem. Safe to free special fields immediately
446+
* as no BPF program should be referencing the selem.
447+
*/
448+
if (likely(selem_linked_to_map(selem))) {
449+
hlist_del_init_rcu(&selem->map_node);
450+
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
451+
unlink++;
452+
}
453+
raw_res_spin_unlock_irqrestore(&b->lock, flags);
454+
}
455+
/*
456+
* Highly unlikely scenario: resource leak
457+
*
458+
* When map_free(selem1), destroy(selem1) and destroy(selem2) are racing
459+
* and both selem belong to the same bucket, if destroy(selem2) acquired
460+
* b->lock and block for too long, neither map_free(selem1) and
461+
* destroy(selem1) will be able to free the special field associated
462+
* with selem1 as raw_res_spin_lock_irqsave() returns -ETIMEDOUT.
463+
*/
464+
WARN_ON_ONCE(err && in_map_free);
465+
if (!err || in_map_free)
466+
RCU_INIT_POINTER(SDATA(selem)->smap, NULL);
467+
}
468+
469+
if (local_storage) {
470+
err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
471+
if (!err) {
472+
if (likely(selem_linked_to_storage(selem))) {
473+
free_storage = hlist_is_singular_node(&selem->snode,
474+
&local_storage->list);
475+
/*
476+
* Okay to skip clearing owner_storage and storage->owner in
477+
* destroy() since the owner is going away. No user or bpf
478+
* programs should be able to reference it.
479+
*/
480+
if (smap && in_map_free)
481+
bpf_selem_unlink_storage_nolock_misc(
482+
selem, smap, local_storage,
483+
free_storage, true);
484+
hlist_del_init_rcu(&selem->snode);
485+
unlink++;
486+
}
487+
raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
488+
}
489+
if (!err || !in_map_free)
490+
RCU_INIT_POINTER(selem->local_storage, NULL);
491+
}
492+
493+
if (unlink != 2)
494+
atomic_or(in_map_free ? SELEM_MAP_UNLINKED : SELEM_STORAGE_UNLINKED, &selem->state);
495+
496+
/*
497+
* Normally, an selem can be unlinked under local_storage->lock and b->lock, and
498+
* then freed after an RCU grace period. However, if destroy() and map_free() are
499+
* racing or rqspinlock returns errors in unlikely situations (unlink != 2), free
500+
* the selem only after both map_free() and destroy() see the selem.
501+
*/
502+
if (unlink == 2 ||
503+
atomic_cmpxchg(&selem->state, SELEM_UNLINKED, SELEM_TOFREE) == SELEM_UNLINKED)
504+
bpf_selem_free(selem, true);
505+
506+
if (free_storage)
507+
bpf_local_storage_free(local_storage, true);
508+
}
509+
412510
void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage,
413511
struct bpf_local_storage_map *smap,
414512
struct bpf_local_storage_elem *selem)
@@ -475,6 +573,7 @@ int bpf_local_storage_alloc(void *owner,
475573
storage->owner = owner;
476574
storage->mem_charge = sizeof(*storage);
477575
storage->use_kmalloc_nolock = smap->use_kmalloc_nolock;
576+
refcount_set(&storage->owner_refcnt, 1);
478577

479578
bpf_selem_link_storage_nolock(storage, first_selem);
480579

@@ -743,6 +842,11 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
743842

744843
if (free_storage)
745844
bpf_local_storage_free(local_storage, true);
845+
846+
if (!refcount_dec_and_test(&local_storage->owner_refcnt)) {
847+
while (refcount_read(&local_storage->owner_refcnt))
848+
cpu_relax();
849+
}
746850
}
747851

748852
u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)

0 commit comments

Comments
 (0)