Skip to content

Commit ed30c4a

Browse files
committed
slab: add optimized sheaf refill from partial list
At this point we have sheaves enabled for all caches, but their refill is done via __kmem_cache_alloc_bulk() which relies on cpu (partial) slabs - now a redundant caching layer that we are about to remove. The refill will thus be done from slabs on the node partial list. Introduce new functions that can do that in an optimized way, as it's easier than modifying the __kmem_cache_alloc_bulk() call chain.

Introduce struct partial_bulk_context, a variant of struct partial_context that can return a list of slabs from the partial list with the sum of free objects in them within the requested min and max.

Introduce get_partial_node_bulk() that removes the slabs from the freelist and returns them in the list. There is a racy read of slab->counters, so make sure the non-atomic write in __update_freelist_slow() is not tearing.

Introduce get_freelist_nofreeze() which grabs the freelist without freezing the slab.

Introduce alloc_from_new_slab() which can allocate multiple objects from a newly allocated slab where we don't need to synchronize with freeing. In some aspects it's similar to alloc_single_from_new_slab() but assumes the cache is a non-debug one, so it can avoid some actions. It supports the allow_spin parameter, which we always set true here, but a followup change will reuse the function in a context where it may be false.

Introduce __refill_objects() that uses the functions above to fill an array of objects. It has to handle the possibility that the slabs will contain more objects than were requested, due to concurrent freeing of objects to those slabs. When no more slabs on partial lists are available, it will allocate new slabs. It is intended to be used only in contexts where spinning is allowed, so add a WARN_ON_ONCE check there.

Finally, switch refill_sheaf() to use __refill_objects(). Sheaves are only refilled from contexts that allow spinning, or even blocking.
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Hao Li <hao.li@linux.dev>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
1 parent 913ffd3 commit ed30c4a

1 file changed

Lines changed: 272 additions & 21 deletions

File tree

mm/slub.c

Lines changed: 272 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,14 @@ struct partial_context {
248248
void *object;
249249
};
250250

251+
/*
 * Structure holding parameters for get_partial_node_bulk().
 *
 * The caller fills in the constraints (flags, min_objects, max_objects);
 * get_partial_node_bulk() returns the removed slabs on the slabs list.
 */
struct partial_bulk_context {
	gfp_t flags;			/* allocation flags; tested via pfmemalloc_match() */
	unsigned int min_objects;	/* minimum number of free objects to gather */
	unsigned int max_objects;	/* soft cap on gathered free objects (may be
					 * exceeded by the last slab taken) */
	struct list_head slabs;		/* out: slabs removed from the partial list */
};
258+
251259
static inline bool kmem_cache_debug(struct kmem_cache *s)
252260
{
253261
return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
@@ -779,7 +787,8 @@ __update_freelist_slow(struct slab *slab, struct freelist_counters *old,
779787
if (slab->freelist == old->freelist &&
780788
slab->counters == old->counters) {
781789
slab->freelist = new->freelist;
782-
slab->counters = new->counters;
790+
/* prevent tearing for the read in get_partial_node_bulk() */
791+
WRITE_ONCE(slab->counters, new->counters);
783792
ret = true;
784793
}
785794
slab_unlock(slab);
@@ -2638,9 +2647,9 @@ static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf)
26382647
stat(s, SHEAF_FREE);
26392648
}
26402649

2641-
static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
2642-
size_t size, void **p);
2643-
2650+
static unsigned int
2651+
__refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
2652+
unsigned int max);
26442653

26452654
static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
26462655
gfp_t gfp)
@@ -2651,8 +2660,8 @@ static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
26512660
if (!to_fill)
26522661
return 0;
26532662

2654-
filled = __kmem_cache_alloc_bulk(s, gfp, to_fill,
2655-
&sheaf->objects[sheaf->size]);
2663+
filled = __refill_objects(s, &sheaf->objects[sheaf->size], gfp,
2664+
to_fill, to_fill);
26562665

26572666
sheaf->size += filled;
26582667

@@ -3518,6 +3527,57 @@ static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab,
35183527
#endif
35193528
static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);
35203529

3530+
/*
 * Bulk-remove slabs from the partial list of node @n.
 *
 * Slabs are taken off the partial list until the sum of their free object
 * counts reaches pc->max_objects, or until taking the next slab would exceed
 * pc->max_objects while pc->min_objects has already been gathered. The
 * removed slabs are returned on pc->slabs.
 *
 * The per-slab free counts are read racily (see comment below), so the total
 * is only a lower bound at the time of reading.
 *
 * Returns true if at least one free object was gathered.
 */
static bool get_partial_node_bulk(struct kmem_cache *s,
				  struct kmem_cache_node *n,
				  struct partial_bulk_context *pc)
{
	struct slab *slab, *slab2;
	unsigned int total_free = 0;
	unsigned long flags;

	/* Racy check to avoid taking the lock unnecessarily. */
	if (!n || data_race(!n->nr_partial))
		return false;

	INIT_LIST_HEAD(&pc->slabs);

	spin_lock_irqsave(&n->list_lock, flags);

	list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
		struct freelist_counters flc;
		unsigned int slab_free;

		/* skip pfmemalloc slabs unless the gfp flags allow them */
		if (!pfmemalloc_match(slab, pc->flags))
			continue;

		/*
		 * determine the number of free objects in the slab racily
		 *
		 * slab_free is a lower bound due to possible subsequent
		 * concurrent freeing, so the caller may get more objects than
		 * requested and must handle that
		 */
		flc.counters = data_race(READ_ONCE(slab->counters));
		slab_free = flc.objects - flc.inuse;

		/* we have already min and this would get us over the max */
		if (total_free >= pc->min_objects
		    && total_free + slab_free > pc->max_objects)
			break;

		remove_partial(n, slab);

		list_add(&slab->slab_list, &pc->slabs);

		total_free += slab_free;
		if (total_free >= pc->max_objects)
			break;
	}

	spin_unlock_irqrestore(&n->list_lock, flags);
	return total_free > 0;
}
3580+
35213581
/*
35223582
* Try to allocate a partial slab from a specific node.
35233583
*/
@@ -4444,6 +4504,33 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
44444504
return old.freelist;
44454505
}
44464506

4507+
/*
 * Get the slab's freelist and do not freeze it.
 *
 * Assumes the slab is isolated from node partial list and not frozen.
 *
 * Assumes this is performed only for caches without debugging so we
 * don't need to worry about adding the slab to the full list.
 *
 * Returns the detached freelist; all objects on it become accounted as
 * in use (new.inuse is set to the total object count).
 */
static inline void *get_freelist_nofreeze(struct kmem_cache *s, struct slab *slab)
{
	struct freelist_counters old, new;

	/* retry the cmpxchg-style update until it succeeds unraced */
	do {
		old.freelist = slab->freelist;
		old.counters = slab->counters;

		new.freelist = NULL;
		new.counters = old.counters;
		/* caller must guarantee the slab is not frozen */
		VM_WARN_ON_ONCE(new.frozen);

		/* mark every object in the slab as in use */
		new.inuse = old.objects;

	} while (!slab_update_freelist(s, slab, &old, &new, "get_freelist_nofreeze"));

	return old.freelist;
}
4533+
44474534
/*
44484535
* Freeze the partial slab and return the pointer to the freelist.
44494536
*/
@@ -4467,6 +4554,72 @@ static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab)
44674554
return old.freelist;
44684555
}
44694556

4557+
/*
4558+
* If the object has been wiped upon free, make sure it's fully initialized by
4559+
* zeroing out freelist pointer.
4560+
*
4561+
* Note that we also wipe custom freelist pointers.
4562+
*/
4563+
static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
4564+
void *obj)
4565+
{
4566+
if (unlikely(slab_want_init_on_free(s)) && obj &&
4567+
!freeptr_outside_object(s))
4568+
memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
4569+
0, sizeof(void *));
4570+
}
4571+
4572+
/*
 * Allocate up to @count objects from the newly allocated @slab into @p.
 *
 * Because the slab is new, its freelist can be walked without synchronizing
 * against concurrent freeing. Assumes a non-debug cache.
 *
 * If the slab holds more objects than requested, the remainder stays on its
 * freelist and the slab is added to the node partial list. With @allow_spin
 * false, the node's list_lock is only trylocked up front; if that fails the
 * slab is handed to deferred deactivation and 0 is returned.
 *
 * Returns the number of objects stored into @p.
 */
static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab,
		void **p, unsigned int count, bool allow_spin)
{
	unsigned int allocated = 0;
	struct kmem_cache_node *n;
	bool needs_add_partial;
	unsigned long flags;
	void *object;

	/*
	 * Are we going to put the slab on the partial list?
	 * Note slab->inuse is 0 on a new slab.
	 */
	needs_add_partial = (slab->objects > count);

	/*
	 * When we may not spin, take the lock before touching the freelist so
	 * a trylock failure leaves the slab untouched for deferred handling.
	 */
	if (!allow_spin && needs_add_partial) {

		n = get_node(s, slab_nid(slab));

		if (!spin_trylock_irqsave(&n->list_lock, flags)) {
			/* Unlucky, discard newly allocated slab */
			defer_deactivate_slab(slab, NULL);
			return 0;
		}
	}

	/* grab objects from the freelist; no other CPU can race with us here */
	object = slab->freelist;
	while (object && allocated < count) {
		p[allocated] = object;
		object = get_freepointer(s, object);
		maybe_wipe_obj_freeptr(s, p[allocated]);

		slab->inuse++;
		allocated++;
	}
	slab->freelist = object;

	if (needs_add_partial) {

		/* in the allow_spin case the lock was not taken above */
		if (allow_spin) {
			n = get_node(s, slab_nid(slab));
			spin_lock_irqsave(&n->list_lock, flags);
		}
		add_partial(n, slab, DEACTIVATE_TO_HEAD);
		spin_unlock_irqrestore(&n->list_lock, flags);
	}

	inc_slabs_node(s, slab_nid(slab), slab->objects);
	return allocated;
}
4622+
44704623
/*
44714624
* Slow path. The lockless freelist is empty or we need to perform
44724625
* debugging duties.
@@ -4909,21 +5062,6 @@ static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
49095062
return object;
49105063
}
49115064

4912-
/*
4913-
* If the object has been wiped upon free, make sure it's fully initialized by
4914-
* zeroing out freelist pointer.
4915-
*
4916-
* Note that we also wipe custom freelist pointers.
4917-
*/
4918-
static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
4919-
void *obj)
4920-
{
4921-
if (unlikely(slab_want_init_on_free(s)) && obj &&
4922-
!freeptr_outside_object(s))
4923-
memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
4924-
0, sizeof(void *));
4925-
}
4926-
49275065
static __fastpath_inline
49285066
struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
49295067
{
@@ -5384,6 +5522,9 @@ static int __prefill_sheaf_pfmemalloc(struct kmem_cache *s,
53845522
return ret;
53855523
}
53865524

5525+
static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
5526+
size_t size, void **p);
5527+
53875528
/*
53885529
* returns a sheaf that has at least the requested size
53895530
* when prefilling is needed, do so with given gfp flags
@@ -7497,6 +7638,116 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
74977638
}
74987639
EXPORT_SYMBOL(kmem_cache_free_bulk);
74997640

7641+
/*
 * Fill the array @p with at least @min and at most @max objects.
 *
 * Objects are taken from slabs bulk-removed from the local node's partial
 * list. Since slab free counts are read racily in get_partial_node_bulk(),
 * a grabbed freelist may hold more objects than fit into @p; the surplus is
 * freed back as a detached freelist. Leftover slabs are returned to the
 * partial list, except completely free ones when the node already has at
 * least s->min_partial partial slabs - those are discarded. If @min was not
 * reached from partial slabs, new slabs are allocated.
 *
 * Must be called from a context that allows spinning (WARN_ON_ONCE
 * otherwise).
 *
 * Returns the number of objects placed into @p (can be 0).
 */
static unsigned int
__refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
		 unsigned int max)
{
	struct partial_bulk_context pc;
	struct slab *slab, *slab2;
	unsigned int refilled = 0;
	unsigned long flags;
	void *object;
	int node;

	pc.flags = gfp;
	pc.min_objects = min;
	pc.max_objects = max;

	node = numa_mem_id();

	if (WARN_ON_ONCE(!gfpflags_allow_spinning(gfp)))
		return 0;

	/* TODO: consider also other nodes? */
	if (!get_partial_node_bulk(s, get_node(s, node), &pc))
		goto new_slab;

	list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {

		list_del(&slab->slab_list);

		object = get_freelist_nofreeze(s, slab);

		/* copy objects into @p until the freelist or capacity runs out */
		while (object && refilled < max) {
			p[refilled] = object;
			object = get_freepointer(s, object);
			maybe_wipe_obj_freeptr(s, p[refilled]);

			refilled++;
		}

		/*
		 * Freelist had more objects than we can accommodate, we need to
		 * free them back. We can treat it like a detached freelist, just
		 * need to find the tail object.
		 */
		if (unlikely(object)) {
			void *head = object;
			void *tail;
			int cnt = 0;

			do {
				tail = object;
				cnt++;
				object = get_freepointer(s, object);
			} while (object);

			do_slab_free(s, slab, head, tail, cnt, _RET_IP_);
		}

		if (refilled >= max)
			break;
	}

	/* slabs we removed but did not drain need to go back (or be discarded) */
	if (unlikely(!list_empty(&pc.slabs))) {
		struct kmem_cache_node *n = get_node(s, node);

		spin_lock_irqsave(&n->list_lock, flags);

		list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {

			/*
			 * Leave fully free slabs on pc.slabs for discarding
			 * below when the partial list is already long enough.
			 */
			if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial))
				continue;

			list_del(&slab->slab_list);
			add_partial(n, slab, DEACTIVATE_TO_HEAD);
		}

		spin_unlock_irqrestore(&n->list_lock, flags);

		/* any slabs left are completely free and for discard */
		list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {

			list_del(&slab->slab_list);
			discard_slab(s, slab);
		}
	}

	if (likely(refilled >= min))
		goto out;

new_slab:

	slab = new_slab(s, pc.flags, node);
	if (!slab)
		goto out;

	stat(s, ALLOC_SLAB);

	/*
	 * TODO: possible optimization - if we know we will consume the whole
	 * slab we might skip creating the freelist?
	 */
	refilled += alloc_from_new_slab(s, slab, p + refilled, max - refilled,
					/* allow_spin = */ true);

	if (refilled < min)
		goto new_slab;
out:

	return refilled;
}
7750+
75007751
static inline
75017752
int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
75027753
void **p)

0 commit comments

Comments
 (0)