@@ -248,6 +248,14 @@ struct partial_context {
 	void *object;
 };
 
+/* Structure holding parameters for get_partial_node_bulk() */
+struct partial_bulk_context {
+	gfp_t flags;
+	unsigned int min_objects;
+	unsigned int max_objects;
+	struct list_head slabs;
+};
+
 static inline bool kmem_cache_debug(struct kmem_cache *s)
 {
 	return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
@@ -779,7 +787,8 @@ __update_freelist_slow(struct slab *slab, struct freelist_counters *old,
 	if (slab->freelist == old->freelist &&
 	    slab->counters == old->counters) {
 		slab->freelist = new->freelist;
-		slab->counters = new->counters;
+		/* prevent tearing for the read in get_partial_node_bulk() */
+		WRITE_ONCE(slab->counters, new->counters);
 		ret = true;
 	}
 	slab_unlock(slab);
@@ -2638,9 +2647,9 @@ static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf)
 	stat(s, SHEAF_FREE);
 }
 
-static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
-				   size_t size, void **p);
-
+static unsigned int
+__refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
+		 unsigned int max);
 
 static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
 			gfp_t gfp)
@@ -2651,8 +2660,8 @@ static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
 	if (!to_fill)
 		return 0;
 
-	filled = __kmem_cache_alloc_bulk(s, gfp, to_fill,
-					 &sheaf->objects[sheaf->size]);
+	filled = __refill_objects(s, &sheaf->objects[sheaf->size], gfp,
+				  to_fill, to_fill);
 
 	sheaf->size += filled;
 
@@ -3518,6 +3527,57 @@ static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab,
 #endif
 static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);
 
+static bool get_partial_node_bulk(struct kmem_cache *s,
+				  struct kmem_cache_node *n,
+				  struct partial_bulk_context *pc)
+{
+	struct slab *slab, *slab2;
+	unsigned int total_free = 0;
+	unsigned long flags;
+
+	/* Racy check to avoid taking the lock unnecessarily. */
+	if (!n || data_race(!n->nr_partial))
+		return false;
+
+	INIT_LIST_HEAD(&pc->slabs);
+
+	spin_lock_irqsave(&n->list_lock, flags);
+
+	list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
+		struct freelist_counters flc;
+		unsigned int slab_free;
+
+		if (!pfmemalloc_match(slab, pc->flags))
+			continue;
+
+		/*
+		 * determine the number of free objects in the slab racily
+		 *
+		 * slab_free is a lower bound due to possible subsequent
+		 * concurrent freeing, so the caller may get more objects than
+		 * requested and must handle that
+		 */
+		flc.counters = data_race(READ_ONCE(slab->counters));
+		slab_free = flc.objects - flc.inuse;
+
+		/* we already have min and this would get us over the max */
+		if (total_free >= pc->min_objects &&
+		    total_free + slab_free > pc->max_objects)
+			break;
+
+		remove_partial(n, slab);
+
+		list_add(&slab->slab_list, &pc->slabs);
+
+		total_free += slab_free;
+		if (total_free >= pc->max_objects)
+			break;
+	}
+
+	spin_unlock_irqrestore(&n->list_lock, flags);
+	return total_free > 0;
+}
+
 /*
  * Try to allocate a partial slab from a specific node.
  */
@@ -4444,6 +4504,33 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
 	return old.freelist;
 }
 
+/*
+ * Get the slab's freelist and do not freeze it.
+ *
+ * Assumes the slab is isolated from the node partial list and not frozen.
+ *
+ * Assumes this is performed only for caches without debugging so we
+ * don't need to worry about adding the slab to the full list.
+ */
+static inline void *get_freelist_nofreeze(struct kmem_cache *s, struct slab *slab)
+{
+	struct freelist_counters old, new;
+
+	do {
+		old.freelist = slab->freelist;
+		old.counters = slab->counters;
+
+		new.freelist = NULL;
+		new.counters = old.counters;
+		VM_WARN_ON_ONCE(new.frozen);
+
+		new.inuse = old.objects;
+
+	} while (!slab_update_freelist(s, slab, &old, &new, "get_freelist_nofreeze"));
+
+	return old.freelist;
+}
+
 /*
  * Freeze the partial slab and return the pointer to the freelist.
  */
@@ -4467,6 +4554,72 @@ static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab)
 	return old.freelist;
 }
 
+/*
+ * If the object has been wiped upon free, make sure it's fully initialized by
+ * zeroing out freelist pointer.
+ *
+ * Note that we also wipe custom freelist pointers.
+ */
+static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
+						   void *obj)
+{
+	if (unlikely(slab_want_init_on_free(s)) && obj &&
+	    !freeptr_outside_object(s))
+		memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
+		       0, sizeof(void *));
+}
+
+static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab,
+					void **p, unsigned int count, bool allow_spin)
+{
+	unsigned int allocated = 0;
+	struct kmem_cache_node *n;
+	bool needs_add_partial;
+	unsigned long flags;
+	void *object;
+
+	/*
+	 * Are we going to put the slab on the partial list?
+	 * Note slab->inuse is 0 on a new slab.
+	 */
+	needs_add_partial = (slab->objects > count);
+
+	if (!allow_spin && needs_add_partial) {
+
+		n = get_node(s, slab_nid(slab));
+
+		if (!spin_trylock_irqsave(&n->list_lock, flags)) {
+			/* Unlucky, discard newly allocated slab */
+			defer_deactivate_slab(slab, NULL);
+			return 0;
+		}
+	}
+
+	object = slab->freelist;
+	while (object && allocated < count) {
+		p[allocated] = object;
+		object = get_freepointer(s, object);
+		maybe_wipe_obj_freeptr(s, p[allocated]);
+
+		slab->inuse++;
+		allocated++;
+	}
+	slab->freelist = object;
+
+	if (needs_add_partial) {
+
+		if (allow_spin) {
+			n = get_node(s, slab_nid(slab));
+			spin_lock_irqsave(&n->list_lock, flags);
+		}
+		add_partial(n, slab, DEACTIVATE_TO_HEAD);
+		spin_unlock_irqrestore(&n->list_lock, flags);
+	}
+
+	inc_slabs_node(s, slab_nid(slab), slab->objects);
+	return allocated;
+}
+
 /*
  * Slow path. The lockless freelist is empty or we need to perform
  * debugging duties.
@@ -4909,21 +5062,6 @@ static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
 	return object;
 }
 
-/*
- * If the object has been wiped upon free, make sure it's fully initialized by
- * zeroing out freelist pointer.
- *
- * Note that we also wipe custom freelist pointers.
- */
-static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
-						   void *obj)
-{
-	if (unlikely(slab_want_init_on_free(s)) && obj &&
-	    !freeptr_outside_object(s))
-		memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
-		       0, sizeof(void *));
-}
-
 static __fastpath_inline
 struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
 {
@@ -5384,6 +5522,9 @@ static int __prefill_sheaf_pfmemalloc(struct kmem_cache *s,
 	return ret;
 }
 
+static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
+				   size_t size, void **p);
+
 /*
  * returns a sheaf that has at least the requested size
  * when prefilling is needed, do so with given gfp flags
@@ -7497,6 +7638,116 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
 }
 EXPORT_SYMBOL(kmem_cache_free_bulk);
 
+static unsigned int
+__refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
+		 unsigned int max)
+{
+	struct partial_bulk_context pc;
+	struct slab *slab, *slab2;
+	unsigned int refilled = 0;
+	unsigned long flags;
+	void *object;
+	int node;
+
+	pc.flags = gfp;
+	pc.min_objects = min;
+	pc.max_objects = max;
+
+	node = numa_mem_id();
+
+	if (WARN_ON_ONCE(!gfpflags_allow_spinning(gfp)))
+		return 0;
+
+	/* TODO: consider also other nodes? */
+	if (!get_partial_node_bulk(s, get_node(s, node), &pc))
+		goto new_slab;
+
+	list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
+
+		list_del(&slab->slab_list);
+
+		object = get_freelist_nofreeze(s, slab);
+
+		while (object && refilled < max) {
+			p[refilled] = object;
+			object = get_freepointer(s, object);
+			maybe_wipe_obj_freeptr(s, p[refilled]);
+
+			refilled++;
+		}
+
+		/*
+		 * The freelist had more objects than we can accommodate, so we
+		 * need to free them back. We can treat it like a detached
+		 * freelist, we just need to find the tail object.
+		 */
+		if (unlikely(object)) {
+			void *head = object;
+			void *tail;
+			int cnt = 0;
+
+			do {
+				tail = object;
+				cnt++;
+				object = get_freepointer(s, object);
+			} while (object);
+			do_slab_free(s, slab, head, tail, cnt, _RET_IP_);
+		}
+
+		if (refilled >= max)
+			break;
+	}
+
+	if (unlikely(!list_empty(&pc.slabs))) {
+		struct kmem_cache_node *n = get_node(s, node);
+
+		spin_lock_irqsave(&n->list_lock, flags);
+
+		list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
+
+			if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial))
+				continue;
+
+			list_del(&slab->slab_list);
+			add_partial(n, slab, DEACTIVATE_TO_HEAD);
+		}
+
+		spin_unlock_irqrestore(&n->list_lock, flags);
+
+		/* any slabs left are completely free and for discard */
+		list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
+
+			list_del(&slab->slab_list);
+			discard_slab(s, slab);
+		}
+	}
+
+
+	if (likely(refilled >= min))
+		goto out;
+
+new_slab:
+
+	slab = new_slab(s, pc.flags, node);
+	if (!slab)
+		goto out;
+
+	stat(s, ALLOC_SLAB);
+
+	/*
+	 * TODO: possible optimization - if we know we will consume the whole
+	 * slab we might skip creating the freelist?
+	 */
+	refilled += alloc_from_new_slab(s, slab, p + refilled, max - refilled,
+					/* allow_spin = */ true);
+
+	if (refilled < min)
+		goto new_slab;
+out:
+
+	return refilled;
+}
+
 static inline
 int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 			    void **p)
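
A minimal caller-side sketch (not part of the patch) of the min/max contract introduced by __refill_objects(): the caller supplies a buffer with room for up to max objects, asks for at least min, and must cope both with a short return (allocation failed part-way) and with getting anywhere between min and max objects on success. The refill_example() helper, its buf/want/capacity parameters and the GFP_KERNEL flag are hypothetical illustration only; __refill_objects() itself is the function added to mm/slub.c above.

/* Sketch only: hypothetical caller of __refill_objects() */
static unsigned int refill_example(struct kmem_cache *s, void **buf,
				   unsigned int want, unsigned int capacity)
{
	unsigned int got;

	/* need at least "want" objects, but allow filling the whole buffer */
	got = __refill_objects(s, buf, GFP_KERNEL, want, capacity);

	if (got < want) {
		/* out of memory part-way; the caller decides how to back off */
	}

	return got;
}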