Skip to content

Commit 0f35040

Browse files
hygoni authored and tehcaster committed
mm/slab: introduce kvfree_rcu_barrier_on_cache() for cache destruction
Currently, kvfree_rcu_barrier() flushes RCU sheaves across all slab caches when a cache is destroyed. This is unnecessary; only the RCU sheaves belonging to the cache being destroyed need to be flushed. As suggested by Vlastimil Babka, introduce a weaker form of kvfree_rcu_barrier() that operates on a specific slab cache. Factor out flush_rcu_sheaves_on_cache() from flush_all_rcu_sheaves() and call it from flush_all_rcu_sheaves() and kvfree_rcu_barrier_on_cache(). Call kvfree_rcu_barrier_on_cache() instead of kvfree_rcu_barrier() on cache destruction. The performance benefit is evaluated on a 12 core 24 threads AMD Ryzen 5900X machine (1 socket), by loading slub_kunit module. Before: Total calls: 19 Average latency (us): 18127 Total time (us): 344414 After: Total calls: 19 Average latency (us): 10066 Total time (us): 191264 Two performance regressions have been reported: - stress module loader test's runtime increases by 50-60% (Daniel) - internal graphics test's runtime on Tegra234 increases by 35% (Jon) They are fixed by this change. Suggested-by: Vlastimil Babka <vbabka@suse.cz> Fixes: ec66e0d ("slab: add sheaf support for batching kfree_rcu() operations") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/linux-mm/1bda09da-93be-4737-aef0-d47f8c5c9301@suse.cz Reported-and-tested-by: Daniel Gomez <da.gomez@samsung.com> Closes: https://lore.kernel.org/linux-mm/0406562e-2066-4cf8-9902-b2b0616dd742@kernel.org Reported-and-tested-by: Jon Hunter <jonathanh@nvidia.com> Closes: https://lore.kernel.org/linux-mm/e988eff6-1287-425e-a06c-805af5bbf262@nvidia.com Signed-off-by: Harry Yoo <harry.yoo@oracle.com> Link: https://patch.msgid.link/20251207154148.117723-1-harry.yoo@oracle.com Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
1 parent b687034 commit 0f35040

4 files changed

Lines changed: 75 additions & 40 deletions

File tree

include/linux/slab.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1150,10 +1150,17 @@ static inline void kvfree_rcu_barrier(void)
11501150
rcu_barrier();
11511151
}
11521152

1153+
/*
 * !CONFIG_KVFREE_RCU_BATCHED stub: without batching there are no per-cache
 * RCU sheaves or queued batches to drain, so waiting for one RCU grace
 * period is sufficient for all in-flight kvfree_rcu() calls on @s.
 */
static inline void kvfree_rcu_barrier_on_cache(struct kmem_cache *s)
{
	rcu_barrier();
}
1157+
11531158
static inline void kfree_rcu_scheduler_running(void) { }
11541159
#else
11551160
void kvfree_rcu_barrier(void);
11561161

1162+
void kvfree_rcu_barrier_on_cache(struct kmem_cache *s);
1163+
11571164
void kfree_rcu_scheduler_running(void);
11581165
#endif
11591166

mm/slab.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -422,6 +422,7 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s)
422422

423423
bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj);
424424
void flush_all_rcu_sheaves(void);
425+
void flush_rcu_sheaves_on_cache(struct kmem_cache *s);
425426

426427
#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
427428
SLAB_CACHE_DMA32 | SLAB_PANIC | \

mm/slab_common.c

Lines changed: 37 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -492,7 +492,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
492492
return;
493493

494494
/* in-flight kfree_rcu()'s may include objects from our cache */
495-
kvfree_rcu_barrier();
495+
kvfree_rcu_barrier_on_cache(s);
496496

497497
if (IS_ENABLED(CONFIG_SLUB_RCU_DEBUG) &&
498498
(s->flags & SLAB_TYPESAFE_BY_RCU)) {
@@ -2038,25 +2038,13 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr)
20382038
}
20392039
EXPORT_SYMBOL_GPL(kvfree_call_rcu);
20402040

2041-
/**
2042-
* kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
2043-
*
2044-
* Note that a single argument of kvfree_rcu() call has a slow path that
2045-
* triggers synchronize_rcu() following by freeing a pointer. It is done
2046-
* before the return from the function. Therefore for any single-argument
2047-
* call that will result in a kfree() to a cache that is to be destroyed
2048-
* during module exit, it is developer's responsibility to ensure that all
2049-
* such calls have returned before the call to kmem_cache_destroy().
2050-
*/
2051-
void kvfree_rcu_barrier(void)
2041+
static inline void __kvfree_rcu_barrier(void)
20522042
{
20532043
struct kfree_rcu_cpu_work *krwp;
20542044
struct kfree_rcu_cpu *krcp;
20552045
bool queued;
20562046
int i, cpu;
20572047

2058-
flush_all_rcu_sheaves();
2059-
20602048
/*
20612049
* Firstly we detach objects and queue them over an RCU-batch
20622050
* for all CPUs. Finally queued works are flushed for each CPU.
@@ -2118,8 +2106,43 @@ void kvfree_rcu_barrier(void)
21182106
}
21192107
}
21202108
}
2109+
2110+
/**
 * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
 *
 * Note that a single argument of kvfree_rcu() call has a slow path that
 * triggers synchronize_rcu() followed by freeing a pointer. It is done
 * before the return from the function. Therefore for any single-argument
 * call that will result in a kfree() to a cache that is to be destroyed
 * during module exit, it is developer's responsibility to ensure that all
 * such calls have returned before the call to kmem_cache_destroy().
 */
void kvfree_rcu_barrier(void)
{
	/* Drain percpu rcu_free sheaves of every cache first ... */
	flush_all_rcu_sheaves();
	/* ... then wait for all queued kvfree_rcu() batches to complete. */
	__kvfree_rcu_barrier();
}
EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
21222126

2127+
/**
2128+
* kvfree_rcu_barrier_on_cache - Wait for in-flight kvfree_rcu() calls on a
2129+
* specific slab cache.
2130+
* @s: slab cache to wait for
2131+
*
2132+
* See the description of kvfree_rcu_barrier() for details.
2133+
*/
2134+
void kvfree_rcu_barrier_on_cache(struct kmem_cache *s)
2135+
{
2136+
if (s->cpu_sheaves)
2137+
flush_rcu_sheaves_on_cache(s);
2138+
/*
2139+
* TODO: Introduce a version of __kvfree_rcu_barrier() that works
2140+
* on a specific slab cache.
2141+
*/
2142+
__kvfree_rcu_barrier();
2143+
}
2144+
EXPORT_SYMBOL_GPL(kvfree_rcu_barrier_on_cache);
2145+
21232146
static unsigned long
21242147
kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
21252148
{
@@ -2215,4 +2238,3 @@ void __init kvfree_rcu_init(void)
22152238
}
22162239

22172240
#endif /* CONFIG_KVFREE_RCU_BATCHED */
2218-

mm/slub.c

Lines changed: 30 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -4122,42 +4122,47 @@ static void flush_rcu_sheaf(struct work_struct *w)
41224122

41234123

41244124
/* needed for kvfree_rcu_barrier() */
4125-
void flush_all_rcu_sheaves(void)
4125+
void flush_rcu_sheaves_on_cache(struct kmem_cache *s)
41264126
{
41274127
struct slub_flush_work *sfw;
4128-
struct kmem_cache *s;
41294128
unsigned int cpu;
41304129

4131-
cpus_read_lock();
4132-
mutex_lock(&slab_mutex);
4130+
mutex_lock(&flush_lock);
41334131

4134-
list_for_each_entry(s, &slab_caches, list) {
4135-
if (!s->cpu_sheaves)
4136-
continue;
4132+
for_each_online_cpu(cpu) {
4133+
sfw = &per_cpu(slub_flush, cpu);
41374134

4138-
mutex_lock(&flush_lock);
4135+
/*
4136+
* we don't check if rcu_free sheaf exists - racing
4137+
* __kfree_rcu_sheaf() might have just removed it.
4138+
* by executing flush_rcu_sheaf() on the cpu we make
4139+
* sure the __kfree_rcu_sheaf() finished its call_rcu()
4140+
*/
41394141

4140-
for_each_online_cpu(cpu) {
4141-
sfw = &per_cpu(slub_flush, cpu);
4142+
INIT_WORK(&sfw->work, flush_rcu_sheaf);
4143+
sfw->s = s;
4144+
queue_work_on(cpu, flushwq, &sfw->work);
4145+
}
41424146

4143-
/*
4144-
* we don't check if rcu_free sheaf exists - racing
4145-
* __kfree_rcu_sheaf() might have just removed it.
4146-
* by executing flush_rcu_sheaf() on the cpu we make
4147-
* sure the __kfree_rcu_sheaf() finished its call_rcu()
4148-
*/
4147+
for_each_online_cpu(cpu) {
4148+
sfw = &per_cpu(slub_flush, cpu);
4149+
flush_work(&sfw->work);
4150+
}
41494151

4150-
INIT_WORK(&sfw->work, flush_rcu_sheaf);
4151-
sfw->s = s;
4152-
queue_work_on(cpu, flushwq, &sfw->work);
4153-
}
4152+
mutex_unlock(&flush_lock);
4153+
}
41544154

4155-
for_each_online_cpu(cpu) {
4156-
sfw = &per_cpu(slub_flush, cpu);
4157-
flush_work(&sfw->work);
4158-
}
4155+
void flush_all_rcu_sheaves(void)
4156+
{
4157+
struct kmem_cache *s;
4158+
4159+
cpus_read_lock();
4160+
mutex_lock(&slab_mutex);
41594161

4160-
mutex_unlock(&flush_lock);
4162+
list_for_each_entry(s, &slab_caches, list) {
4163+
if (!s->cpu_sheaves)
4164+
continue;
4165+
flush_rcu_sheaves_on_cache(s);
41614166
}
41624167

41634168
mutex_unlock(&slab_mutex);

0 commit comments

Comments
 (0)