Commit 3cb278e

joelagnel authored and paulmckrcu committed
rcu: Make call_rcu() lazy to save power
Implement timer-based RCU callback batching (also known as lazy callbacks). With this we save about 5-10% of the power consumed by RCU requests that happen when the system is lightly loaded or idle.

By default, all async callbacks (queued via call_rcu) are marked lazy. An alternate API, call_rcu_hurry(), is provided for the few users, for example synchronize_rcu(), that need the old behavior.

The batch is flushed whenever a certain amount of time has passed, or the batch on a particular CPU grows too big. Memory pressure will also flush it in a future patch.

To handle several corner cases automagically (such as rcu_barrier() and hotplug), we reuse the bypass lists, which were originally introduced to address lock contention, to handle lazy CBs as well. The bypass list length includes the lazy CB length. A separate lazy CB length counter is also introduced to keep track of the number of lazy CBs.

[ paulmck: Fix formatting of inline call_rcu_lazy() definition. ]
[ paulmck: Apply Zqiang feedback. ]
[ paulmck: Apply s/call_rcu_flush/call_rcu_hurry/ feedback from Tejun Heo. ]

Suggested-by: Paul McKenney <paulmck@kernel.org>
Acked-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
1 parent b8f7aca commit 3cb278e

8 files changed

Lines changed: 246 additions & 82 deletions
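
Editorial note: the effect of the API split is easiest to see from the caller's side. Below is a minimal, hypothetical sketch (struct foo, foo_free_rcu(), and the two release helpers are invented names; only call_rcu() and call_rcu_hurry() come from this commit): keep the now-lazy call_rcu() for ordinary deferred reclaim, and use call_rcu_hurry() only where something waits on the callback.

#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
        struct list_head list;
        struct rcu_head rcu;
};

static void foo_free_rcu(struct rcu_head *head)
{
        kfree(container_of(head, struct foo, rcu));
}

/* Ordinary async reclaim: the memory can wait, so let the callback batch. */
static void foo_release(struct foo *fp)
{
        list_del_rcu(&fp->list);
        call_rcu(&fp->rcu, foo_free_rcu);       /* lazy by default under CONFIG_RCU_LAZY */
}

/* Something waits on this reclaim: flush the lazy batch instead. */
static void foo_release_urgent(struct foo *fp)
{
        list_del_rcu(&fp->list);
        call_rcu_hurry(&fp->rcu, foo_free_rcu); /* grace period starts promptly */
}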

include/linux/rcupdate.h

Lines changed: 9 additions & 0 deletions
@@ -108,6 +108,15 @@ static inline int rcu_preempt_depth(void)
 
 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
 
+#ifdef CONFIG_RCU_LAZY
+void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func);
+#else
+static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
+{
+	call_rcu(head, func);
+}
+#endif
+
 /* Internal to kernel */
 void rcu_init(void);
 extern int rcu_scheduler_active;

kernel/rcu/Kconfig

Lines changed: 8 additions & 0 deletions
@@ -311,4 +311,12 @@ config TASKS_TRACE_RCU_READ_MB
 	  Say N here if you hate read-side memory barriers.
 	  Take the default if you are unsure.
 
+config RCU_LAZY
+	bool "RCU callback lazy invocation functionality"
+	depends on RCU_NOCB_CPU
+	default n
+	help
+	  To save power, batch RCU callbacks and flush after delay, memory
+	  pressure, or callback list growing too big.
+
 endmenu # "RCU Subsystem"
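
As the depends line above indicates, lazy batching is only available on offloaded (no-CBs) CPUs. A hypothetical .config fragment for trying out the feature (the option stays default-off):

CONFIG_RCU_NOCB_CPU=y
CONFIG_RCU_LAZY=y

The kernel must then be booted with callbacks offloaded on the CPUs of interest, for example with rcu_nocbs=0-7 on the command line (the CPU range here is an example value).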

kernel/rcu/rcu.h

Lines changed: 8 additions & 0 deletions
@@ -474,6 +474,14 @@ enum rcutorture_type {
 	INVALID_RCU_FLAVOR
 };
 
+#if defined(CONFIG_RCU_LAZY)
+unsigned long rcu_lazy_get_jiffies_till_flush(void);
+void rcu_lazy_set_jiffies_till_flush(unsigned long j);
+#else
+static inline unsigned long rcu_lazy_get_jiffies_till_flush(void) { return 0; }
+static inline void rcu_lazy_set_jiffies_till_flush(unsigned long j) { }
+#endif
+
 #if defined(CONFIG_TREE_RCU)
 void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
 			    unsigned long *gp_seq);
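
These accessors expose the lazy flush timeout to the rest of RCU, with stubs that compile away when CONFIG_RCU_LAZY is off. A hypothetical sketch of how in-kernel test code might use them (test_with_short_flush_timeout() is an invented name):

/* Run test_fn() with a roughly 100 ms lazy-flush timeout, then restore it. */
static void test_with_short_flush_timeout(void (*test_fn)(void))
{
        unsigned long orig = rcu_lazy_get_jiffies_till_flush();

        rcu_lazy_set_jiffies_till_flush(HZ / 10);
        test_fn();
        rcu_lazy_set_jiffies_till_flush(orig);
}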

kernel/rcu/tiny.c

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@ static struct rcu_ctrlblk rcu_ctrlblk = {
 
 void rcu_barrier(void)
 {
-	wait_rcu_gp(call_rcu);
+	wait_rcu_gp(call_rcu_hurry);
 }
 EXPORT_SYMBOL(rcu_barrier);
 
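
Why the hurry here: wait_rcu_gp() queues a callback and sleeps until it is invoked, so a lazy callback would park the synchronous caller for the whole batching delay. A simplified sketch of that wait pattern, assuming the kernel's existing struct rcu_synchronize and wakeme_after_rcu() helper (wait_one_gp() is an invented name; the real wait_rcu_gp() is more general). The same reasoning drives the synchronize_rcu() and synchronize_rcu_expedited() changes below.

#include <linux/completion.h>
#include <linux/rcupdate_wait.h>

static void wait_one_gp(void)
{
        struct rcu_synchronize rs;

        init_rcu_head_on_stack(&rs.head);
        init_completion(&rs.completion);
        /* A lazy callback here could stall this thread until the batch flushes. */
        call_rcu_hurry(&rs.head, wakeme_after_rcu);
        wait_for_completion(&rs.completion);
        destroy_rcu_head_on_stack(&rs.head);
}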

kernel/rcu/tree.c

Lines changed: 83 additions & 46 deletions
@@ -2728,47 +2728,8 @@ static void check_cb_ovld(struct rcu_data *rdp)
 	raw_spin_unlock_rcu_node(rnp);
 }
 
-/**
- * call_rcu() - Queue an RCU callback for invocation after a grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual callback function to be invoked after the grace period
- *
- * The callback function will be invoked some time after a full grace
- * period elapses, in other words after all pre-existing RCU read-side
- * critical sections have completed. However, the callback function
- * might well execute concurrently with RCU read-side critical sections
- * that started after call_rcu() was invoked.
- *
- * RCU read-side critical sections are delimited by rcu_read_lock()
- * and rcu_read_unlock(), and may be nested. In addition, but only in
- * v5.0 and later, regions of code across which interrupts, preemption,
- * or softirqs have been disabled also serve as RCU read-side critical
- * sections. This includes hardware interrupt handlers, softirq handlers,
- * and NMI handlers.
- *
- * Note that all CPUs must agree that the grace period extended beyond
- * all pre-existing RCU read-side critical section. On systems with more
- * than one CPU, this means that when "func()" is invoked, each CPU is
- * guaranteed to have executed a full memory barrier since the end of its
- * last RCU read-side critical section whose beginning preceded the call
- * to call_rcu(). It also means that each CPU executing an RCU read-side
- * critical section that continues beyond the start of "func()" must have
- * executed a memory barrier after the call_rcu() but before the beginning
- * of that RCU read-side critical section. Note that these guarantees
- * include CPUs that are offline, idle, or executing in user mode, as
- * well as CPUs that are executing in the kernel.
- *
- * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
- * resulting RCU callback function "func()", then both CPU A and CPU B are
- * guaranteed to execute a full memory barrier during the time interval
- * between the call to call_rcu() and the invocation of "func()" -- even
- * if CPU A and CPU B are the same CPU (but again only if the system has
- * more than one CPU).
- *
- * Implementation of these memory-ordering guarantees is described here:
- * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
- */
-void call_rcu(struct rcu_head *head, rcu_callback_t func)
+static void
+__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy)
 {
 	static atomic_t doublefrees;
 	unsigned long flags;
@@ -2809,7 +2770,7 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
 	}
 
 	check_cb_ovld(rdp);
-	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
+	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy))
 		return; // Enqueued onto ->nocb_bypass, so just leave.
 	// If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
 	rcu_segcblist_enqueue(&rdp->cblist, head);
@@ -2831,8 +2792,84 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
 		local_irq_restore(flags);
 	}
 }
-EXPORT_SYMBOL_GPL(call_rcu);
 
+#ifdef CONFIG_RCU_LAZY
+/**
+ * call_rcu_hurry() - Queue RCU callback for invocation after grace period, and
+ * flush all lazy callbacks (including the new one) to the main ->cblist while
+ * doing so.
+ *
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all pre-existing RCU read-side
+ * critical sections have completed.
+ *
+ * Use this API instead of call_rcu() if you don't want the callback to be
+ * invoked after very long periods of time, which can happen on systems without
+ * memory pressure and on systems which are lightly loaded or mostly idle.
+ * This function will cause callbacks to be invoked sooner than later at the
+ * expense of extra power. Other than that, this function is identical to, and
+ * reuses call_rcu()'s logic. Refer to call_rcu() for more details about memory
+ * ordering and other functionality.
+ */
+void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
+{
+	return __call_rcu_common(head, func, false);
+}
+EXPORT_SYMBOL_GPL(call_rcu_hurry);
+#endif
+
+/**
+ * call_rcu() - Queue an RCU callback for invocation after a grace period.
+ * By default the callbacks are 'lazy' and are kept hidden from the main
+ * ->cblist to prevent starting of grace periods too soon.
+ * If you desire grace periods to start very soon, use call_rcu_hurry().
+ *
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all pre-existing RCU read-side
+ * critical sections have completed. However, the callback function
+ * might well execute concurrently with RCU read-side critical sections
+ * that started after call_rcu() was invoked.
+ *
+ * RCU read-side critical sections are delimited by rcu_read_lock()
+ * and rcu_read_unlock(), and may be nested. In addition, but only in
+ * v5.0 and later, regions of code across which interrupts, preemption,
+ * or softirqs have been disabled also serve as RCU read-side critical
+ * sections. This includes hardware interrupt handlers, softirq handlers,
+ * and NMI handlers.
+ *
+ * Note that all CPUs must agree that the grace period extended beyond
+ * all pre-existing RCU read-side critical section. On systems with more
+ * than one CPU, this means that when "func()" is invoked, each CPU is
+ * guaranteed to have executed a full memory barrier since the end of its
+ * last RCU read-side critical section whose beginning preceded the call
+ * to call_rcu(). It also means that each CPU executing an RCU read-side
+ * critical section that continues beyond the start of "func()" must have
+ * executed a memory barrier after the call_rcu() but before the beginning
+ * of that RCU read-side critical section. Note that these guarantees
+ * include CPUs that are offline, idle, or executing in user mode, as
+ * well as CPUs that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
+ * resulting RCU callback function "func()", then both CPU A and CPU B are
+ * guaranteed to execute a full memory barrier during the time interval
+ * between the call to call_rcu() and the invocation of "func()" -- even
+ * if CPU A and CPU B are the same CPU (but again only if the system has
+ * more than one CPU).
+ *
+ * Implementation of these memory-ordering guarantees is described here:
+ * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
+ */
+void call_rcu(struct rcu_head *head, rcu_callback_t func)
+{
+	return __call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY));
+}
+EXPORT_SYMBOL_GPL(call_rcu);
 
 /* Maximum number of jiffies to wait before draining a batch. */
 #define KFREE_DRAIN_JIFFIES (5 * HZ)
@@ -3507,7 +3544,7 @@ void synchronize_rcu(void)
 		if (rcu_gp_is_expedited())
 			synchronize_rcu_expedited();
 		else
-			wait_rcu_gp(call_rcu);
+			wait_rcu_gp(call_rcu_hurry);
 		return;
 	}
 
@@ -3910,7 +3947,7 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
 	 * if it's fully lazy.
 	 */
 	was_alldone = rcu_rdp_is_offloaded(rdp) && !rcu_segcblist_pend_cbs(&rdp->cblist);
-	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
+	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false));
 	wake_nocb = was_alldone && rcu_segcblist_pend_cbs(&rdp->cblist);
 	if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
 		atomic_inc(&rcu_state.barrier_cpu_count);
@@ -4336,7 +4373,7 @@ void rcutree_migrate_callbacks(int cpu)
 	my_rdp = this_cpu_ptr(&rcu_data);
 	my_rnp = my_rdp->mynode;
 	rcu_nocb_lock(my_rdp); /* irqs already disabled. */
-	WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies));
+	WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies, false));
 	raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */
 	/* Leverage recent GPs and set GP for new callbacks. */
 	needwake = rcu_advance_cbs(my_rnp, rdp) ||
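
The memory-ordering guarantees restated in the call_rcu() kernel-doc above are what make the usual publish-then-reclaim pattern safe whether or not the callback is lazy; laziness only delays reclaim, never a reader-visible free. A hypothetical illustration (struct blah, gp, gp_lock, and the helpers are invented names):

#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct blah {
        int a;
        struct rcu_head rcu;
};

static struct blah __rcu *gp;
static DEFINE_MUTEX(gp_lock);

static void blah_free_rcu(struct rcu_head *head)
{
        kfree(container_of(head, struct blah, rcu));
}

/* Reader: runs concurrently with updaters; sees old or new, never freed. */
static int read_a(void)
{
        struct blah *p;
        int ret = 0;

        rcu_read_lock();
        p = rcu_dereference(gp);
        if (p)
                ret = p->a;
        rcu_read_unlock();
        return ret;
}

/* Updater: unpublish first, then defer the free past all current readers. */
static void publish(struct blah *newp)
{
        struct blah *old;

        mutex_lock(&gp_lock);
        old = rcu_dereference_protected(gp, lockdep_is_held(&gp_lock));
        rcu_assign_pointer(gp, newp);
        mutex_unlock(&gp_lock);
        if (old)
                call_rcu(&old->rcu, blah_free_rcu);     /* lazy reclaim is fine here */
}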

kernel/rcu/tree.h

Lines changed: 7 additions & 4 deletions
@@ -263,14 +263,16 @@ struct rcu_data {
 	unsigned long last_fqs_resched;	/* Time of last rcu_resched(). */
 	unsigned long last_sched_clock;	/* Jiffies of last rcu_sched_clock_irq(). */
 
+	long lazy_len;			/* Length of buffered lazy callbacks. */
 	int cpu;
 };
 
 /* Values for nocb_defer_wakeup field in struct rcu_data. */
 #define RCU_NOCB_WAKE_NOT	0
 #define RCU_NOCB_WAKE_BYPASS	1
-#define RCU_NOCB_WAKE		2
-#define RCU_NOCB_WAKE_FORCE	3
+#define RCU_NOCB_WAKE_LAZY	2
+#define RCU_NOCB_WAKE		3
+#define RCU_NOCB_WAKE_FORCE	4
 
 #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
 					/* For jiffies_till_first_fqs and */
@@ -441,9 +443,10 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
 static void rcu_init_one_nocb(struct rcu_node *rnp);
 static bool wake_nocb_gp(struct rcu_data *rdp, bool force);
 static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-				  unsigned long j);
+				  unsigned long j, bool lazy);
 static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-				bool *was_alldone, unsigned long flags);
+				bool *was_alldone, unsigned long flags,
+				bool lazy);
 static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
 				 unsigned long flags);
 static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level);

kernel/rcu/tree_exp.h

Lines changed: 1 addition & 1 deletion
@@ -937,7 +937,7 @@ void synchronize_rcu_expedited(void)
 
 	/* If expedited grace periods are prohibited, fall back to normal. */
 	if (rcu_gp_is_normal()) {
-		wait_rcu_gp(call_rcu);
+		wait_rcu_gp(call_rcu_hurry);
 		return;
 	}
