Skip to content

Commit fd796e4

Browse files
committed
rcu-tasks: Use fewer callbacks queues if callback flood ends
By default, when lock contention is encountered, the RCU Tasks flavors of RCU switch to using per-CPU queueing. However, if the callback flood ends, per-CPU queueing continues to be used, which introduces significant additional overhead, especially for callback invocation, which fans out a series of workqueue handlers.

This commit therefore switches back to single-queue operation if at the beginning of a grace period there are very few callbacks. The definition of "very few" is set by the rcupdate.rcu_task_collapse_lim module parameter, which defaults to 10. This switch happens in two phases, with the first phase causing future callbacks to be enqueued on CPU 0's queue, but with all queues continuing to be checked for grace periods and callback invocation. The second phase checks to see if an RCU grace period has elapsed and if all remaining RCU-Tasks callbacks are queued on CPU 0. If so, only CPU 0 is checked for future grace periods and callback operation.

Of course, the return of contention anywhere during this process will result in returning to per-CPU callback queueing.

Reported-by: Martin Lau <kafai@fb.com>
Cc: Neeraj Upadhyay <neeraj.iitr10@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
1 parent 2cee078 commit fd796e4

2 files changed

Lines changed: 54 additions & 2 deletions

File tree

Documentation/admin-guide/kernel-parameters.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4805,6 +4805,14 @@
48054805
period to instead use normal non-expedited
48064806
grace-period processing.
48074807

4808+
rcupdate.rcu_task_collapse_lim= [KNL]
4809+
Set the maximum number of callbacks present
4810+
at the beginning of a grace period that allows
4811+
the RCU Tasks flavors to collapse back to using
4812+
a single callback queue. This switching only
4813+
occurs when rcupdate.rcu_task_enqueue_lim is
4814+
set to the default value of -1.
4815+
48084816
rcupdate.rcu_task_contend_lim= [KNL]
48094817
Set the minimum number of callback-queuing-time
48104818
lock-contention events per jiffy required to

kernel/rcu/tasks.h

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ struct rcu_tasks_percpu {
6868
* @percpu_enqueue_shift: Shift down CPU ID this much when enqueuing callbacks.
6969
* @percpu_enqueue_lim: Number of per-CPU callback queues in use for enqueuing.
7070
* @percpu_dequeue_lim: Number of per-CPU callback queues in use for dequeuing.
71+
* @percpu_dequeue_gpseq: RCU grace-period number to propagate enqueue limit to dequeuers.
7172
* @barrier_q_mutex: Serialize barrier operations.
7273
* @barrier_q_count: Number of queues being waited on.
7374
* @barrier_q_completion: Barrier wait/wakeup mechanism.
@@ -98,6 +99,7 @@ struct rcu_tasks {
9899
int percpu_enqueue_shift;
99100
int percpu_enqueue_lim;
100101
int percpu_dequeue_lim;
102+
unsigned long percpu_dequeue_gpseq;
101103
struct mutex barrier_q_mutex;
102104
atomic_t barrier_q_count;
103105
struct completion barrier_q_completion;
@@ -148,6 +150,8 @@ module_param(rcu_task_enqueue_lim, int, 0444);
148150
static bool rcu_task_cb_adjust;
149151
static int rcu_task_contend_lim __read_mostly = 100;
150152
module_param(rcu_task_contend_lim, int, 0444);
153+
static int rcu_task_collapse_lim __read_mostly = 10;
154+
module_param(rcu_task_collapse_lim, int, 0444);
151155

152156
/* RCU tasks grace-period state for debugging. */
153157
#define RTGS_INIT 0
@@ -269,6 +273,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
269273
rhp->next = NULL;
270274
rhp->func = func;
271275
local_irq_save(flags);
276+
rcu_read_lock();
272277
rtpcp = per_cpu_ptr(rtp->rtpcpu,
273278
smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift));
274279
if (!raw_spin_trylock_rcu_node(rtpcp)) { // irqs already disabled.
@@ -294,12 +299,13 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
294299
raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
295300
if (rtp->percpu_enqueue_lim != nr_cpu_ids) {
296301
WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids));
297-
WRITE_ONCE(rtp->percpu_enqueue_lim, nr_cpu_ids);
302+
WRITE_ONCE(rtp->percpu_dequeue_lim, nr_cpu_ids);
298303
smp_store_release(&rtp->percpu_enqueue_lim, nr_cpu_ids);
299304
pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name);
300305
}
301306
raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
302307
}
308+
rcu_read_unlock();
303309
/* We can't create the thread unless interrupts are enabled. */
304310
if (needwake && READ_ONCE(rtp->kthread_ptr))
305311
irq_work_queue(&rtpcp->rtp_irq_work);
@@ -369,15 +375,25 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
369375
{
370376
int cpu;
371377
unsigned long flags;
378+
long n;
379+
long ncbs = 0;
380+
long ncbsnz = 0;
372381
int needgpcb = 0;
373382

374383
for (cpu = 0; cpu < smp_load_acquire(&rtp->percpu_dequeue_lim); cpu++) {
375384
struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
376385

377386
/* Advance and accelerate any new callbacks. */
378-
if (rcu_segcblist_empty(&rtpcp->cblist))
387+
if (!rcu_segcblist_n_cbs(&rtpcp->cblist))
379388
continue;
380389
raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
390+
// Should we shrink down to a single callback queue?
391+
n = rcu_segcblist_n_cbs(&rtpcp->cblist);
392+
if (n) {
393+
ncbs += n;
394+
if (cpu > 0)
395+
ncbsnz += n;
396+
}
381397
rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq));
382398
(void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq));
383399
if (rcu_segcblist_pend_cbs(&rtpcp->cblist))
@@ -386,6 +402,34 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
386402
needgpcb |= 0x1;
387403
raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
388404
}
405+
406+
// Shrink down to a single callback queue if appropriate.
407+
// This is done in two stages: (1) If there are no more than
408+
// rcu_task_collapse_lim callbacks on CPU 0 and none on any other
409+
// CPU, limit enqueueing to CPU 0. (2) After an RCU grace period,
410+
// if there has not been an increase in callbacks, limit dequeuing
411+
// to CPU 0. Note the matching RCU read-side critical section in
412+
// call_rcu_tasks_generic().
413+
if (rcu_task_cb_adjust && ncbs <= rcu_task_collapse_lim) {
414+
raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
415+
if (rtp->percpu_enqueue_lim > 1) {
416+
WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids));
417+
smp_store_release(&rtp->percpu_enqueue_lim, 1);
418+
rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu();
419+
pr_info("Starting switch %s to CPU-0 callback queuing.\n", rtp->name);
420+
}
421+
raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
422+
}
423+
if (rcu_task_cb_adjust && !ncbsnz &&
424+
poll_state_synchronize_rcu(rtp->percpu_dequeue_gpseq)) {
425+
raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
426+
if (rtp->percpu_enqueue_lim < rtp->percpu_dequeue_lim) {
427+
WRITE_ONCE(rtp->percpu_dequeue_lim, 1);
428+
pr_info("Completing switch %s to CPU-0 callback queuing.\n", rtp->name);
429+
}
430+
raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
431+
}
432+
389433
return needgpcb;
390434
}
391435

0 commit comments

Comments (0)