Skip to content

Commit 6d36714

Browse files
committed
Merge branch 'pm-cpuidle'
Merge cpuidle updates for 6.20-rc1/7.0-rc1:

 - Add a command line option to adjust the C-states table in the intel_idle driver, remove the 'preferred_cstates' module parameter from it, add C-states validation to it and clean it up (Artem Bityutskiy)

 - Make the menu cpuidle governor always check the time till the closest timer event when the scheduler tick has been stopped to prevent it from mistakenly selecting the deepest available idle state (Rafael Wysocki)

 - Update the teo cpuidle governor to avoid making suboptimal decisions in certain corner cases and generally improve idle state selection accuracy (Rafael Wysocki)

 - Remove an unlikely() annotation on the early-return condition in menu_select() that leads to branch misprediction 100% of the time on systems with only 1 idle state enabled, like ARM64 servers (Breno Leitao)

 - Add Christian Loehle to MAINTAINERS as a cpuidle reviewer (Christian Loehle)

* pm-cpuidle:
  cpuidle: governors: teo: Refine intercepts-based idle state lookup
  cpuidle: governors: teo: Adjust the classification of wakeup events
  cpuidle: governors: teo: Refine tick_intercepts vs total events check
  cpuidle: governors: teo: Avoid fake intercepts produced by tick
  cpuidle: governors: teo: Avoid selecting states with zero-size bins
  cpuidle: governors: menu: Always check timers with tick stopped
  MAINTAINERS: Add myself as cpuidle reviewer
  cpuidle: menu: Remove incorrect unlikely() annotation
  intel_idle: Add C-states validation
  intel_idle: Add cmdline option to adjust C-states table
  intel_idle: Initialize sysfs after cpuidle driver initialization
  intel_idle: Remove the 'preferred_cstates' parameter
  intel_idle: Remove unused driver version constant
2 parents 3bd1cde + a971f98 commit 6d36714

4 files changed

Lines changed: 317 additions & 74 deletions

File tree

MAINTAINERS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6561,6 +6561,7 @@ F: rust/kernel/cpu.rs
65616561
CPU IDLE TIME MANAGEMENT FRAMEWORK
65626562
M: "Rafael J. Wysocki" <rafael@kernel.org>
65636563
M: Daniel Lezcano <daniel.lezcano@linaro.org>
6564+
R: Christian Loehle <christian.loehle@arm.com>
65646565
L: linux-pm@vger.kernel.org
65656566
S: Maintained
65666567
B: https://bugzilla.kernel.org

drivers/cpuidle/governors/menu.c

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
239239

240240
/* Find the shortest expected idle interval. */
241241
predicted_ns = get_typical_interval(data) * NSEC_PER_USEC;
242-
if (predicted_ns > RESIDENCY_THRESHOLD_NS) {
242+
if (predicted_ns > RESIDENCY_THRESHOLD_NS || tick_nohz_tick_stopped()) {
243243
unsigned int timer_us;
244244

245245
/* Determine the time till the closest timer. */
@@ -259,6 +259,16 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
259259
RESOLUTION * DECAY * NSEC_PER_USEC);
260260
/* Use the lowest expected idle interval to pick the idle state. */
261261
predicted_ns = min((u64)timer_us * NSEC_PER_USEC, predicted_ns);
262+
/*
263+
* If the tick is already stopped, the cost of possible short
264+
* idle duration misprediction is much higher, because the CPU
265+
* may be stuck in a shallow idle state for a long time as a
266+
* result of it. In that case, say we might mispredict and use
267+
* the known time till the closest timer event for the idle
268+
* state selection.
269+
*/
270+
if (tick_nohz_tick_stopped() && predicted_ns < TICK_NSEC)
271+
predicted_ns = data->next_timer_ns;
262272
} else {
263273
/*
264274
* Because the next timer event is not going to be determined
@@ -271,7 +281,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
271281
data->bucket = BUCKETS - 1;
272282
}
273283

274-
if (unlikely(drv->state_count <= 1 || latency_req == 0) ||
284+
if (drv->state_count <= 1 || latency_req == 0 ||
275285
((data->next_timer_ns < drv->states[1].target_residency_ns ||
276286
latency_req < drv->states[1].exit_latency_ns) &&
277287
!dev->states_usage[0].disable)) {
@@ -284,16 +294,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
284294
return 0;
285295
}
286296

287-
/*
288-
* If the tick is already stopped, the cost of possible short idle
289-
* duration misprediction is much higher, because the CPU may be stuck
290-
* in a shallow idle state for a long time as a result of it. In that
291-
* case, say we might mispredict and use the known time till the closest
292-
* timer event for the idle state selection.
293-
*/
294-
if (tick_nohz_tick_stopped() && predicted_ns < TICK_NSEC)
295-
predicted_ns = data->next_timer_ns;
296-
297297
/*
298298
* Find the idle state with the lowest power while satisfying
299299
* our constraints.

drivers/cpuidle/governors/teo.c

Lines changed: 79 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -48,12 +48,11 @@
4848
* in accordance with what happened last time.
4949
*
5050
* The "hits" metric reflects the relative frequency of situations in which the
51-
* sleep length and the idle duration measured after CPU wakeup fall into the
52-
* same bin (that is, the CPU appears to wake up "on time" relative to the sleep
53-
* length). In turn, the "intercepts" metric reflects the relative frequency of
54-
* non-timer wakeup events for which the measured idle duration falls into a bin
55-
* that corresponds to an idle state shallower than the one whose bin is fallen
56-
* into by the sleep length (these events are also referred to as "intercepts"
51+
* sleep length and the idle duration measured after CPU wakeup are close enough
52+
* (that is, the CPU appears to wake up "on time" relative to the sleep length).
53+
* In turn, the "intercepts" metric reflects the relative frequency of non-timer
54+
* wakeup events for which the measured idle duration is significantly different
55+
* from the sleep length (these events are also referred to as "intercepts"
5756
* below).
5857
*
5958
* The governor also counts "intercepts" with the measured idle duration below
@@ -75,12 +74,17 @@
7574
* than the candidate one (it represents the cases in which the CPU was
7675
* likely woken up by a non-timer wakeup source).
7776
*
77+
* Also find the idle state with the maximum intercepts metric (if there are
78+
* multiple states with the maximum intercepts metric, choose the one with
79+
* the highest index).
80+
*
7881
* 2. If the second sum computed in step 1 is greater than a half of the sum of
7982
* both metrics for the candidate state bin and all subsequent bins (if any),
8083
* a shallower idle state is likely to be more suitable, so look for it.
8184
*
8285
* - Traverse the enabled idle states shallower than the candidate one in the
83-
* descending order.
86+
* descending order, starting at the state with the maximum intercepts
87+
* metric found in step 1.
8488
*
8589
* - For each of them compute the sum of the "intercepts" metrics over all
8690
* of the idle states between it and the candidate one (including the
@@ -167,6 +171,7 @@ static void teo_decay(unsigned int *metric)
167171
*/
168172
static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
169173
{
174+
s64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns;
170175
struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus);
171176
int i, idx_timer = 0, idx_duration = 0;
172177
s64 target_residency_ns, measured_ns;
@@ -182,8 +187,6 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
182187
*/
183188
measured_ns = S64_MAX;
184189
} else {
185-
s64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns;
186-
187190
measured_ns = dev->last_residency_ns;
188191
/*
189192
* The delay between the wakeup and the first instruction
@@ -239,15 +242,31 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
239242
cpu_data->state_bins[drv->state_count-1].hits += PULSE;
240243
return;
241244
}
245+
/*
246+
* If intercepts within the tick period range are not frequent
247+
* enough, count this wakeup as a hit, since it is likely that
248+
* the tick has woken up the CPU because an expected intercept
249+
* was not there. Otherwise, one of the intercepts may have
250+
* been incidentally preceded by the tick wakeup.
251+
*/
252+
if (3 * cpu_data->tick_intercepts < 2 * total) {
253+
cpu_data->state_bins[idx_timer].hits += PULSE;
254+
return;
255+
}
242256
}
243257

244258
/*
245-
* If the measured idle duration falls into the same bin as the sleep
246-
* length, this is a "hit", so update the "hits" metric for that bin.
259+
* If the measured idle duration (adjusted for the entered state exit
260+
* latency) falls into the same bin as the sleep length and the latter
261+
* is less than the "raw" measured idle duration (so the wakeup appears
262+
* to have occurred after the anticipated timer event), this is a "hit",
263+
* so update the "hits" metric for that bin.
264+
*
247265
* Otherwise, update the "intercepts" metric for the bin fallen into by
248266
* the measured idle duration.
249267
*/
250-
if (idx_timer == idx_duration) {
268+
if (idx_timer == idx_duration &&
269+
cpu_data->sleep_length_ns - measured_ns < lat_ns / 2) {
251270
cpu_data->state_bins[idx_timer].hits += PULSE;
252271
} else {
253272
cpu_data->state_bins[idx_duration].intercepts += PULSE;
@@ -294,8 +313,10 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
294313
ktime_t delta_tick = TICK_NSEC / 2;
295314
unsigned int idx_intercept_sum = 0;
296315
unsigned int intercept_sum = 0;
316+
unsigned int intercept_max = 0;
297317
unsigned int idx_hit_sum = 0;
298318
unsigned int hit_sum = 0;
319+
int intercept_max_idx = -1;
299320
int constraint_idx = 0;
300321
int idx0 = 0, idx = -1;
301322
s64 duration_ns;
@@ -326,17 +347,32 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
326347
if (!dev->states_usage[0].disable)
327348
idx = 0;
328349

329-
/* Compute the sums of metrics for early wakeup pattern detection. */
350+
/*
351+
* Compute the sums of metrics for early wakeup pattern detection and
352+
* look for the state bin with the maximum intercepts metric below the
353+
* deepest enabled one (if there are multiple states with the maximum
354+
* intercepts metric, choose the one with the highest index).
355+
*/
330356
for (i = 1; i < drv->state_count; i++) {
331357
struct teo_bin *prev_bin = &cpu_data->state_bins[i-1];
358+
unsigned int prev_intercepts = prev_bin->intercepts;
332359
struct cpuidle_state *s = &drv->states[i];
333360

334361
/*
335362
* Update the sums of idle state metrics for all of the states
336363
* shallower than the current one.
337364
*/
338-
intercept_sum += prev_bin->intercepts;
339365
hit_sum += prev_bin->hits;
366+
intercept_sum += prev_intercepts;
367+
/*
368+
* Check if this is the bin with the maximum number of
369+
* intercepts so far and in that case update the index of
370+
* the state with the maximum intercepts metric.
371+
*/
372+
if (prev_intercepts >= intercept_max) {
373+
intercept_max = prev_intercepts;
374+
intercept_max_idx = i - 1;
375+
}
340376

341377
if (dev->states_usage[i].disable)
342378
continue;
@@ -388,12 +424,34 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
388424
while (min_idx < idx &&
389425
drv->states[min_idx].target_residency_ns < TICK_NSEC)
390426
min_idx++;
427+
428+
/*
429+
* Avoid selecting a state with a lower index, but with
430+
* the same target residency as the current candidate
431+
* one.
432+
*/
433+
if (drv->states[min_idx].target_residency_ns ==
434+
drv->states[idx].target_residency_ns)
435+
goto constraint;
436+
}
437+
438+
/*
439+
* If the minimum state index is greater than or equal to the
440+
* index of the state with the maximum intercepts metric and
441+
* the corresponding state is enabled, there is no need to look
442+
* at the deeper states.
443+
*/
444+
if (min_idx >= intercept_max_idx &&
445+
!dev->states_usage[min_idx].disable) {
446+
idx = min_idx;
447+
goto constraint;
391448
}
392449

393450
/*
394-
* Look for the deepest idle state whose target residency had
395-
* not exceeded the idle duration in over a half of the relevant
396-
* cases in the past.
451+
* Look for the deepest enabled idle state, at most as deep as
452+
* the one with the maximum intercepts metric, whose target
453+
* residency had not been greater than the idle duration in over
454+
* a half of the relevant cases in the past.
397455
*
398456
* Take the possible duration limitation present if the tick
399457
* has been stopped already into account.
@@ -405,11 +463,13 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
405463
continue;
406464

407465
idx = i;
408-
if (2 * intercept_sum > idx_intercept_sum)
466+
if (2 * intercept_sum > idx_intercept_sum &&
467+
i <= intercept_max_idx)
409468
break;
410469
}
411470
}
412471

472+
constraint:
413473
/*
414474
* If there is a latency constraint, it may be necessary to select an
415475
* idle state shallower than the current candidate one.
@@ -464,7 +524,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
464524
* total wakeup events, do not stop the tick.
465525
*/
466526
if (drv->states[idx].target_residency_ns < TICK_NSEC &&
467-
cpu_data->tick_intercepts > cpu_data->total / 2 + cpu_data->total / 8)
527+
3 * cpu_data->tick_intercepts >= 2 * cpu_data->total)
468528
duration_ns = TICK_NSEC / 2;
469529

470530
end:

0 commit comments

Comments
 (0)