Skip to content

Commit bf7ae17

Browse files
committed
Merge branches 'pm-cpuidle' and 'pm-powercap'
Merge cpuidle and power capping updates for 6.19-rc1: - Use residency threshold in polling state override decisions in the menu cpuidle governor (Aboorva Devarajan) - Add sanity check for exit latency and target residency in the cpuidle core (Rafael Wysocki) - Use this_cpu_ptr() where possible in the teo governor (Christian Loehle) - Rework the handling of tick wakeups in the teo cpuidle governor to increase the likelihood of stopping the scheduler tick in the cases when tick wakeups can be counted as non-timer ones (Rafael Wysocki) - Fix a reversed condition in the teo cpuidle governor and drop a misguided target residency check from it (Rafael Wysocki) - Clean up multiple minor defects in the teo cpuidle governor (Rafael Wysocki) - Update header inclusion to make it follow the Include What You Use principle (Andy Shevchenko) - Enable MSR-based RAPL PMU support in the intel_rapl power capping driver and arrange for using it on the Panther Lake and Wildcat Lake processors (Kuppuswamy Sathyanarayanan) - Add support for Nova Lake and Wildcat Lake processors to the intel_rapl power capping driver (Kaushlendra Kumar, Srinivas Pandruvada) * pm-cpuidle: cpuidle: Warn instead of bailing out if target residency check fails cpuidle: Update header inclusion cpuidle: governors: teo: Add missing space to the description cpuidle: governors: teo: Simplify intercepts-based state lookup cpuidle: governors: teo: Fix tick_intercepts handling in teo_update() cpuidle: governors: teo: Rework the handling of tick wakeups cpuidle: governors: teo: Decay metrics below DECAY_SHIFT threshold cpuidle: governors: teo: Use s64 consistently in teo_update() cpuidle: governors: teo: Drop redundant function parameter cpuidle: governors: teo: Drop misguided target residency check cpuidle: teo: Use this_cpu_ptr() where possible cpuidle: Add sanity check for exit latency and target residency cpuidle: menu: Use residency threshold in polling state override decisions * pm-powercap: powercap: intel_rapl: 
Enable MSR-based RAPL PMU support powercap: intel_rapl: Prepare read_raw() interface for atomic-context callers powercap: intel_rapl: Add support for Nova Lake processors powercap: intel_rapl: Add support for Wildcat Lake platform
3 parents 1fe2523 + 4bf944f + 748d6ba commit bf7ae17

9 files changed

Lines changed: 156 additions & 114 deletions

File tree

drivers/cpuidle/driver.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
* This code is licenced under the GPL.
99
*/
1010

11+
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12+
1113
#include <linux/mutex.h>
1214
#include <linux/module.h>
1315
#include <linux/sched.h>
@@ -193,6 +195,14 @@ static void __cpuidle_driver_init(struct cpuidle_driver *drv)
193195
s->exit_latency_ns = 0;
194196
else
195197
s->exit_latency = div_u64(s->exit_latency_ns, NSEC_PER_USEC);
198+
199+
/*
200+
* Warn if the exit latency of a CPU idle state exceeds its
201+
* target residency which is assumed to never happen in cpuidle
202+
* in multiple places.
203+
*/
204+
if (s->exit_latency_ns > s->target_residency_ns)
205+
pr_warn("Idle state %d target residency too low\n", i);
196206
}
197207
}
198208

drivers/cpuidle/governors/menu.c

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -317,12 +317,13 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
317317
}
318318

319319
/*
320-
* Use a physical idle state, not busy polling, unless a timer
321-
* is going to trigger soon enough or the exit latency of the
322-
* idle state in question is greater than the predicted idle
323-
* duration.
320+
* Use a physical idle state instead of busy polling so long as
321+
* its target residency is below the residency threshold, its
322+
* exit latency is not greater than the predicted idle duration,
323+
* and the next timer doesn't expire soon.
324324
*/
325325
if ((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) &&
326+
s->target_residency_ns < RESIDENCY_THRESHOLD_NS &&
326327
s->target_residency_ns <= data->next_timer_ns &&
327328
s->exit_latency_ns <= predicted_ns) {
328329
predicted_ns = s->target_residency_ns;

drivers/cpuidle/governors/teo.c

Lines changed: 72 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@
7676
* likely woken up by a non-timer wakeup source).
7777
*
7878
* 2. If the second sum computed in step 1 is greater than a half of the sum of
79-
* both metrics for the candidate state bin and all subsequent bins(if any),
79+
* both metrics for the candidate state bin and all subsequent bins (if any),
8080
* a shallower idle state is likely to be more suitable, so look for it.
8181
*
8282
* - Traverse the enabled idle states shallower than the candidate one in the
@@ -133,43 +133,56 @@ struct teo_bin {
133133
* @sleep_length_ns: Time till the closest timer event (at the selection time).
134134
* @state_bins: Idle state data bins for this CPU.
135135
* @total: Grand total of the "intercepts" and "hits" metrics for all bins.
136+
* @total_tick: Wakeups by the scheduler tick.
136137
* @tick_intercepts: "Intercepts" before TICK_NSEC.
137138
* @short_idles: Wakeups after short idle periods.
138-
* @artificial_wakeup: Set if the wakeup has been triggered by a safety net.
139+
* @tick_wakeup: Set if the last wakeup was by the scheduler tick.
139140
*/
140141
struct teo_cpu {
141142
s64 sleep_length_ns;
142143
struct teo_bin state_bins[CPUIDLE_STATE_MAX];
143144
unsigned int total;
145+
unsigned int total_tick;
144146
unsigned int tick_intercepts;
145147
unsigned int short_idles;
146-
bool artificial_wakeup;
148+
bool tick_wakeup;
147149
};
148150

149151
static DEFINE_PER_CPU(struct teo_cpu, teo_cpus);
150152

153+
static void teo_decay(unsigned int *metric)
154+
{
155+
unsigned int delta = *metric >> DECAY_SHIFT;
156+
157+
if (delta)
158+
*metric -= delta;
159+
else
160+
*metric = 0;
161+
}
162+
151163
/**
152164
* teo_update - Update CPU metrics after wakeup.
153165
* @drv: cpuidle driver containing state data.
154166
* @dev: Target CPU.
155167
*/
156168
static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
157169
{
158-
struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
170+
struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus);
159171
int i, idx_timer = 0, idx_duration = 0;
160-
s64 target_residency_ns;
161-
u64 measured_ns;
172+
s64 target_residency_ns, measured_ns;
173+
unsigned int total = 0;
162174

163-
cpu_data->short_idles -= cpu_data->short_idles >> DECAY_SHIFT;
175+
teo_decay(&cpu_data->short_idles);
164176

165-
if (cpu_data->artificial_wakeup) {
177+
if (dev->poll_time_limit) {
178+
dev->poll_time_limit = false;
166179
/*
167-
* If one of the safety nets has triggered, assume that this
180+
* Polling state timeout has triggered, so assume that this
168181
* might have been a long sleep.
169182
*/
170-
measured_ns = U64_MAX;
183+
measured_ns = S64_MAX;
171184
} else {
172-
u64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns;
185+
s64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns;
173186

174187
measured_ns = dev->last_residency_ns;
175188
/*
@@ -196,8 +209,10 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
196209
for (i = 0; i < drv->state_count; i++) {
197210
struct teo_bin *bin = &cpu_data->state_bins[i];
198211

199-
bin->hits -= bin->hits >> DECAY_SHIFT;
200-
bin->intercepts -= bin->intercepts >> DECAY_SHIFT;
212+
teo_decay(&bin->hits);
213+
total += bin->hits;
214+
teo_decay(&bin->intercepts);
215+
total += bin->intercepts;
201216

202217
target_residency_ns = drv->states[i].target_residency_ns;
203218

@@ -208,7 +223,24 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
208223
}
209224
}
210225

211-
cpu_data->tick_intercepts -= cpu_data->tick_intercepts >> DECAY_SHIFT;
226+
cpu_data->total = total + PULSE;
227+
228+
teo_decay(&cpu_data->tick_intercepts);
229+
230+
teo_decay(&cpu_data->total_tick);
231+
if (cpu_data->tick_wakeup) {
232+
cpu_data->total_tick += PULSE;
233+
/*
234+
* If tick wakeups dominate the wakeup pattern, count this one
235+
* as a hit on the deepest available idle state to increase the
236+
* likelihood of stopping the tick.
237+
*/
238+
if (3 * cpu_data->total_tick > 2 * cpu_data->total) {
239+
cpu_data->state_bins[drv->state_count-1].hits += PULSE;
240+
return;
241+
}
242+
}
243+
212244
/*
213245
* If the measured idle duration falls into the same bin as the sleep
214246
* length, this is a "hit", so update the "hits" metric for that bin.
@@ -219,18 +251,9 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
219251
cpu_data->state_bins[idx_timer].hits += PULSE;
220252
} else {
221253
cpu_data->state_bins[idx_duration].intercepts += PULSE;
222-
if (TICK_NSEC <= measured_ns)
254+
if (measured_ns <= TICK_NSEC)
223255
cpu_data->tick_intercepts += PULSE;
224256
}
225-
226-
cpu_data->total -= cpu_data->total >> DECAY_SHIFT;
227-
cpu_data->total += PULSE;
228-
}
229-
230-
static bool teo_state_ok(int i, struct cpuidle_driver *drv)
231-
{
232-
return !tick_nohz_tick_stopped() ||
233-
drv->states[i].target_residency_ns >= TICK_NSEC;
234257
}
235258

236259
/**
@@ -239,17 +262,15 @@ static bool teo_state_ok(int i, struct cpuidle_driver *drv)
239262
* @dev: Target CPU.
240263
* @state_idx: Index of the capping idle state.
241264
* @duration_ns: Idle duration value to match.
242-
* @no_poll: Don't consider polling states.
243265
*/
244266
static int teo_find_shallower_state(struct cpuidle_driver *drv,
245267
struct cpuidle_device *dev, int state_idx,
246-
s64 duration_ns, bool no_poll)
268+
s64 duration_ns)
247269
{
248270
int i;
249271

250272
for (i = state_idx - 1; i >= 0; i--) {
251-
if (dev->states_usage[i].disable ||
252-
(no_poll && drv->states[i].flags & CPUIDLE_FLAG_POLLING))
273+
if (dev->states_usage[i].disable)
253274
continue;
254275

255276
state_idx = i;
@@ -268,7 +289,7 @@ static int teo_find_shallower_state(struct cpuidle_driver *drv,
268289
static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
269290
bool *stop_tick)
270291
{
271-
struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
292+
struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus);
272293
s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
273294
ktime_t delta_tick = TICK_NSEC / 2;
274295
unsigned int idx_intercept_sum = 0;
@@ -356,7 +377,18 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
356377
* better choice.
357378
*/
358379
if (2 * idx_intercept_sum > cpu_data->total - idx_hit_sum) {
359-
int first_suitable_idx = idx;
380+
int min_idx = idx0;
381+
382+
if (tick_nohz_tick_stopped()) {
383+
/*
384+
* Look for the shallowest idle state below the current
385+
* candidate one whose target residency is at least
386+
* equal to the tick period length.
387+
*/
388+
while (min_idx < idx &&
389+
drv->states[min_idx].target_residency_ns < TICK_NSEC)
390+
min_idx++;
391+
}
360392

361393
/*
362394
* Look for the deepest idle state whose target residency had
@@ -366,49 +398,14 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
366398
* Take the possible duration limitation present if the tick
367399
* has been stopped already into account.
368400
*/
369-
intercept_sum = 0;
370-
371-
for (i = idx - 1; i >= 0; i--) {
372-
struct teo_bin *bin = &cpu_data->state_bins[i];
373-
374-
intercept_sum += bin->intercepts;
375-
376-
if (2 * intercept_sum > idx_intercept_sum) {
377-
/*
378-
* Use the current state unless it is too
379-
* shallow or disabled, in which case take the
380-
* first enabled state that is deep enough.
381-
*/
382-
if (teo_state_ok(i, drv) &&
383-
!dev->states_usage[i].disable) {
384-
idx = i;
385-
break;
386-
}
387-
idx = first_suitable_idx;
388-
break;
389-
}
401+
for (i = idx - 1, intercept_sum = 0; i >= min_idx; i--) {
402+
intercept_sum += cpu_data->state_bins[i].intercepts;
390403

391404
if (dev->states_usage[i].disable)
392405
continue;
393406

394-
if (teo_state_ok(i, drv)) {
395-
/*
396-
* The current state is deep enough, but still
397-
* there may be a better one.
398-
*/
399-
first_suitable_idx = i;
400-
continue;
401-
}
402-
403-
/*
404-
* The current state is too shallow, so if no suitable
405-
* states other than the initial candidate have been
406-
* found, give up (the remaining states to check are
407-
* shallower still), but otherwise the first suitable
408-
* state other than the initial candidate may turn out
409-
* to be preferable.
410-
*/
411-
if (first_suitable_idx == idx)
407+
idx = i;
408+
if (2 * intercept_sum > idx_intercept_sum)
412409
break;
413410
}
414411
}
@@ -458,11 +455,8 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
458455
* If the closest expected timer is before the target residency of the
459456
* candidate state, a shallower one needs to be found.
460457
*/
461-
if (drv->states[idx].target_residency_ns > duration_ns) {
462-
i = teo_find_shallower_state(drv, dev, idx, duration_ns, false);
463-
if (teo_state_ok(i, drv))
464-
idx = i;
465-
}
458+
if (drv->states[idx].target_residency_ns > duration_ns)
459+
idx = teo_find_shallower_state(drv, dev, idx, duration_ns);
466460

467461
/*
468462
* If the selected state's target residency is below the tick length
@@ -490,7 +484,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
490484
*/
491485
if (idx > idx0 &&
492486
drv->states[idx].target_residency_ns > delta_tick)
493-
idx = teo_find_shallower_state(drv, dev, idx, delta_tick, false);
487+
idx = teo_find_shallower_state(drv, dev, idx, delta_tick);
494488

495489
out_tick:
496490
*stop_tick = false;
@@ -504,20 +498,11 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
504498
*/
505499
static void teo_reflect(struct cpuidle_device *dev, int state)
506500
{
507-
struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
501+
struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus);
502+
503+
cpu_data->tick_wakeup = tick_nohz_idle_got_tick();
508504

509505
dev->last_state_idx = state;
510-
if (dev->poll_time_limit ||
511-
(tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) {
512-
/*
513-
* The wakeup was not "genuine", but triggered by one of the
514-
* safety nets.
515-
*/
516-
dev->poll_time_limit = false;
517-
cpu_data->artificial_wakeup = true;
518-
} else {
519-
cpu_data->artificial_wakeup = false;
520-
}
521506
}
522507

523508
/**

drivers/cpuidle/poll_state.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,13 @@
44
*/
55

66
#include <linux/cpuidle.h>
7+
#include <linux/export.h>
8+
#include <linux/irqflags.h>
79
#include <linux/sched.h>
810
#include <linux/sched/clock.h>
911
#include <linux/sched/idle.h>
12+
#include <linux/sprintf.h>
13+
#include <linux/types.h>
1014

1115
#define POLL_IDLE_RELAX_COUNT 200
1216

0 commit comments

Comments
 (0)