Skip to content

Commit d82caa2

Browse files
lelloman authored and Peter Zijlstra
committed
sched/psi: Allow unprivileged polling of N*2s period
PSI offers 2 mechanisms to get information about a specific resource pressure. One is reading from /proc/pressure/<resource>, which gives average pressures aggregated every 2s. The other is creating a pollable fd for a specific resource and cgroup. The trigger creation requires CAP_SYS_RESOURCE, and gives the possibility to pick specific time window and threshold, spawing an RT thread to aggregate the data. Systemd would like to provide containers the option to monitor pressure on their own cgroup and sub-cgroups. For example, if systemd launches a container that itself then launches services, the container should have the ability to poll() for pressure in individual services. But neither the container nor the services are privileged. This patch implements a mechanism to allow unprivileged users to create pressure triggers. The difference with privileged triggers creation is that unprivileged ones must have a time window that's a multiple of 2s. This is so that we can avoid unrestricted spawning of rt threads, and use instead the same aggregation mechanism done for the averages, which runs independently of any triggers. Suggested-by: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Link: https://lore.kernel.org/r/20230330105418.77061-5-cerasuolodomenico@gmail.com
1 parent 4468fca commit d82caa2

5 files changed

Lines changed: 121 additions & 69 deletions

File tree

Documentation/accounting/psi.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,10 @@ prevent overly frequent polling. Max limit is chosen as a high enough number
105105
after which monitors are most likely not needed and psi averages can be used
106106
instead.
107107

108+
Unprivileged users can also create monitors, with the only limitation that the
109+
window size must be a multiple of 2s, in order to prevent excessive resource
110+
usage.
111+
108112
When activated, psi monitor stays active for at least the duration of one
109113
tracking window to avoid repeated activations/deactivations when system is
110114
bouncing in and out of the stall state.

include/linux/psi.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ void psi_memstall_leave(unsigned long *flags);
2424

2525
int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
2626
struct psi_trigger *psi_trigger_create(struct psi_group *group,
27-
char *buf, enum psi_res res);
27+
char *buf, enum psi_res res, struct file *file);
2828
void psi_trigger_destroy(struct psi_trigger *t);
2929

3030
__poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,

include/linux/psi_types.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,9 @@ struct psi_trigger {
151151

152152
/* Deferred event(s) from previous ratelimit window */
153153
bool pending_event;
154+
155+
/* Trigger type - PSI_AVGS for unprivileged, PSI_POLL for RT */
156+
enum psi_aggregators aggregator;
154157
};
155158

156159
struct psi_group {
@@ -171,6 +174,10 @@ struct psi_group {
171174
/* Aggregator work control */
172175
struct delayed_work avgs_work;
173176

177+
/* Unprivileged triggers against N*PSI_FREQ windows */
178+
struct list_head avg_triggers;
179+
u32 avg_nr_triggers[NR_PSI_STATES - 1];
180+
174181
/* Total stall times and sampled pressure averages */
175182
u64 total[NR_PSI_AGGREGATORS][NR_PSI_STATES - 1];
176183
unsigned long avg[NR_PSI_STATES - 1][3];

kernel/cgroup/cgroup.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3761,7 +3761,7 @@ static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
37613761
}
37623762

37633763
psi = cgroup_psi(cgrp);
3764-
new = psi_trigger_create(psi, buf, res);
3764+
new = psi_trigger_create(psi, buf, res, of->file);
37653765
if (IS_ERR(new)) {
37663766
cgroup_put(cgrp);
37673767
return PTR_ERR(new);

kernel/sched/psi.c

Lines changed: 108 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -186,9 +186,14 @@ static void group_init(struct psi_group *group)
186186
seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
187187
group->avg_last_update = sched_clock();
188188
group->avg_next_update = group->avg_last_update + psi_period;
189-
INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
190189
mutex_init(&group->avgs_lock);
191-
/* Init trigger-related members */
190+
191+
/* Init avg trigger-related members */
192+
INIT_LIST_HEAD(&group->avg_triggers);
193+
memset(group->avg_nr_triggers, 0, sizeof(group->avg_nr_triggers));
194+
INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
195+
196+
/* Init rtpoll trigger-related members */
192197
atomic_set(&group->rtpoll_scheduled, 0);
193198
mutex_init(&group->rtpoll_trigger_lock);
194199
INIT_LIST_HEAD(&group->rtpoll_triggers);
@@ -430,21 +435,32 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value)
430435
return growth;
431436
}
432437

433-
static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total)
438+
static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total,
439+
enum psi_aggregators aggregator)
434440
{
435441
struct psi_trigger *t;
436-
u64 *total = group->total[PSI_POLL];
442+
u64 *total = group->total[aggregator];
443+
struct list_head *triggers;
444+
u64 *aggregator_total;
437445
*update_total = false;
438446

447+
if (aggregator == PSI_AVGS) {
448+
triggers = &group->avg_triggers;
449+
aggregator_total = group->avg_total;
450+
} else {
451+
triggers = &group->rtpoll_triggers;
452+
aggregator_total = group->rtpoll_total;
453+
}
454+
439455
/*
440456
* On subsequent updates, calculate growth deltas and let
441457
* watchers know when their specified thresholds are exceeded.
442458
*/
443-
list_for_each_entry(t, &group->rtpoll_triggers, node) {
459+
list_for_each_entry(t, triggers, node) {
444460
u64 growth;
445461
bool new_stall;
446462

447-
new_stall = group->rtpoll_total[t->state] != total[t->state];
463+
new_stall = aggregator_total[t->state] != total[t->state];
448464

449465
/* Check for stall activity or a previous threshold breach */
450466
if (!new_stall && !t->pending_event)
@@ -546,6 +562,7 @@ static void psi_avgs_work(struct work_struct *work)
546562
struct delayed_work *dwork;
547563
struct psi_group *group;
548564
u32 changed_states;
565+
bool update_total;
549566
u64 now;
550567

551568
dwork = to_delayed_work(work);
@@ -563,8 +580,10 @@ static void psi_avgs_work(struct work_struct *work)
563580
* Once restarted, we'll catch up the running averages in one
564581
* go - see calc_avgs() and missed_periods.
565582
*/
566-
if (now >= group->avg_next_update)
583+
if (now >= group->avg_next_update) {
584+
update_triggers(group, now, &update_total, PSI_AVGS);
567585
group->avg_next_update = update_averages(group, now);
586+
}
568587

569588
if (changed_states & PSI_STATE_RESCHEDULE) {
570589
schedule_delayed_work(dwork, nsecs_to_jiffies(
@@ -574,7 +593,7 @@ static void psi_avgs_work(struct work_struct *work)
574593
mutex_unlock(&group->avgs_lock);
575594
}
576595

577-
static void init_triggers(struct psi_group *group, u64 now)
596+
static void init_rtpoll_triggers(struct psi_group *group, u64 now)
578597
{
579598
struct psi_trigger *t;
580599

@@ -667,7 +686,7 @@ static void psi_rtpoll_work(struct psi_group *group)
667686
if (changed_states & group->rtpoll_states) {
668687
/* Initialize trigger windows when entering polling mode */
669688
if (now > group->rtpoll_until)
670-
init_triggers(group, now);
689+
init_rtpoll_triggers(group, now);
671690

672691
/*
673692
* Keep the monitor active for at least the duration of the
@@ -684,7 +703,7 @@ static void psi_rtpoll_work(struct psi_group *group)
684703
}
685704

686705
if (now >= group->rtpoll_next_update) {
687-
group->rtpoll_next_update = update_triggers(group, now, &update_total);
706+
group->rtpoll_next_update = update_triggers(group, now, &update_total, PSI_POLL);
688707
if (update_total)
689708
memcpy(group->rtpoll_total, group->total[PSI_POLL],
690709
sizeof(group->rtpoll_total));
@@ -1254,16 +1273,23 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
12541273
}
12551274

12561275
struct psi_trigger *psi_trigger_create(struct psi_group *group,
1257-
char *buf, enum psi_res res)
1276+
char *buf, enum psi_res res, struct file *file)
12581277
{
12591278
struct psi_trigger *t;
12601279
enum psi_states state;
12611280
u32 threshold_us;
1281+
bool privileged;
12621282
u32 window_us;
12631283

12641284
if (static_branch_likely(&psi_disabled))
12651285
return ERR_PTR(-EOPNOTSUPP);
12661286

1287+
/*
1288+
* Checking the privilege here on file->f_cred implies that a privileged user
1289+
* could open the file and delegate the write to an unprivileged one.
1290+
*/
1291+
privileged = cap_raised(file->f_cred->cap_effective, CAP_SYS_RESOURCE);
1292+
12671293
if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2)
12681294
state = PSI_IO_SOME + res * 2;
12691295
else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2)
@@ -1283,6 +1309,13 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
12831309
window_us > WINDOW_MAX_US)
12841310
return ERR_PTR(-EINVAL);
12851311

1312+
/*
1313+
* Unprivileged users can only use 2s windows so that averages aggregation
1314+
* work is used, and no RT threads need to be spawned.
1315+
*/
1316+
if (!privileged && window_us % 2000000)
1317+
return ERR_PTR(-EINVAL);
1318+
12861319
/* Check threshold */
12871320
if (threshold_us == 0 || threshold_us > window_us)
12881321
return ERR_PTR(-EINVAL);
@@ -1302,31 +1335,40 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
13021335
t->last_event_time = 0;
13031336
init_waitqueue_head(&t->event_wait);
13041337
t->pending_event = false;
1338+
t->aggregator = privileged ? PSI_POLL : PSI_AVGS;
13051339

1306-
mutex_lock(&group->rtpoll_trigger_lock);
1340+
if (privileged) {
1341+
mutex_lock(&group->rtpoll_trigger_lock);
13071342

1308-
if (!rcu_access_pointer(group->rtpoll_task)) {
1309-
struct task_struct *task;
1343+
if (!rcu_access_pointer(group->rtpoll_task)) {
1344+
struct task_struct *task;
13101345

1311-
task = kthread_create(psi_rtpoll_worker, group, "psimon");
1312-
if (IS_ERR(task)) {
1313-
kfree(t);
1314-
mutex_unlock(&group->rtpoll_trigger_lock);
1315-
return ERR_CAST(task);
1346+
task = kthread_create(psi_rtpoll_worker, group, "psimon");
1347+
if (IS_ERR(task)) {
1348+
kfree(t);
1349+
mutex_unlock(&group->rtpoll_trigger_lock);
1350+
return ERR_CAST(task);
1351+
}
1352+
atomic_set(&group->rtpoll_wakeup, 0);
1353+
wake_up_process(task);
1354+
rcu_assign_pointer(group->rtpoll_task, task);
13161355
}
1317-
atomic_set(&group->rtpoll_wakeup, 0);
1318-
wake_up_process(task);
1319-
rcu_assign_pointer(group->rtpoll_task, task);
1320-
}
13211356

1322-
list_add(&t->node, &group->rtpoll_triggers);
1323-
group->rtpoll_min_period = min(group->rtpoll_min_period,
1324-
div_u64(t->win.size, UPDATES_PER_WINDOW));
1325-
group->rtpoll_nr_triggers[t->state]++;
1326-
group->rtpoll_states |= (1 << t->state);
1357+
list_add(&t->node, &group->rtpoll_triggers);
1358+
group->rtpoll_min_period = min(group->rtpoll_min_period,
1359+
div_u64(t->win.size, UPDATES_PER_WINDOW));
1360+
group->rtpoll_nr_triggers[t->state]++;
1361+
group->rtpoll_states |= (1 << t->state);
13271362

1328-
mutex_unlock(&group->rtpoll_trigger_lock);
1363+
mutex_unlock(&group->rtpoll_trigger_lock);
1364+
} else {
1365+
mutex_lock(&group->avgs_lock);
1366+
1367+
list_add(&t->node, &group->avg_triggers);
1368+
group->avg_nr_triggers[t->state]++;
13291369

1370+
mutex_unlock(&group->avgs_lock);
1371+
}
13301372
return t;
13311373
}
13321374

@@ -1350,34 +1392,41 @@ void psi_trigger_destroy(struct psi_trigger *t)
13501392
*/
13511393
wake_up_pollfree(&t->event_wait);
13521394

1353-
mutex_lock(&group->rtpoll_trigger_lock);
1354-
1355-
if (!list_empty(&t->node)) {
1356-
struct psi_trigger *tmp;
1357-
u64 period = ULLONG_MAX;
1358-
1359-
list_del(&t->node);
1360-
group->rtpoll_nr_triggers[t->state]--;
1361-
if (!group->rtpoll_nr_triggers[t->state])
1362-
group->rtpoll_states &= ~(1 << t->state);
1363-
/* reset min update period for the remaining triggers */
1364-
list_for_each_entry(tmp, &group->rtpoll_triggers, node)
1365-
period = min(period, div_u64(tmp->win.size,
1366-
UPDATES_PER_WINDOW));
1367-
group->rtpoll_min_period = period;
1368-
/* Destroy rtpoll_task when the last trigger is destroyed */
1369-
if (group->rtpoll_states == 0) {
1370-
group->rtpoll_until = 0;
1371-
task_to_destroy = rcu_dereference_protected(
1372-
group->rtpoll_task,
1373-
lockdep_is_held(&group->rtpoll_trigger_lock));
1374-
rcu_assign_pointer(group->rtpoll_task, NULL);
1375-
del_timer(&group->rtpoll_timer);
1395+
if (t->aggregator == PSI_AVGS) {
1396+
mutex_lock(&group->avgs_lock);
1397+
if (!list_empty(&t->node)) {
1398+
list_del(&t->node);
1399+
group->avg_nr_triggers[t->state]--;
13761400
}
1401+
mutex_unlock(&group->avgs_lock);
1402+
} else {
1403+
mutex_lock(&group->rtpoll_trigger_lock);
1404+
if (!list_empty(&t->node)) {
1405+
struct psi_trigger *tmp;
1406+
u64 period = ULLONG_MAX;
1407+
1408+
list_del(&t->node);
1409+
group->rtpoll_nr_triggers[t->state]--;
1410+
if (!group->rtpoll_nr_triggers[t->state])
1411+
group->rtpoll_states &= ~(1 << t->state);
1412+
/* reset min update period for the remaining triggers */
1413+
list_for_each_entry(tmp, &group->rtpoll_triggers, node)
1414+
period = min(period, div_u64(tmp->win.size,
1415+
UPDATES_PER_WINDOW));
1416+
group->rtpoll_min_period = period;
1417+
/* Destroy rtpoll_task when the last trigger is destroyed */
1418+
if (group->rtpoll_states == 0) {
1419+
group->rtpoll_until = 0;
1420+
task_to_destroy = rcu_dereference_protected(
1421+
group->rtpoll_task,
1422+
lockdep_is_held(&group->rtpoll_trigger_lock));
1423+
rcu_assign_pointer(group->rtpoll_task, NULL);
1424+
del_timer(&group->rtpoll_timer);
1425+
}
1426+
}
1427+
mutex_unlock(&group->rtpoll_trigger_lock);
13771428
}
13781429

1379-
mutex_unlock(&group->rtpoll_trigger_lock);
1380-
13811430
/*
13821431
* Wait for psi_schedule_rtpoll_work RCU to complete its read-side
13831432
* critical section before destroying the trigger and optionally the
@@ -1437,27 +1486,19 @@ static int psi_cpu_show(struct seq_file *m, void *v)
14371486
return psi_show(m, &psi_system, PSI_CPU);
14381487
}
14391488

1440-
static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *))
1441-
{
1442-
if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
1443-
return -EPERM;
1444-
1445-
return single_open(file, psi_show, NULL);
1446-
}
1447-
14481489
static int psi_io_open(struct inode *inode, struct file *file)
14491490
{
1450-
return psi_open(file, psi_io_show);
1491+
return single_open(file, psi_io_show, NULL);
14511492
}
14521493

14531494
static int psi_memory_open(struct inode *inode, struct file *file)
14541495
{
1455-
return psi_open(file, psi_memory_show);
1496+
return single_open(file, psi_memory_show, NULL);
14561497
}
14571498

14581499
static int psi_cpu_open(struct inode *inode, struct file *file)
14591500
{
1460-
return psi_open(file, psi_cpu_show);
1501+
return single_open(file, psi_cpu_show, NULL);
14611502
}
14621503

14631504
static ssize_t psi_write(struct file *file, const char __user *user_buf,
@@ -1491,7 +1532,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
14911532
return -EBUSY;
14921533
}
14931534

1494-
new = psi_trigger_create(&psi_system, buf, res);
1535+
new = psi_trigger_create(&psi_system, buf, res, file);
14951536
if (IS_ERR(new)) {
14961537
mutex_unlock(&seq->lock);
14971538
return PTR_ERR(new);
@@ -1571,7 +1612,7 @@ static int psi_irq_show(struct seq_file *m, void *v)
15711612

15721613
static int psi_irq_open(struct inode *inode, struct file *file)
15731614
{
1574-
return psi_open(file, psi_irq_show);
1615+
return single_open(file, psi_irq_show, NULL);
15751616
}
15761617

15771618
static ssize_t psi_irq_write(struct file *file, const char __user *user_buf,

0 commit comments

Comments
 (0)