Skip to content

Commit 3323ddc

Browse files
committed
Merge tag 'v6.4/kernel.user_worker' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux
Pull user work thread updates from Christian Brauner: "This contains the work generalizing the ability to create a kernel worker from a userspace process. Such user workers will run with the same credentials as the userspace process they were created from providing stronger security and accounting guarantees than the traditional override_creds() approach ever could've hoped for. The original work was heavily based on and optimized for the needs of io_uring which was the first user. However, as it quickly turned out the ability to create user workers inheriting properties from a userspace process is generally useful. The vhost subsystem currently creates workers using the kthread api. The consequences of using the kthread api are that RLIMITs don't work correctly as they are inherited from kthreadd. This leads to bugs where more workers are created than would be allowed by the RLIMITs of the userspace process on behalf of which workers are created. Problems like this disappear with user workers created from the userspace processes for which they perform the work. In addition, providing this api allows vhost to remove additional complexity. For example, cgroup and mm sharing will just work out of the box with user workers based on the relevant userspace process instead of manually ensuring the correct cgroup and mm contexts are used. So the vhost subsystem should simply be made to use the same mechanism as io_uring. To this end the original mechanism used for create_io_thread() is generalized into user workers: - Introduce PF_USER_WORKER as a generic indicator that a given task is a user worker, i.e., a kernel task that was created from a userspace process. Now a PF_IO_WORKER thread is just a specialized version of PF_USER_WORKER. So io_uring io workers raise both flags. 
- Make copy_process() available to core kernel code - Extend struct kernel_clone_args with the following bitfields allowing to indicate to copy_process(): - to create a user worker (raise PF_USER_WORKER) - to not inherit any files from the userspace process - to ignore signals After all generic changes are in place the vhost subsystem implements a new dedicated vhost api based on user workers. Finally, vhost is switched to rely on the new api moving it off of kthreads. Thanks to Mike for sticking it out and making it through this rather arduous journey" * tag 'v6.4/kernel.user_worker' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux: vhost: use vhost_tasks for worker threads vhost: move worker thread fields to new struct vhost_task: Allow vhost layer to use copy_process fork: allow kernel code to call copy_process fork: Add kernel_clone_args flag to ignore signals fork: add kernel_clone_args flag to not dup/clone files fork/vm: Move common PF_IO_WORKER behavior to new flag kernel: Make io_thread and kthread bit fields kthread: Pass in the thread's name during creation kernel: Allow a kernel thread's name to be set in copy_process csky: Remove kernel_thread declaration
2 parents a632b76 + 6e890c5 commit 3323ddc

14 files changed

Lines changed: 263 additions & 101 deletions

File tree

MAINTAINERS

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22177,7 +22177,9 @@ L: virtualization@lists.linux-foundation.org
2217722177
L: netdev@vger.kernel.org
2217822178
S: Maintained
2217922179
T: git git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git
22180+
F: kernel/vhost_task.c
2218022181
F: drivers/vhost/
22182+
F: include/linux/sched/vhost_task.h
2218122183
F: include/linux/vhost_iotlb.h
2218222184
F: include/uapi/linux/vhost.h
2218322185

arch/csky/include/asm/processor.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,6 @@ struct task_struct;
7272
/* Prepare to copy thread state - unlazy all lazy status */
7373
#define prepare_to_copy(tsk) do { } while (0)
7474

75-
extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
76-
7775
unsigned long __get_wchan(struct task_struct *p);
7876

7977
#define KSTK_EIP(tsk) (task_pt_regs(tsk)->pc)

drivers/vhost/Kconfig

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,14 @@ config VHOST_RING
1313
This option is selected by any driver which needs to access
1414
the host side of a virtio ring.
1515

16+
config VHOST_TASK
17+
bool
18+
default n
19+
1620
config VHOST
1721
tristate
1822
select VHOST_IOTLB
23+
select VHOST_TASK
1924
help
2025
This option is selected by any driver which needs to access
2126
the core of vhost.

drivers/vhost/vhost.c

Lines changed: 60 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,11 @@
2222
#include <linux/slab.h>
2323
#include <linux/vmalloc.h>
2424
#include <linux/kthread.h>
25-
#include <linux/cgroup.h>
2625
#include <linux/module.h>
2726
#include <linux/sort.h>
2827
#include <linux/sched/mm.h>
2928
#include <linux/sched/signal.h>
29+
#include <linux/sched/vhost_task.h>
3030
#include <linux/interval_tree_generic.h>
3131
#include <linux/nospec.h>
3232
#include <linux/kcov.h>
@@ -255,16 +255,16 @@ void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
255255
* sure it was not in the list.
256256
* test_and_set_bit() implies a memory barrier.
257257
*/
258-
llist_add(&work->node, &dev->work_list);
259-
wake_up_process(dev->worker);
258+
llist_add(&work->node, &dev->worker->work_list);
259+
wake_up_process(dev->worker->vtsk->task);
260260
}
261261
}
262262
EXPORT_SYMBOL_GPL(vhost_work_queue);
263263

264264
/* A lockless hint for busy polling code to exit the loop */
265265
bool vhost_has_work(struct vhost_dev *dev)
266266
{
267-
return !llist_empty(&dev->work_list);
267+
return dev->worker && !llist_empty(&dev->worker->work_list);
268268
}
269269
EXPORT_SYMBOL_GPL(vhost_has_work);
270270

@@ -335,22 +335,20 @@ static void vhost_vq_reset(struct vhost_dev *dev,
335335

336336
static int vhost_worker(void *data)
337337
{
338-
struct vhost_dev *dev = data;
338+
struct vhost_worker *worker = data;
339339
struct vhost_work *work, *work_next;
340340
struct llist_node *node;
341341

342-
kthread_use_mm(dev->mm);
343-
344342
for (;;) {
345343
/* mb paired w/ kthread_stop */
346344
set_current_state(TASK_INTERRUPTIBLE);
347345

348-
if (kthread_should_stop()) {
346+
if (vhost_task_should_stop(worker->vtsk)) {
349347
__set_current_state(TASK_RUNNING);
350348
break;
351349
}
352350

353-
node = llist_del_all(&dev->work_list);
351+
node = llist_del_all(&worker->work_list);
354352
if (!node)
355353
schedule();
356354

@@ -360,14 +358,14 @@ static int vhost_worker(void *data)
360358
llist_for_each_entry_safe(work, work_next, node, node) {
361359
clear_bit(VHOST_WORK_QUEUED, &work->flags);
362360
__set_current_state(TASK_RUNNING);
363-
kcov_remote_start_common(dev->kcov_handle);
361+
kcov_remote_start_common(worker->kcov_handle);
364362
work->fn(work);
365363
kcov_remote_stop();
366364
if (need_resched())
367365
schedule();
368366
}
369367
}
370-
kthread_unuse_mm(dev->mm);
368+
371369
return 0;
372370
}
373371

@@ -479,7 +477,6 @@ void vhost_dev_init(struct vhost_dev *dev,
479477
dev->byte_weight = byte_weight;
480478
dev->use_worker = use_worker;
481479
dev->msg_handler = msg_handler;
482-
init_llist_head(&dev->work_list);
483480
init_waitqueue_head(&dev->wait);
484481
INIT_LIST_HEAD(&dev->read_list);
485482
INIT_LIST_HEAD(&dev->pending_list);
@@ -509,31 +506,6 @@ long vhost_dev_check_owner(struct vhost_dev *dev)
509506
}
510507
EXPORT_SYMBOL_GPL(vhost_dev_check_owner);
511508

512-
struct vhost_attach_cgroups_struct {
513-
struct vhost_work work;
514-
struct task_struct *owner;
515-
int ret;
516-
};
517-
518-
static void vhost_attach_cgroups_work(struct vhost_work *work)
519-
{
520-
struct vhost_attach_cgroups_struct *s;
521-
522-
s = container_of(work, struct vhost_attach_cgroups_struct, work);
523-
s->ret = cgroup_attach_task_all(s->owner, current);
524-
}
525-
526-
static int vhost_attach_cgroups(struct vhost_dev *dev)
527-
{
528-
struct vhost_attach_cgroups_struct attach;
529-
530-
attach.owner = current;
531-
vhost_work_init(&attach.work, vhost_attach_cgroups_work);
532-
vhost_work_queue(dev, &attach.work);
533-
vhost_dev_flush(dev);
534-
return attach.ret;
535-
}
536-
537509
/* Caller should have device mutex */
538510
bool vhost_dev_has_owner(struct vhost_dev *dev)
539511
{
@@ -571,10 +543,54 @@ static void vhost_detach_mm(struct vhost_dev *dev)
571543
dev->mm = NULL;
572544
}
573545

546+
static void vhost_worker_free(struct vhost_dev *dev)
547+
{
548+
struct vhost_worker *worker = dev->worker;
549+
550+
if (!worker)
551+
return;
552+
553+
dev->worker = NULL;
554+
WARN_ON(!llist_empty(&worker->work_list));
555+
vhost_task_stop(worker->vtsk);
556+
kfree(worker);
557+
}
558+
559+
static int vhost_worker_create(struct vhost_dev *dev)
560+
{
561+
struct vhost_worker *worker;
562+
struct vhost_task *vtsk;
563+
char name[TASK_COMM_LEN];
564+
int ret;
565+
566+
worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
567+
if (!worker)
568+
return -ENOMEM;
569+
570+
dev->worker = worker;
571+
worker->kcov_handle = kcov_common_handle();
572+
init_llist_head(&worker->work_list);
573+
snprintf(name, sizeof(name), "vhost-%d", current->pid);
574+
575+
vtsk = vhost_task_create(vhost_worker, worker, name);
576+
if (!vtsk) {
577+
ret = -ENOMEM;
578+
goto free_worker;
579+
}
580+
581+
worker->vtsk = vtsk;
582+
vhost_task_start(vtsk);
583+
return 0;
584+
585+
free_worker:
586+
kfree(worker);
587+
dev->worker = NULL;
588+
return ret;
589+
}
590+
574591
/* Caller should have device mutex */
575592
long vhost_dev_set_owner(struct vhost_dev *dev)
576593
{
577-
struct task_struct *worker;
578594
int err;
579595

580596
/* Is there an owner already? */
@@ -585,36 +601,21 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
585601

586602
vhost_attach_mm(dev);
587603

588-
dev->kcov_handle = kcov_common_handle();
589604
if (dev->use_worker) {
590-
worker = kthread_create(vhost_worker, dev,
591-
"vhost-%d", current->pid);
592-
if (IS_ERR(worker)) {
593-
err = PTR_ERR(worker);
594-
goto err_worker;
595-
}
596-
597-
dev->worker = worker;
598-
wake_up_process(worker); /* avoid contributing to loadavg */
599-
600-
err = vhost_attach_cgroups(dev);
605+
err = vhost_worker_create(dev);
601606
if (err)
602-
goto err_cgroup;
607+
goto err_worker;
603608
}
604609

605610
err = vhost_dev_alloc_iovecs(dev);
606611
if (err)
607-
goto err_cgroup;
612+
goto err_iovecs;
608613

609614
return 0;
610-
err_cgroup:
611-
if (dev->worker) {
612-
kthread_stop(dev->worker);
613-
dev->worker = NULL;
614-
}
615+
err_iovecs:
616+
vhost_worker_free(dev);
615617
err_worker:
616618
vhost_detach_mm(dev);
617-
dev->kcov_handle = 0;
618619
err_mm:
619620
return err;
620621
}
@@ -705,12 +706,7 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
705706
dev->iotlb = NULL;
706707
vhost_clear_msg(dev);
707708
wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
708-
WARN_ON(!llist_empty(&dev->work_list));
709-
if (dev->worker) {
710-
kthread_stop(dev->worker);
711-
dev->worker = NULL;
712-
dev->kcov_handle = 0;
713-
}
709+
vhost_worker_free(dev);
714710
vhost_detach_mm(dev);
715711
}
716712
EXPORT_SYMBOL_GPL(vhost_dev_cleanup);

drivers/vhost/vhost.h

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include <linux/irqbypass.h>
1717

1818
struct vhost_work;
19+
struct vhost_task;
1920
typedef void (*vhost_work_fn_t)(struct vhost_work *work);
2021

2122
#define VHOST_WORK_QUEUED 1
@@ -25,6 +26,12 @@ struct vhost_work {
2526
unsigned long flags;
2627
};
2728

29+
struct vhost_worker {
30+
struct vhost_task *vtsk;
31+
struct llist_head work_list;
32+
u64 kcov_handle;
33+
};
34+
2835
/* Poll a file (eventfd or socket) */
2936
/* Note: there's nothing vhost specific about this structure. */
3037
struct vhost_poll {
@@ -147,8 +154,7 @@ struct vhost_dev {
147154
struct vhost_virtqueue **vqs;
148155
int nvqs;
149156
struct eventfd_ctx *log_ctx;
150-
struct llist_head work_list;
151-
struct task_struct *worker;
157+
struct vhost_worker *worker;
152158
struct vhost_iotlb *umem;
153159
struct vhost_iotlb *iotlb;
154160
spinlock_t iotlb_lock;
@@ -158,7 +164,6 @@ struct vhost_dev {
158164
int iov_limit;
159165
int weight;
160166
int byte_weight;
161-
u64 kcov_handle;
162167
bool use_worker;
163168
int (*msg_handler)(struct vhost_dev *dev, u32 asid,
164169
struct vhost_iotlb_msg *msg);

include/linux/sched.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1729,7 +1729,7 @@ extern struct pid *cad_pid;
17291729
#define PF_MEMALLOC 0x00000800 /* Allocating memory */
17301730
#define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */
17311731
#define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */
1732-
#define PF__HOLE__00004000 0x00004000
1732+
#define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */
17331733
#define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */
17341734
#define PF__HOLE__00010000 0x00010000
17351735
#define PF_KSWAPD 0x00020000 /* I am kswapd */

include/linux/sched/task.h

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,16 +23,20 @@ struct kernel_clone_args {
2323
int __user *pidfd;
2424
int __user *child_tid;
2525
int __user *parent_tid;
26+
const char *name;
2627
int exit_signal;
28+
u32 kthread:1;
29+
u32 io_thread:1;
30+
u32 user_worker:1;
31+
u32 no_files:1;
32+
u32 ignore_signals:1;
2733
unsigned long stack;
2834
unsigned long stack_size;
2935
unsigned long tls;
3036
pid_t *set_tid;
3137
/* Number of elements in *set_tid */
3238
size_t set_tid_size;
3339
int cgroup;
34-
int io_thread;
35-
int kthread;
3640
int idle;
3741
int (*fn)(void *);
3842
void *fn_arg;
@@ -89,9 +93,12 @@ extern void exit_files(struct task_struct *);
8993
extern void exit_itimers(struct task_struct *);
9094

9195
extern pid_t kernel_clone(struct kernel_clone_args *kargs);
96+
struct task_struct *copy_process(struct pid *pid, int trace, int node,
97+
struct kernel_clone_args *args);
9298
struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node);
9399
struct task_struct *fork_idle(int);
94-
extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
100+
extern pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
101+
unsigned long flags);
95102
extern pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags);
96103
extern long kernel_wait4(pid_t, int __user *, int, struct rusage *);
97104
int kernel_wait(pid_t pid, int *stat);

include/linux/sched/vhost_task.h

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
/* SPDX-License-Identifier: GPL-2.0 */
2+
#ifndef _LINUX_VHOST_TASK_H
3+
#define _LINUX_VHOST_TASK_H
4+
5+
#include <linux/completion.h>
6+
7+
struct task_struct;
8+
9+
struct vhost_task {
10+
int (*fn)(void *data);
11+
void *data;
12+
struct completion exited;
13+
unsigned long flags;
14+
struct task_struct *task;
15+
};
16+
17+
struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg,
18+
const char *name);
19+
void vhost_task_start(struct vhost_task *vtsk);
20+
void vhost_task_stop(struct vhost_task *vtsk);
21+
bool vhost_task_should_stop(struct vhost_task *vtsk);
22+
23+
#endif

init/main.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -711,7 +711,7 @@ noinline void __ref rest_init(void)
711711
rcu_read_unlock();
712712

713713
numa_default_policy();
714-
pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
714+
pid = kernel_thread(kthreadd, NULL, NULL, CLONE_FS | CLONE_FILES);
715715
rcu_read_lock();
716716
kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
717717
rcu_read_unlock();

kernel/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ obj-y = fork.o exec_domain.o panic.o \
1515
obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o
1616
obj-$(CONFIG_MODULES) += kmod.o
1717
obj-$(CONFIG_MULTIUSER) += groups.o
18+
obj-$(CONFIG_VHOST_TASK) += vhost_task.o
1819

1920
ifdef CONFIG_FUNCTION_TRACER
2021
# Do not trace internal ftrace files

0 commit comments

Comments
 (0)