Skip to content

Commit 0acefef

Browse files
committed
Merge tag 'threads-v5.5' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux
Pull thread management updates from Christian Brauner: - A pidfd's fdinfo file currently contains the field "Pid:\t<pid>" where <pid> is the pid of the process in the pid namespace of the procfs instance the fdinfo file for the pidfd was opened in. The fdinfo file has now gained a new "NSpid:\t<ns-pid1>[\t<ns-pid2>[...]]" field which lists the pids of the process in all child pid namespaces provided the pid namespace of the procfs instance it is looked up under has an ancestral relationship with the pid namespace of the process. If it does not, 0 will be shown and no further pid namespaces will be listed. Tests included. (Christian Kellner) - If the process the pidfd references has already exited, print -1 for the Pid and NSpid fields in the pidfd's fdinfo file. Tests included. (me) - Add CLONE_CLEAR_SIGHAND. This lets callers clear all signal handlers that are not SIG_DFL or SIG_IGN at process creation time. This originated as a feature request from glibc to improve performance and eliminate races in their posix_spawn() implementation. Tests included. (me) - Add support for choosing a specific pid for a process with clone3(). This is the feature which was part of the thread update for v5.4 but after a discussion at LPC in Lisbon we decided to delay it for one more cycle in order to make the interface more generic. This has now been done. It is now possible to choose a specific pid in a whole pid namespace (sub)hierarchy instead of just one pid namespace. In order to choose a specific pid the caller must have CAP_SYS_ADMIN in all owning user namespaces of the target pid namespaces. Tests included. (Adrian Reber) - Test improvements and extensions. 
(Andrei Vagin, me) * tag 'threads-v5.5' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux: selftests/clone3: skip if clone3() is ENOSYS selftests/clone3: check that all pids are released on error paths selftests/clone3: report a correct number of fails selftests/clone3: flush stdout and stderr before clone3() and _exit() selftests: add tests for clone3() with *set_tid fork: extend clone3() to support setting a PID selftests: add tests for clone3() tests: test CLONE_CLEAR_SIGHAND clone3: add CLONE_CLEAR_SIGHAND pid: use pid_has_task() in pidfd_open() exit: use pid_has_task() in do_wait() pid: use pid_has_task() in __change_pid() test: verify fdinfo for pidfd of reaped process pidfd: check pid has attached task in fdinfo pidfd: add tests for NSpid info in fdinfo pidfd: add NSpid entries to fdinfo
2 parents 9c91e6a + 11fde16 commit 0acefef

18 files changed

Lines changed: 1308 additions & 58 deletions

File tree

MAINTAINERS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12861,6 +12861,7 @@ S: Maintained
1286112861
T: git git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git
1286212862
F: samples/pidfd/
1286312863
F: tools/testing/selftests/pidfd/
12864+
F: tools/testing/selftests/clone3/
1286412865
K: (?i)pidfd
1286512866
K: (?i)clone3
1286612867
K: \b(clone_args|kernel_clone_args)\b

include/linux/pid.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,10 @@ static inline struct pid *get_pid(struct pid *pid)
8585

8686
extern void put_pid(struct pid *pid);
8787
extern struct task_struct *pid_task(struct pid *pid, enum pid_type);
88+
static inline bool pid_has_task(struct pid *pid, enum pid_type type)
89+
{
90+
return !hlist_empty(&pid->tasks[type]);
91+
}
8892
extern struct task_struct *get_pid_task(struct pid *pid, enum pid_type);
8993

9094
extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);
@@ -120,7 +124,8 @@ extern struct pid *find_vpid(int nr);
120124
extern struct pid *find_get_pid(int nr);
121125
extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
122126

123-
extern struct pid *alloc_pid(struct pid_namespace *ns);
127+
extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
128+
size_t set_tid_size);
124129
extern void free_pid(struct pid *pid);
125130
extern void disable_pid_allocation(struct pid_namespace *ns);
126131

include/linux/pid_namespace.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
#include <linux/ns_common.h>
1313
#include <linux/idr.h>
1414

15+
/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
16+
#define MAX_PID_NS_LEVEL 32
1517

1618
struct fs_pin;
1719

include/linux/sched/task.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@ struct kernel_clone_args {
2626
unsigned long stack;
2727
unsigned long stack_size;
2828
unsigned long tls;
29+
pid_t *set_tid;
30+
/* Number of elements in *set_tid */
31+
size_t set_tid_size;
2932
};
3033

3134
/*

include/uapi/linux/sched.h

Lines changed: 42 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -33,31 +33,48 @@
3333
#define CLONE_NEWNET 0x40000000 /* New network namespace */
3434
#define CLONE_IO 0x80000000 /* Clone io context */
3535

36+
/* Flags for the clone3() syscall. */
37+
#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
38+
3639
#ifndef __ASSEMBLY__
3740
/**
3841
* struct clone_args - arguments for the clone3 syscall
39-
* @flags: Flags for the new process as listed above.
40-
* All flags are valid except for CSIGNAL and
41-
* CLONE_DETACHED.
42-
* @pidfd: If CLONE_PIDFD is set, a pidfd will be
43-
* returned in this argument.
44-
* @child_tid: If CLONE_CHILD_SETTID is set, the TID of the
45-
* child process will be returned in the child's
46-
* memory.
47-
* @parent_tid: If CLONE_PARENT_SETTID is set, the TID of
48-
* the child process will be returned in the
49-
* parent's memory.
50-
* @exit_signal: The exit_signal the parent process will be
51-
* sent when the child exits.
52-
* @stack: Specify the location of the stack for the
53-
* child process.
54-
* Note, @stack is expected to point to the
55-
* lowest address. The stack direction will be
56-
* determined by the kernel and set up
57-
* appropriately based on @stack_size.
58-
* @stack_size: The size of the stack for the child process.
59-
* @tls: If CLONE_SETTLS is set, the tls descriptor
60-
* is set to tls.
42+
* @flags: Flags for the new process as listed above.
43+
* All flags are valid except for CSIGNAL and
44+
* CLONE_DETACHED.
45+
* @pidfd: If CLONE_PIDFD is set, a pidfd will be
46+
* returned in this argument.
47+
* @child_tid: If CLONE_CHILD_SETTID is set, the TID of the
48+
* child process will be returned in the child's
49+
* memory.
50+
* @parent_tid: If CLONE_PARENT_SETTID is set, the TID of
51+
* the child process will be returned in the
52+
* parent's memory.
53+
* @exit_signal: The exit_signal the parent process will be
54+
* sent when the child exits.
55+
* @stack: Specify the location of the stack for the
56+
* child process.
57+
* Note, @stack is expected to point to the
58+
* lowest address. The stack direction will be
59+
* determined by the kernel and set up
60+
* appropriately based on @stack_size.
61+
* @stack_size: The size of the stack for the child process.
62+
* @tls: If CLONE_SETTLS is set, the tls descriptor
63+
* is set to tls.
64+
* @set_tid: Pointer to an array of type *pid_t. The size
65+
* of the array is defined using @set_tid_size.
66+
* This array is used to select PIDs/TIDs for
67+
* newly created processes. The first element in
68+
* this defines the PID in the most nested PID
69+
* namespace. Each additional element in the array
70+
* defines the PID in the parent PID namespace of
71+
* the original PID namespace. If the array has
72+
* fewer entries than the number of currently
73+
* nested PID namespaces only the PIDs in the
74+
* corresponding namespaces are set.
75+
* @set_tid_size: This defines the size of the array referenced
76+
* in @set_tid. This cannot be larger than the
77+
* kernel's limit of nested PID namespaces.
6178
*
6279
* The structure is versioned by size and thus extensible.
6380
* New struct members must go at the end of the struct and
@@ -72,10 +89,13 @@ struct clone_args {
7289
__aligned_u64 stack;
7390
__aligned_u64 stack_size;
7491
__aligned_u64 tls;
92+
__aligned_u64 set_tid;
93+
__aligned_u64 set_tid_size;
7594
};
7695
#endif
7796

7897
#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
98+
#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
7999

80100
/*
81101
* Scheduling policies

kernel/exit.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1457,7 +1457,7 @@ static long do_wait(struct wait_opts *wo)
14571457
*/
14581458
wo->notask_error = -ECHILD;
14591459
if ((wo->wo_type < PIDTYPE_MAX) &&
1460-
(!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
1460+
(!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type)))
14611461
goto notask;
14621462

14631463
set_current_state(TASK_INTERRUPTIBLE);

kernel/fork.c

Lines changed: 92 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1517,6 +1517,11 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
15171517
spin_lock_irq(&current->sighand->siglock);
15181518
memcpy(sig->action, current->sighand->action, sizeof(sig->action));
15191519
spin_unlock_irq(&current->sighand->siglock);
1520+
1521+
/* Reset all signal handlers not set to SIG_IGN to SIG_DFL. */
1522+
if (clone_flags & CLONE_CLEAR_SIGHAND)
1523+
flush_signal_handlers(tsk, 0);
1524+
15201525
return 0;
15211526
}
15221527

@@ -1695,12 +1700,68 @@ static int pidfd_release(struct inode *inode, struct file *file)
16951700
}
16961701

16971702
#ifdef CONFIG_PROC_FS
1703+
/**
1704+
* pidfd_show_fdinfo - print information about a pidfd
1705+
* @m: proc fdinfo file
1706+
* @f: file referencing a pidfd
1707+
*
1708+
* Pid:
1709+
* This function will print the pid that a given pidfd refers to in the
1710+
* pid namespace of the procfs instance.
1711+
* If the pid namespace of the process is not a descendant of the pid
1712+
* namespace of the procfs instance 0 will be shown as its pid. This is
1713+
* similar to calling getppid() on a process whose parent is outside of
1714+
* its pid namespace.
1715+
*
1716+
* NSpid:
1717+
* If pid namespaces are supported then this function will also print
1718+
* the pid a given pidfd refers to in all descendant pid namespaces
1719+
* starting from the current pid namespace of the instance, i.e. the
1720+
* Pid field and the first entry in the NSpid field will be identical.
1721+
* If the pid namespace of the process is not a descendant of the pid
1722+
* namespace of the procfs instance 0 will be shown as its first NSpid
1723+
* entry and no others will be shown.
1724+
* Note that this differs from the Pid and NSpid fields in
1725+
* /proc/<pid>/status where Pid and NSpid are always shown relative to
1726+
* the pid namespace of the procfs instance. The difference becomes
1727+
* obvious when sending around a pidfd between pid namespaces from a
1728+
* different branch of the tree, i.e. where no ancestral relation is
1729+
* present between the pid namespaces:
1730+
* - create two new pid namespaces ns1 and ns2 in the initial pid
1731+
* namespace (also take care to create new mount namespaces in the
1732+
* new pid namespace and mount procfs)
1733+
* - create a process with a pidfd in ns1
1734+
* - send pidfd from ns1 to ns2
1735+
* - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
1736+
* have exactly one entry, which is 0
1737+
*/
16981738
static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
16991739
{
1700-
struct pid_namespace *ns = proc_pid_ns(file_inode(m->file));
17011740
struct pid *pid = f->private_data;
1741+
struct pid_namespace *ns;
1742+
pid_t nr = -1;
1743+
1744+
if (likely(pid_has_task(pid, PIDTYPE_PID))) {
1745+
ns = proc_pid_ns(file_inode(m->file));
1746+
nr = pid_nr_ns(pid, ns);
1747+
}
1748+
1749+
seq_put_decimal_ll(m, "Pid:\t", nr);
1750+
1751+
#ifdef CONFIG_PID_NS
1752+
seq_put_decimal_ll(m, "\nNSpid:\t", nr);
1753+
if (nr > 0) {
1754+
int i;
17021755

1703-
seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns));
1756+
/* If nr is non-zero it means that 'pid' is valid and that
1757+
* ns, i.e. the pid namespace associated with the procfs
1758+
* instance, is in the pid namespace hierarchy of pid.
1759+
* Start at one below the already printed level.
1760+
*/
1761+
for (i = ns->level + 1; i <= pid->level; i++)
1762+
seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
1763+
}
1764+
#endif
17041765
seq_putc(m, '\n');
17051766
}
17061767
#endif
@@ -2026,7 +2087,8 @@ static __latent_entropy struct task_struct *copy_process(
20262087
stackleak_task_init(p);
20272088

20282089
if (pid != &init_struct_pid) {
2029-
pid = alloc_pid(p->nsproxy->pid_ns_for_children);
2090+
pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
2091+
args->set_tid_size);
20302092
if (IS_ERR(pid)) {
20312093
retval = PTR_ERR(pid);
20322094
goto bad_fork_cleanup_thread;
@@ -2529,6 +2591,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
25292591
{
25302592
int err;
25312593
struct clone_args args;
2594+
pid_t *kset_tid = kargs->set_tid;
25322595

25332596
if (unlikely(usize > PAGE_SIZE))
25342597
return -E2BIG;
@@ -2539,6 +2602,15 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
25392602
if (err)
25402603
return err;
25412604

2605+
if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL))
2606+
return -EINVAL;
2607+
2608+
if (unlikely(!args.set_tid && args.set_tid_size > 0))
2609+
return -EINVAL;
2610+
2611+
if (unlikely(args.set_tid && args.set_tid_size == 0))
2612+
return -EINVAL;
2613+
25422614
/*
25432615
* Verify that higher 32bits of exit_signal are unset and that
25442616
* it is a valid signal
@@ -2556,8 +2628,16 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
25562628
.stack = args.stack,
25572629
.stack_size = args.stack_size,
25582630
.tls = args.tls,
2631+
.set_tid_size = args.set_tid_size,
25592632
};
25602633

2634+
if (args.set_tid &&
2635+
copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid),
2636+
(kargs->set_tid_size * sizeof(pid_t))))
2637+
return -EFAULT;
2638+
2639+
kargs->set_tid = kset_tid;
2640+
25612641
return 0;
25622642
}
25632643

@@ -2591,11 +2671,8 @@ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
25912671

25922672
static bool clone3_args_valid(struct kernel_clone_args *kargs)
25932673
{
2594-
/*
2595-
* All lower bits of the flag word are taken.
2596-
* Verify that no other unknown flags are passed along.
2597-
*/
2598-
if (kargs->flags & ~CLONE_LEGACY_FLAGS)
2674+
/* Verify that no unknown flags are passed along. */
2675+
if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND))
25992676
return false;
26002677

26012678
/*
@@ -2605,6 +2682,10 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
26052682
if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
26062683
return false;
26072684

2685+
if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) ==
2686+
(CLONE_SIGHAND | CLONE_CLEAR_SIGHAND))
2687+
return false;
2688+
26082689
if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
26092690
kargs->exit_signal)
26102691
return false;
@@ -2631,6 +2712,9 @@ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
26312712
int err;
26322713

26332714
struct kernel_clone_args kargs;
2715+
pid_t set_tid[MAX_PID_NS_LEVEL];
2716+
2717+
kargs.set_tid = set_tid;
26342718

26352719
err = copy_clone_args_from_user(&kargs, uargs, size);
26362720
if (err)

0 commit comments

Comments
 (0)