fork: extend clone3() to support setting a PID

The main motivation to add set_tid to clone3() is CRIU.

To restore a process with the same PID/TID CRIU currently uses
/proc/sys/kernel/ns_last_pid. It writes the desired (PID - 1) to
ns_last_pid and then (quickly) does a clone(). This works most of the
time, but it is racy. It is also slow as it requires multiple syscalls.

Extending clone3() to support *set_tid makes it possible to restore a
process using CRIU without accessing /proc/sys/kernel/ns_last_pid and
without races (as long as the desired PID/TID is available).

This clone3() extension places the same restriction (CAP_SYS_ADMIN)
on clone3() with *set_tid as is currently in place for ns_last_pid.

The original version of this change used a single value for
set_tid. At the 2019 LPC, after set_tid was presented, it was, however,
decided to change set_tid to an array to enable setting the PID of a
process in multiple PID namespaces at the same time. If a process is
created in a PID namespace it is possible to influence the PID inside
and outside of the PID namespace. More details can be found in the
corresponding selftest.

To create a process with the following PIDs:

      PID NS level         Requested PID
        0 (host)              31496
        1                        42
        2                         1

For that example the two newly introduced parameters to struct
clone_args (set_tid and set_tid_size) would need to be:

  set_tid[0] = 1;
  set_tid[1] = 42;
  set_tid[2] = 31496;
  set_tid_size = 3;

If only the PIDs of the two innermost nested PID namespaces should be
defined it would look like this:

  set_tid[0] = 1;
  set_tid[1] = 42;
  set_tid_size = 2;

The PID of the newly created process would then be the next available
free PID in the PID namespace at level 0 (host), 42 in the PID namespace
at level 1, and 1 in the innermost PID namespace.

The set_tid array is used to specify the PID of a process starting
from the innermost nested PID namespace up to set_tid_size PID namespaces.

set_tid_size cannot be larger than the current PID namespace level + 1
(i.e. the number of currently nested PID namespaces).

Signed-off-by: Adrian Reber <areber@redhat.com>
Reviewed-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com>
Acked-by: Andrei Vagin <avagin@gmail.com>
Link: https://lore.kernel.org/r/20191115123621.142252-1-areber@redhat.com
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
This commit is contained in:
Adrian Reber 2019-11-15 13:36:20 +01:00 committed by Christian Brauner
parent 17a810699c
commit 49cb2fc42c
7 changed files with 121 additions and 36 deletions

View File

@ -124,7 +124,8 @@ extern struct pid *find_vpid(int nr);
extern struct pid *find_get_pid(int nr); extern struct pid *find_get_pid(int nr);
extern struct pid *find_ge_pid(int nr, struct pid_namespace *); extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
extern struct pid *alloc_pid(struct pid_namespace *ns); extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
size_t set_tid_size);
extern void free_pid(struct pid *pid); extern void free_pid(struct pid *pid);
extern void disable_pid_allocation(struct pid_namespace *ns); extern void disable_pid_allocation(struct pid_namespace *ns);

View File

@ -12,6 +12,8 @@
#include <linux/ns_common.h> #include <linux/ns_common.h>
#include <linux/idr.h> #include <linux/idr.h>
/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
#define MAX_PID_NS_LEVEL 32
struct fs_pin; struct fs_pin;

View File

@ -26,6 +26,9 @@ struct kernel_clone_args {
unsigned long stack; unsigned long stack;
unsigned long stack_size; unsigned long stack_size;
unsigned long tls; unsigned long tls;
pid_t *set_tid;
/* Number of elements in *set_tid */
size_t set_tid_size;
}; };
/* /*

View File

@ -39,24 +39,38 @@
#ifndef __ASSEMBLY__ #ifndef __ASSEMBLY__
/** /**
* struct clone_args - arguments for the clone3 syscall * struct clone_args - arguments for the clone3 syscall
* @flags: Flags for the new process as listed above. * @flags: Flags for the new process as listed above.
* All flags are valid except for CSIGNAL and * All flags are valid except for CSIGNAL and
* CLONE_DETACHED. * CLONE_DETACHED.
* @pidfd: If CLONE_PIDFD is set, a pidfd will be * @pidfd: If CLONE_PIDFD is set, a pidfd will be
* returned in this argument. * returned in this argument.
* @child_tid: If CLONE_CHILD_SETTID is set, the TID of the * @child_tid: If CLONE_CHILD_SETTID is set, the TID of the
* child process will be returned in the child's * child process will be returned in the child's
* memory. * memory.
* @parent_tid: If CLONE_PARENT_SETTID is set, the TID of * @parent_tid: If CLONE_PARENT_SETTID is set, the TID of
* the child process will be returned in the * the child process will be returned in the
* parent's memory. * parent's memory.
* @exit_signal: The exit_signal the parent process will be * @exit_signal: The exit_signal the parent process will be
* sent when the child exits. * sent when the child exits.
* @stack: Specify the location of the stack for the * @stack: Specify the location of the stack for the
* child process. * child process.
* @stack_size: The size of the stack for the child process. * @stack_size: The size of the stack for the child process.
* @tls: If CLONE_SETTLS is set, the tls descriptor * @tls: If CLONE_SETTLS is set, the tls descriptor
* is set to tls. * is set to tls.
* @set_tid: Pointer to an array of type *pid_t. The size
* of the array is defined using @set_tid_size.
* This array is used to select PIDs/TIDs for
* newly created processes. The first element in
* this defines the PID in the most nested PID
* namespace. Each additional element in the array
* defines the PID in the parent PID namespace of
* the original PID namespace. If the array has
* less entries than the number of currently
* nested PID namespaces only the PIDs in the
* corresponding namespaces are set.
* @set_tid_size: This defines the size of the array referenced
* in @set_tid. This cannot be larger than the
* kernel's limit of nested PID namespaces.
* *
* The structure is versioned by size and thus extensible. * The structure is versioned by size and thus extensible.
* New struct members must go at the end of the struct and * New struct members must go at the end of the struct and
@ -71,10 +85,13 @@ struct clone_args {
__aligned_u64 stack; __aligned_u64 stack;
__aligned_u64 stack_size; __aligned_u64 stack_size;
__aligned_u64 tls; __aligned_u64 tls;
__aligned_u64 set_tid;
__aligned_u64 set_tid_size;
}; };
#endif #endif
#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */ #define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
/* /*
* Scheduling policies * Scheduling policies

View File

@ -2087,7 +2087,8 @@ static __latent_entropy struct task_struct *copy_process(
stackleak_task_init(p); stackleak_task_init(p);
if (pid != &init_struct_pid) { if (pid != &init_struct_pid) {
pid = alloc_pid(p->nsproxy->pid_ns_for_children); pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
args->set_tid_size);
if (IS_ERR(pid)) { if (IS_ERR(pid)) {
retval = PTR_ERR(pid); retval = PTR_ERR(pid);
goto bad_fork_cleanup_thread; goto bad_fork_cleanup_thread;
@ -2590,6 +2591,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
{ {
int err; int err;
struct clone_args args; struct clone_args args;
pid_t *kset_tid = kargs->set_tid;
if (unlikely(usize > PAGE_SIZE)) if (unlikely(usize > PAGE_SIZE))
return -E2BIG; return -E2BIG;
@ -2600,6 +2602,15 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
if (err) if (err)
return err; return err;
if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL))
return -EINVAL;
if (unlikely(!args.set_tid && args.set_tid_size > 0))
return -EINVAL;
if (unlikely(args.set_tid && args.set_tid_size == 0))
return -EINVAL;
/* /*
* Verify that higher 32bits of exit_signal are unset and that * Verify that higher 32bits of exit_signal are unset and that
* it is a valid signal * it is a valid signal
@ -2617,8 +2628,16 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
.stack = args.stack, .stack = args.stack,
.stack_size = args.stack_size, .stack_size = args.stack_size,
.tls = args.tls, .tls = args.tls,
.set_tid_size = args.set_tid_size,
}; };
if (args.set_tid &&
copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid),
(kargs->set_tid_size * sizeof(pid_t))))
return -EFAULT;
kargs->set_tid = kset_tid;
return 0; return 0;
} }
@ -2662,6 +2681,9 @@ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
int err; int err;
struct kernel_clone_args kargs; struct kernel_clone_args kargs;
pid_t set_tid[MAX_PID_NS_LEVEL];
kargs.set_tid = set_tid;
err = copy_clone_args_from_user(&kargs, uargs, size); err = copy_clone_args_from_user(&kargs, uargs, size);
if (err) if (err)

View File

@ -157,7 +157,8 @@ void free_pid(struct pid *pid)
call_rcu(&pid->rcu, delayed_put_pid); call_rcu(&pid->rcu, delayed_put_pid);
} }
struct pid *alloc_pid(struct pid_namespace *ns) struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
size_t set_tid_size)
{ {
struct pid *pid; struct pid *pid;
enum pid_type type; enum pid_type type;
@ -166,6 +167,17 @@ struct pid *alloc_pid(struct pid_namespace *ns)
struct upid *upid; struct upid *upid;
int retval = -ENOMEM; int retval = -ENOMEM;
/*
* set_tid_size contains the size of the set_tid array. Starting at
* the most nested currently active PID namespace it tells alloc_pid()
* which PID to set for a process in that most nested PID namespace
* up to set_tid_size PID namespaces. It does not have to set the PID
* for a process in all nested PID namespaces but set_tid_size must
* never be greater than the current ns->level + 1.
*/
if (set_tid_size > ns->level + 1)
return ERR_PTR(-EINVAL);
pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
if (!pid) if (!pid)
return ERR_PTR(retval); return ERR_PTR(retval);
@ -174,24 +186,54 @@ struct pid *alloc_pid(struct pid_namespace *ns)
pid->level = ns->level; pid->level = ns->level;
for (i = ns->level; i >= 0; i--) { for (i = ns->level; i >= 0; i--) {
int pid_min = 1; int tid = 0;
if (set_tid_size) {
tid = set_tid[ns->level - i];
retval = -EINVAL;
if (tid < 1 || tid >= pid_max)
goto out_free;
/*
* Also fail if a PID != 1 is requested and
* no PID 1 exists.
*/
if (tid != 1 && !tmp->child_reaper)
goto out_free;
retval = -EPERM;
if (!ns_capable(tmp->user_ns, CAP_SYS_ADMIN))
goto out_free;
set_tid_size--;
}
idr_preload(GFP_KERNEL); idr_preload(GFP_KERNEL);
spin_lock_irq(&pidmap_lock); spin_lock_irq(&pidmap_lock);
/* if (tid) {
* init really needs pid 1, but after reaching the maximum nr = idr_alloc(&tmp->idr, NULL, tid,
* wrap back to RESERVED_PIDS tid + 1, GFP_ATOMIC);
*/ /*
if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS) * If ENOSPC is returned it means that the PID is
pid_min = RESERVED_PIDS; * already in use. Return EEXIST in that case.
*/
if (nr == -ENOSPC)
nr = -EEXIST;
} else {
int pid_min = 1;
/*
* init really needs pid 1, but after reaching the
* maximum wrap back to RESERVED_PIDS
*/
if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
pid_min = RESERVED_PIDS;
/* /*
* Store a null pointer so find_pid_ns does not find * Store a null pointer so find_pid_ns does not find
* a partially initialized PID (see below). * a partially initialized PID (see below).
*/ */
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
pid_max, GFP_ATOMIC); pid_max, GFP_ATOMIC);
}
spin_unlock_irq(&pidmap_lock); spin_unlock_irq(&pidmap_lock);
idr_preload_end(); idr_preload_end();

View File

@ -26,8 +26,6 @@
static DEFINE_MUTEX(pid_caches_mutex); static DEFINE_MUTEX(pid_caches_mutex);
static struct kmem_cache *pid_ns_cachep; static struct kmem_cache *pid_ns_cachep;
/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
#define MAX_PID_NS_LEVEL 32
/* Write once array, filled from the beginning. */ /* Write once array, filled from the beginning. */
static struct kmem_cache *pid_cache[MAX_PID_NS_LEVEL]; static struct kmem_cache *pid_cache[MAX_PID_NS_LEVEL];