forked from luck/tmp_suning_uos_patched
Merge branch 'for-5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:

 - Christian extended clone3 so that processes can be spawned into cgroups directly. This is not only neat in terms of semantics but also avoids grabbing the global cgroup_threadgroup_rwsem for migration.

 - Daniel added !root xattr support to cgroupfs. Userland already uses xattrs on cgroupfs for bookkeeping. This will allow delegated cgroups to support such usages.

 - Prateek tried to make cpuset hotplug handling synchronous but that led to possible deadlock scenarios. Reverted.

 - Other minor changes including release_agent_path handling cleanup.

* 'for-5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  docs: cgroup-v1: Document the cpuset_v2_mode mount option
  Revert "cpuset: Make cpuset hotplug synchronous"
  cgroupfs: Support user xattrs
  kernfs: Add option to enable user xattrs
  kernfs: Add removed_size out param for simple_xattr_set
  kernfs: kvmalloc xattr value instead of kmalloc
  cgroup: Restructure release_agent_path handling
  selftests/cgroup: add tests for cloning into cgroups
  clone3: allow spawning processes into cgroups
  cgroup: add cgroup_may_write() helper
  cgroup: refactor fork helpers
  cgroup: add cgroup_get_from_file() helper
  cgroup: unify attach permission checking
  cpuset: Make cpuset hotplug synchronous
  cgroup.c: Use built-in RCU list checking
  kselftest/cgroup: add cgroup destruction test
  cgroup: Clean up css_set task traversal
This commit is contained in:
commit
d883600523
|
@ -223,6 +223,17 @@ cpu_online_mask using a CPU hotplug notifier, and the mems file
|
||||||
automatically tracks the value of node_states[N_MEMORY]--i.e.,
|
automatically tracks the value of node_states[N_MEMORY]--i.e.,
|
||||||
nodes with memory--using the cpuset_track_online_nodes() hook.
|
nodes with memory--using the cpuset_track_online_nodes() hook.
|
||||||
|
|
||||||
|
The cpuset.effective_cpus and cpuset.effective_mems files are
|
||||||
|
normally read-only copies of cpuset.cpus and cpuset.mems files
|
||||||
|
respectively. If the cpuset cgroup filesystem is mounted with the
|
||||||
|
special "cpuset_v2_mode" option, the behavior of these files will become
|
||||||
|
similar to the corresponding files in cpuset v2. In other words, hotplug
|
||||||
|
events will not change cpuset.cpus and cpuset.mems. Those events will
|
||||||
|
only affect cpuset.effective_cpus and cpuset.effective_mems which show
|
||||||
|
the actual cpus and memory nodes that are currently used by this cpuset.
|
||||||
|
See Documentation/admin-guide/cgroup-v2.rst for more information about
|
||||||
|
cpuset v2 behavior.
|
||||||
|
|
||||||
|
|
||||||
1.4 What are exclusive cpusets ?
|
1.4 What are exclusive cpusets ?
|
||||||
--------------------------------
|
--------------------------------
|
||||||
|
|
|
@ -53,6 +53,8 @@ static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, int alloc)
|
||||||
kn->iattr->ia_ctime = kn->iattr->ia_atime;
|
kn->iattr->ia_ctime = kn->iattr->ia_atime;
|
||||||
|
|
||||||
simple_xattrs_init(&kn->iattr->xattrs);
|
simple_xattrs_init(&kn->iattr->xattrs);
|
||||||
|
atomic_set(&kn->iattr->nr_user_xattrs, 0);
|
||||||
|
atomic_set(&kn->iattr->user_xattr_size, 0);
|
||||||
out_unlock:
|
out_unlock:
|
||||||
ret = kn->iattr;
|
ret = kn->iattr;
|
||||||
mutex_unlock(&iattr_mutex);
|
mutex_unlock(&iattr_mutex);
|
||||||
|
@ -303,7 +305,7 @@ int kernfs_xattr_set(struct kernfs_node *kn, const char *name,
|
||||||
if (!attrs)
|
if (!attrs)
|
||||||
return -ENOMEM;
|
return -ENOMEM;
|
||||||
|
|
||||||
return simple_xattr_set(&attrs->xattrs, name, value, size, flags);
|
return simple_xattr_set(&attrs->xattrs, name, value, size, flags, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int kernfs_vfs_xattr_get(const struct xattr_handler *handler,
|
static int kernfs_vfs_xattr_get(const struct xattr_handler *handler,
|
||||||
|
@ -327,6 +329,86 @@ static int kernfs_vfs_xattr_set(const struct xattr_handler *handler,
|
||||||
return kernfs_xattr_set(kn, name, value, size, flags);
|
return kernfs_xattr_set(kn, name, value, size, flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Add or replace a user xattr on @kn while enforcing the per-node limits
 * KERNFS_MAX_USER_XATTRS (number of entries) and
 * KERNFS_USER_XATTR_SIZE_LIMIT (sum of value sizes).
 *
 * The count and size counters are charged optimistically before the xattr
 * is stored, and the charge is undone if a limit is exceeded or if
 * simple_xattr_set() fails.  If an existing xattr was replaced, the charge
 * of the old entry (one slot plus its size) is released instead.
 *
 * Returns 0 on success, -ENOSPC when a limit would be exceeded, or the
 * error returned by simple_xattr_set().
 */
static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn,
				     const char *full_name,
				     struct simple_xattrs *xattrs,
				     const void *value, size_t size, int flags)
{
	atomic_t *total_bytes = &kn->iattr->user_xattr_size;
	atomic_t *xattr_count = &kn->iattr->nr_user_xattrs;
	ssize_t old_size;
	int err;

	/* Charge one slot; back out immediately if that exceeds the cap. */
	if (atomic_inc_return(xattr_count) > KERNFS_MAX_USER_XATTRS) {
		atomic_dec(xattr_count);
		return -ENOSPC;
	}

	/* Charge the value size; back out both charges on overflow. */
	if (atomic_add_return(size, total_bytes) > KERNFS_USER_XATTR_SIZE_LIMIT) {
		atomic_sub(size, total_bytes);
		atomic_dec(xattr_count);
		return -ENOSPC;
	}

	err = simple_xattr_set(xattrs, full_name, value, size, flags,
			       &old_size);
	if (err) {
		/* Nothing was stored: return the whole charge. */
		atomic_sub(size, total_bytes);
		atomic_dec(xattr_count);
		return err;
	}

	/* A brand-new xattr was inserted; the charge taken above stands. */
	if (old_size < 0)
		return 0;

	/* An existing xattr was replaced: release what the old one held. */
	atomic_sub(old_size, total_bytes);
	atomic_dec(xattr_count);
	return 0;
}
|
||||||
|
|
||||||
|
static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn,
|
||||||
|
const char *full_name,
|
||||||
|
struct simple_xattrs *xattrs,
|
||||||
|
const void *value, size_t size, int flags)
|
||||||
|
{
|
||||||
|
atomic_t *sz = &kn->iattr->user_xattr_size;
|
||||||
|
atomic_t *nr = &kn->iattr->nr_user_xattrs;
|
||||||
|
ssize_t removed_size;
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
ret = simple_xattr_set(xattrs, full_name, value, size, flags,
|
||||||
|
&removed_size);
|
||||||
|
|
||||||
|
if (removed_size >= 0) {
|
||||||
|
atomic_sub(removed_size, sz);
|
||||||
|
atomic_dec(nr);
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler,
|
||||||
|
struct dentry *unused, struct inode *inode,
|
||||||
|
const char *suffix, const void *value,
|
||||||
|
size_t size, int flags)
|
||||||
|
{
|
||||||
|
const char *full_name = xattr_full_name(handler, suffix);
|
||||||
|
struct kernfs_node *kn = inode->i_private;
|
||||||
|
struct kernfs_iattrs *attrs;
|
||||||
|
|
||||||
|
if (!(kernfs_root(kn)->flags & KERNFS_ROOT_SUPPORT_USER_XATTR))
|
||||||
|
return -EOPNOTSUPP;
|
||||||
|
|
||||||
|
attrs = kernfs_iattrs(kn);
|
||||||
|
if (!attrs)
|
||||||
|
return -ENOMEM;
|
||||||
|
|
||||||
|
if (value)
|
||||||
|
return kernfs_vfs_user_xattr_add(kn, full_name, &attrs->xattrs,
|
||||||
|
value, size, flags);
|
||||||
|
else
|
||||||
|
return kernfs_vfs_user_xattr_rm(kn, full_name, &attrs->xattrs,
|
||||||
|
value, size, flags);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
static const struct xattr_handler kernfs_trusted_xattr_handler = {
|
static const struct xattr_handler kernfs_trusted_xattr_handler = {
|
||||||
.prefix = XATTR_TRUSTED_PREFIX,
|
.prefix = XATTR_TRUSTED_PREFIX,
|
||||||
.get = kernfs_vfs_xattr_get,
|
.get = kernfs_vfs_xattr_get,
|
||||||
|
@ -339,8 +421,15 @@ static const struct xattr_handler kernfs_security_xattr_handler = {
|
||||||
.set = kernfs_vfs_xattr_set,
|
.set = kernfs_vfs_xattr_set,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static const struct xattr_handler kernfs_user_xattr_handler = {
|
||||||
|
.prefix = XATTR_USER_PREFIX,
|
||||||
|
.get = kernfs_vfs_xattr_get,
|
||||||
|
.set = kernfs_vfs_user_xattr_set,
|
||||||
|
};
|
||||||
|
|
||||||
const struct xattr_handler *kernfs_xattr_handlers[] = {
|
const struct xattr_handler *kernfs_xattr_handlers[] = {
|
||||||
&kernfs_trusted_xattr_handler,
|
&kernfs_trusted_xattr_handler,
|
||||||
&kernfs_security_xattr_handler,
|
&kernfs_security_xattr_handler,
|
||||||
|
&kernfs_user_xattr_handler,
|
||||||
NULL
|
NULL
|
||||||
};
|
};
|
||||||
|
|
|
@ -26,6 +26,8 @@ struct kernfs_iattrs {
|
||||||
struct timespec64 ia_ctime;
|
struct timespec64 ia_ctime;
|
||||||
|
|
||||||
struct simple_xattrs xattrs;
|
struct simple_xattrs xattrs;
|
||||||
|
atomic_t nr_user_xattrs;
|
||||||
|
atomic_t user_xattr_size;
|
||||||
};
|
};
|
||||||
|
|
||||||
/* +1 to avoid triggering overflow warning when negating it */
|
/* +1 to avoid triggering overflow warning when negating it */
|
||||||
|
|
17
fs/xattr.c
17
fs/xattr.c
|
@ -817,7 +817,7 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size)
|
||||||
if (len < sizeof(*new_xattr))
|
if (len < sizeof(*new_xattr))
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
new_xattr = kmalloc(len, GFP_KERNEL);
|
new_xattr = kvmalloc(len, GFP_KERNEL);
|
||||||
if (!new_xattr)
|
if (!new_xattr)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
|
@ -860,6 +860,7 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
|
||||||
* @value: value of the xattr. If %NULL, will remove the attribute.
|
* @value: value of the xattr. If %NULL, will remove the attribute.
|
||||||
* @size: size of the new xattr
|
* @size: size of the new xattr
|
||||||
* @flags: %XATTR_{CREATE|REPLACE}
|
* @flags: %XATTR_{CREATE|REPLACE}
|
||||||
|
* @removed_size: returns size of the removed xattr, -1 if none removed
|
||||||
*
|
*
|
||||||
* %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails
|
* %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails
|
||||||
* with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist;
|
* with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist;
|
||||||
|
@ -868,7 +869,8 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
|
||||||
* Returns 0 on success, -errno on failure.
|
* Returns 0 on success, -errno on failure.
|
||||||
*/
|
*/
|
||||||
int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
|
int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
|
||||||
const void *value, size_t size, int flags)
|
const void *value, size_t size, int flags,
|
||||||
|
ssize_t *removed_size)
|
||||||
{
|
{
|
||||||
struct simple_xattr *xattr;
|
struct simple_xattr *xattr;
|
||||||
struct simple_xattr *new_xattr = NULL;
|
struct simple_xattr *new_xattr = NULL;
|
||||||
|
@ -882,7 +884,7 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
|
||||||
|
|
||||||
new_xattr->name = kstrdup(name, GFP_KERNEL);
|
new_xattr->name = kstrdup(name, GFP_KERNEL);
|
||||||
if (!new_xattr->name) {
|
if (!new_xattr->name) {
|
||||||
kfree(new_xattr);
|
kvfree(new_xattr);
|
||||||
return -ENOMEM;
|
return -ENOMEM;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -895,8 +897,12 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
|
||||||
err = -EEXIST;
|
err = -EEXIST;
|
||||||
} else if (new_xattr) {
|
} else if (new_xattr) {
|
||||||
list_replace(&xattr->list, &new_xattr->list);
|
list_replace(&xattr->list, &new_xattr->list);
|
||||||
|
if (removed_size)
|
||||||
|
*removed_size = xattr->size;
|
||||||
} else {
|
} else {
|
||||||
list_del(&xattr->list);
|
list_del(&xattr->list);
|
||||||
|
if (removed_size)
|
||||||
|
*removed_size = xattr->size;
|
||||||
}
|
}
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
@ -908,11 +914,14 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
|
||||||
list_add(&new_xattr->list, &xattrs->head);
|
list_add(&new_xattr->list, &xattrs->head);
|
||||||
xattr = NULL;
|
xattr = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (removed_size)
|
||||||
|
*removed_size = -1;
|
||||||
out:
|
out:
|
||||||
spin_unlock(&xattrs->lock);
|
spin_unlock(&xattrs->lock);
|
||||||
if (xattr) {
|
if (xattr) {
|
||||||
kfree(xattr->name);
|
kfree(xattr->name);
|
||||||
kfree(xattr);
|
kvfree(xattr);
|
||||||
}
|
}
|
||||||
return err;
|
return err;
|
||||||
|
|
||||||
|
|
|
@ -633,8 +633,9 @@ struct cgroup_subsys {
|
||||||
void (*cancel_attach)(struct cgroup_taskset *tset);
|
void (*cancel_attach)(struct cgroup_taskset *tset);
|
||||||
void (*attach)(struct cgroup_taskset *tset);
|
void (*attach)(struct cgroup_taskset *tset);
|
||||||
void (*post_attach)(void);
|
void (*post_attach)(void);
|
||||||
int (*can_fork)(struct task_struct *task);
|
int (*can_fork)(struct task_struct *task,
|
||||||
void (*cancel_fork)(struct task_struct *task);
|
struct css_set *cset);
|
||||||
|
void (*cancel_fork)(struct task_struct *task, struct css_set *cset);
|
||||||
void (*fork)(struct task_struct *task);
|
void (*fork)(struct task_struct *task);
|
||||||
void (*exit)(struct task_struct *task);
|
void (*exit)(struct task_struct *task);
|
||||||
void (*release)(struct task_struct *task);
|
void (*release)(struct task_struct *task);
|
||||||
|
|
|
@ -27,6 +27,8 @@
|
||||||
|
|
||||||
#include <linux/cgroup-defs.h>
|
#include <linux/cgroup-defs.h>
|
||||||
|
|
||||||
|
struct kernel_clone_args;
|
||||||
|
|
||||||
#ifdef CONFIG_CGROUPS
|
#ifdef CONFIG_CGROUPS
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -58,9 +60,6 @@ struct css_task_iter {
|
||||||
struct list_head *tcset_head;
|
struct list_head *tcset_head;
|
||||||
|
|
||||||
struct list_head *task_pos;
|
struct list_head *task_pos;
|
||||||
struct list_head *tasks_head;
|
|
||||||
struct list_head *mg_tasks_head;
|
|
||||||
struct list_head *dying_tasks_head;
|
|
||||||
|
|
||||||
struct list_head *cur_tasks_head;
|
struct list_head *cur_tasks_head;
|
||||||
struct css_set *cur_cset;
|
struct css_set *cur_cset;
|
||||||
|
@ -122,9 +121,12 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
|
||||||
struct pid *pid, struct task_struct *tsk);
|
struct pid *pid, struct task_struct *tsk);
|
||||||
|
|
||||||
void cgroup_fork(struct task_struct *p);
|
void cgroup_fork(struct task_struct *p);
|
||||||
extern int cgroup_can_fork(struct task_struct *p);
|
extern int cgroup_can_fork(struct task_struct *p,
|
||||||
extern void cgroup_cancel_fork(struct task_struct *p);
|
struct kernel_clone_args *kargs);
|
||||||
extern void cgroup_post_fork(struct task_struct *p);
|
extern void cgroup_cancel_fork(struct task_struct *p,
|
||||||
|
struct kernel_clone_args *kargs);
|
||||||
|
extern void cgroup_post_fork(struct task_struct *p,
|
||||||
|
struct kernel_clone_args *kargs);
|
||||||
void cgroup_exit(struct task_struct *p);
|
void cgroup_exit(struct task_struct *p);
|
||||||
void cgroup_release(struct task_struct *p);
|
void cgroup_release(struct task_struct *p);
|
||||||
void cgroup_free(struct task_struct *p);
|
void cgroup_free(struct task_struct *p);
|
||||||
|
@ -708,9 +710,12 @@ static inline int cgroupstats_build(struct cgroupstats *stats,
|
||||||
struct dentry *dentry) { return -EINVAL; }
|
struct dentry *dentry) { return -EINVAL; }
|
||||||
|
|
||||||
static inline void cgroup_fork(struct task_struct *p) {}
|
static inline void cgroup_fork(struct task_struct *p) {}
|
||||||
static inline int cgroup_can_fork(struct task_struct *p) { return 0; }
|
static inline int cgroup_can_fork(struct task_struct *p,
|
||||||
static inline void cgroup_cancel_fork(struct task_struct *p) {}
|
struct kernel_clone_args *kargs) { return 0; }
|
||||||
static inline void cgroup_post_fork(struct task_struct *p) {}
|
static inline void cgroup_cancel_fork(struct task_struct *p,
|
||||||
|
struct kernel_clone_args *kargs) {}
|
||||||
|
static inline void cgroup_post_fork(struct task_struct *p,
|
||||||
|
struct kernel_clone_args *kargs) {}
|
||||||
static inline void cgroup_exit(struct task_struct *p) {}
|
static inline void cgroup_exit(struct task_struct *p) {}
|
||||||
static inline void cgroup_release(struct task_struct *p) {}
|
static inline void cgroup_release(struct task_struct *p) {}
|
||||||
static inline void cgroup_free(struct task_struct *p) {}
|
static inline void cgroup_free(struct task_struct *p) {}
|
||||||
|
|
|
@ -37,8 +37,10 @@ enum kernfs_node_type {
|
||||||
KERNFS_LINK = 0x0004,
|
KERNFS_LINK = 0x0004,
|
||||||
};
|
};
|
||||||
|
|
||||||
#define KERNFS_TYPE_MASK 0x000f
|
#define KERNFS_TYPE_MASK 0x000f
|
||||||
#define KERNFS_FLAG_MASK ~KERNFS_TYPE_MASK
|
#define KERNFS_FLAG_MASK ~KERNFS_TYPE_MASK
|
||||||
|
#define KERNFS_MAX_USER_XATTRS 128
|
||||||
|
#define KERNFS_USER_XATTR_SIZE_LIMIT (128 << 10)
|
||||||
|
|
||||||
enum kernfs_node_flag {
|
enum kernfs_node_flag {
|
||||||
KERNFS_ACTIVATED = 0x0010,
|
KERNFS_ACTIVATED = 0x0010,
|
||||||
|
@ -78,6 +80,11 @@ enum kernfs_root_flag {
|
||||||
* fhandle to access nodes of the fs.
|
* fhandle to access nodes of the fs.
|
||||||
*/
|
*/
|
||||||
KERNFS_ROOT_SUPPORT_EXPORTOP = 0x0004,
|
KERNFS_ROOT_SUPPORT_EXPORTOP = 0x0004,
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Support user xattrs to be written to nodes rooted at this root.
|
||||||
|
*/
|
||||||
|
KERNFS_ROOT_SUPPORT_USER_XATTR = 0x0008,
|
||||||
};
|
};
|
||||||
|
|
||||||
/* type-specific structures for kernfs_node union members */
|
/* type-specific structures for kernfs_node union members */
|
||||||
|
|
|
@ -13,6 +13,7 @@
|
||||||
struct task_struct;
|
struct task_struct;
|
||||||
struct rusage;
|
struct rusage;
|
||||||
union thread_union;
|
union thread_union;
|
||||||
|
struct css_set;
|
||||||
|
|
||||||
/* All the bits taken by the old clone syscall. */
|
/* All the bits taken by the old clone syscall. */
|
||||||
#define CLONE_LEGACY_FLAGS 0xffffffffULL
|
#define CLONE_LEGACY_FLAGS 0xffffffffULL
|
||||||
|
@ -29,6 +30,9 @@ struct kernel_clone_args {
|
||||||
pid_t *set_tid;
|
pid_t *set_tid;
|
||||||
/* Number of elements in *set_tid */
|
/* Number of elements in *set_tid */
|
||||||
size_t set_tid_size;
|
size_t set_tid_size;
|
||||||
|
int cgroup;
|
||||||
|
struct cgroup *cgrp;
|
||||||
|
struct css_set *cset;
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -102,7 +102,8 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size);
|
||||||
int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
|
int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
|
||||||
void *buffer, size_t size);
|
void *buffer, size_t size);
|
||||||
int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
|
int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
|
||||||
const void *value, size_t size, int flags);
|
const void *value, size_t size, int flags,
|
||||||
|
ssize_t *removed_size);
|
||||||
ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer,
|
ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer,
|
||||||
size_t size);
|
size_t size);
|
||||||
void simple_xattr_list_add(struct simple_xattrs *xattrs,
|
void simple_xattr_list_add(struct simple_xattrs *xattrs,
|
||||||
|
|
|
@ -35,6 +35,7 @@
|
||||||
|
|
||||||
/* Flags for the clone3() syscall. */
|
/* Flags for the clone3() syscall. */
|
||||||
#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
|
#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
|
||||||
|
#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* cloning flags intersect with CSIGNAL so can be used with unshare and clone3
|
* cloning flags intersect with CSIGNAL so can be used with unshare and clone3
|
||||||
|
@ -81,6 +82,8 @@
|
||||||
* @set_tid_size: This defines the size of the array referenced
|
* @set_tid_size: This defines the size of the array referenced
|
||||||
* in @set_tid. This cannot be larger than the
|
* in @set_tid. This cannot be larger than the
|
||||||
* kernel's limit of nested PID namespaces.
|
* kernel's limit of nested PID namespaces.
|
||||||
|
* @cgroup: If CLONE_INTO_CGROUP is specified set this to
|
||||||
|
* a file descriptor for the cgroup.
|
||||||
*
|
*
|
||||||
* The structure is versioned by size and thus extensible.
|
* The structure is versioned by size and thus extensible.
|
||||||
* New struct members must go at the end of the struct and
|
* New struct members must go at the end of the struct and
|
||||||
|
@ -97,11 +100,13 @@ struct clone_args {
|
||||||
__aligned_u64 tls;
|
__aligned_u64 tls;
|
||||||
__aligned_u64 set_tid;
|
__aligned_u64 set_tid;
|
||||||
__aligned_u64 set_tid_size;
|
__aligned_u64 set_tid_size;
|
||||||
|
__aligned_u64 cgroup;
|
||||||
};
|
};
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
|
#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
|
||||||
#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
|
#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
|
||||||
|
#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Scheduling policies
|
* Scheduling policies
|
||||||
|
|
|
@ -38,10 +38,7 @@ static bool cgroup_no_v1_named;
|
||||||
*/
|
*/
|
||||||
static struct workqueue_struct *cgroup_pidlist_destroy_wq;
|
static struct workqueue_struct *cgroup_pidlist_destroy_wq;
|
||||||
|
|
||||||
/*
|
/* protects cgroup_subsys->release_agent_path */
|
||||||
* Protects cgroup_subsys->release_agent_path. Modifying it also requires
|
|
||||||
* cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
|
|
||||||
*/
|
|
||||||
static DEFINE_SPINLOCK(release_agent_path_lock);
|
static DEFINE_SPINLOCK(release_agent_path_lock);
|
||||||
|
|
||||||
bool cgroup1_ssid_disabled(int ssid)
|
bool cgroup1_ssid_disabled(int ssid)
|
||||||
|
@ -775,22 +772,29 @@ void cgroup1_release_agent(struct work_struct *work)
|
||||||
{
|
{
|
||||||
struct cgroup *cgrp =
|
struct cgroup *cgrp =
|
||||||
container_of(work, struct cgroup, release_agent_work);
|
container_of(work, struct cgroup, release_agent_work);
|
||||||
char *pathbuf = NULL, *agentbuf = NULL;
|
char *pathbuf, *agentbuf;
|
||||||
char *argv[3], *envp[3];
|
char *argv[3], *envp[3];
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
mutex_lock(&cgroup_mutex);
|
/* snoop agent path and exit early if empty */
|
||||||
|
if (!cgrp->root->release_agent_path[0])
|
||||||
|
return;
|
||||||
|
|
||||||
|
/* prepare argument buffers */
|
||||||
pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
|
pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
|
||||||
agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
|
agentbuf = kmalloc(PATH_MAX, GFP_KERNEL);
|
||||||
if (!pathbuf || !agentbuf || !strlen(agentbuf))
|
if (!pathbuf || !agentbuf)
|
||||||
goto out;
|
goto out_free;
|
||||||
|
|
||||||
spin_lock_irq(&css_set_lock);
|
spin_lock(&release_agent_path_lock);
|
||||||
ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
|
strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);
|
||||||
spin_unlock_irq(&css_set_lock);
|
spin_unlock(&release_agent_path_lock);
|
||||||
|
if (!agentbuf[0])
|
||||||
|
goto out_free;
|
||||||
|
|
||||||
|
ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
|
||||||
if (ret < 0 || ret >= PATH_MAX)
|
if (ret < 0 || ret >= PATH_MAX)
|
||||||
goto out;
|
goto out_free;
|
||||||
|
|
||||||
argv[0] = agentbuf;
|
argv[0] = agentbuf;
|
||||||
argv[1] = pathbuf;
|
argv[1] = pathbuf;
|
||||||
|
@ -801,11 +805,7 @@ void cgroup1_release_agent(struct work_struct *work)
|
||||||
envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
|
envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
|
||||||
envp[2] = NULL;
|
envp[2] = NULL;
|
||||||
|
|
||||||
mutex_unlock(&cgroup_mutex);
|
|
||||||
call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
|
call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
|
||||||
goto out_free;
|
|
||||||
out:
|
|
||||||
mutex_unlock(&cgroup_mutex);
|
|
||||||
out_free:
|
out_free:
|
||||||
kfree(agentbuf);
|
kfree(agentbuf);
|
||||||
kfree(pathbuf);
|
kfree(pathbuf);
|
||||||
|
|
|
@ -1966,7 +1966,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
|
||||||
|
|
||||||
root->kf_root = kernfs_create_root(kf_sops,
|
root->kf_root = kernfs_create_root(kf_sops,
|
||||||
KERNFS_ROOT_CREATE_DEACTIVATED |
|
KERNFS_ROOT_CREATE_DEACTIVATED |
|
||||||
KERNFS_ROOT_SUPPORT_EXPORTOP,
|
KERNFS_ROOT_SUPPORT_EXPORTOP |
|
||||||
|
KERNFS_ROOT_SUPPORT_USER_XATTR,
|
||||||
root_cgrp);
|
root_cgrp);
|
||||||
if (IS_ERR(root->kf_root)) {
|
if (IS_ERR(root->kf_root)) {
|
||||||
ret = PTR_ERR(root->kf_root);
|
ret = PTR_ERR(root->kf_root);
|
||||||
|
@ -2726,11 +2727,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
|
||||||
{
|
{
|
||||||
DEFINE_CGROUP_MGCTX(mgctx);
|
DEFINE_CGROUP_MGCTX(mgctx);
|
||||||
struct task_struct *task;
|
struct task_struct *task;
|
||||||
int ret;
|
int ret = 0;
|
||||||
|
|
||||||
ret = cgroup_migrate_vet_dst(dst_cgrp);
|
|
||||||
if (ret)
|
|
||||||
return ret;
|
|
||||||
|
|
||||||
/* look up all src csets */
|
/* look up all src csets */
|
||||||
spin_lock_irq(&css_set_lock);
|
spin_lock_irq(&css_set_lock);
|
||||||
|
@ -4160,7 +4157,8 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
|
||||||
} else if (likely(!(pos->flags & CSS_RELEASED))) {
|
} else if (likely(!(pos->flags & CSS_RELEASED))) {
|
||||||
next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
|
next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
|
||||||
} else {
|
} else {
|
||||||
list_for_each_entry_rcu(next, &parent->children, sibling)
|
list_for_each_entry_rcu(next, &parent->children, sibling,
|
||||||
|
lockdep_is_held(&cgroup_mutex))
|
||||||
if (next->serial_nr > pos->serial_nr)
|
if (next->serial_nr > pos->serial_nr)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -4403,29 +4401,24 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
|
||||||
|
|
||||||
lockdep_assert_held(&css_set_lock);
|
lockdep_assert_held(&css_set_lock);
|
||||||
|
|
||||||
/* Advance to the next non-empty css_set */
|
/* Advance to the next non-empty css_set and find first non-empty tasks list */
|
||||||
do {
|
while ((cset = css_task_iter_next_css_set(it))) {
|
||||||
cset = css_task_iter_next_css_set(it);
|
if (!list_empty(&cset->tasks)) {
|
||||||
if (!cset) {
|
it->cur_tasks_head = &cset->tasks;
|
||||||
it->task_pos = NULL;
|
break;
|
||||||
return;
|
} else if (!list_empty(&cset->mg_tasks)) {
|
||||||
|
it->cur_tasks_head = &cset->mg_tasks;
|
||||||
|
break;
|
||||||
|
} else if (!list_empty(&cset->dying_tasks)) {
|
||||||
|
it->cur_tasks_head = &cset->dying_tasks;
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
} while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
|
|
||||||
|
|
||||||
if (!list_empty(&cset->tasks)) {
|
|
||||||
it->task_pos = cset->tasks.next;
|
|
||||||
it->cur_tasks_head = &cset->tasks;
|
|
||||||
} else if (!list_empty(&cset->mg_tasks)) {
|
|
||||||
it->task_pos = cset->mg_tasks.next;
|
|
||||||
it->cur_tasks_head = &cset->mg_tasks;
|
|
||||||
} else {
|
|
||||||
it->task_pos = cset->dying_tasks.next;
|
|
||||||
it->cur_tasks_head = &cset->dying_tasks;
|
|
||||||
}
|
}
|
||||||
|
if (!cset) {
|
||||||
it->tasks_head = &cset->tasks;
|
it->task_pos = NULL;
|
||||||
it->mg_tasks_head = &cset->mg_tasks;
|
return;
|
||||||
it->dying_tasks_head = &cset->dying_tasks;
|
}
|
||||||
|
it->task_pos = it->cur_tasks_head->next;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We don't keep css_sets locked across iteration steps and thus
|
* We don't keep css_sets locked across iteration steps and thus
|
||||||
|
@ -4470,24 +4463,24 @@ static void css_task_iter_advance(struct css_task_iter *it)
|
||||||
repeat:
|
repeat:
|
||||||
if (it->task_pos) {
|
if (it->task_pos) {
|
||||||
/*
|
/*
|
||||||
* Advance iterator to find next entry. cset->tasks is
|
* Advance iterator to find next entry. We go through cset
|
||||||
* consumed first and then ->mg_tasks. After ->mg_tasks,
|
* tasks, mg_tasks and dying_tasks, when consumed we move onto
|
||||||
* we move onto the next cset.
|
* the next cset.
|
||||||
*/
|
*/
|
||||||
if (it->flags & CSS_TASK_ITER_SKIPPED)
|
if (it->flags & CSS_TASK_ITER_SKIPPED)
|
||||||
it->flags &= ~CSS_TASK_ITER_SKIPPED;
|
it->flags &= ~CSS_TASK_ITER_SKIPPED;
|
||||||
else
|
else
|
||||||
it->task_pos = it->task_pos->next;
|
it->task_pos = it->task_pos->next;
|
||||||
|
|
||||||
if (it->task_pos == it->tasks_head) {
|
if (it->task_pos == &it->cur_cset->tasks) {
|
||||||
it->task_pos = it->mg_tasks_head->next;
|
it->cur_tasks_head = &it->cur_cset->mg_tasks;
|
||||||
it->cur_tasks_head = it->mg_tasks_head;
|
it->task_pos = it->cur_tasks_head->next;
|
||||||
}
|
}
|
||||||
if (it->task_pos == it->mg_tasks_head) {
|
if (it->task_pos == &it->cur_cset->mg_tasks) {
|
||||||
it->task_pos = it->dying_tasks_head->next;
|
it->cur_tasks_head = &it->cur_cset->dying_tasks;
|
||||||
it->cur_tasks_head = it->dying_tasks_head;
|
it->task_pos = it->cur_tasks_head->next;
|
||||||
}
|
}
|
||||||
if (it->task_pos == it->dying_tasks_head)
|
if (it->task_pos == &it->cur_cset->dying_tasks)
|
||||||
css_task_iter_advance_css_set(it);
|
css_task_iter_advance_css_set(it);
|
||||||
} else {
|
} else {
|
||||||
/* called from start, proceed to the first cset */
|
/* called from start, proceed to the first cset */
|
||||||
|
@ -4505,12 +4498,12 @@ static void css_task_iter_advance(struct css_task_iter *it)
|
||||||
goto repeat;
|
goto repeat;
|
||||||
|
|
||||||
/* and dying leaders w/o live member threads */
|
/* and dying leaders w/o live member threads */
|
||||||
if (it->cur_tasks_head == it->dying_tasks_head &&
|
if (it->cur_tasks_head == &it->cur_cset->dying_tasks &&
|
||||||
!atomic_read(&task->signal->live))
|
!atomic_read(&task->signal->live))
|
||||||
goto repeat;
|
goto repeat;
|
||||||
} else {
|
} else {
|
||||||
/* skip all dying ones */
|
/* skip all dying ones */
|
||||||
if (it->cur_tasks_head == it->dying_tasks_head)
|
if (it->cur_tasks_head == &it->cur_cset->dying_tasks)
|
||||||
goto repeat;
|
goto repeat;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -4674,13 +4667,28 @@ static int cgroup_procs_show(struct seq_file *s, void *v)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
|
||||||
|
{
|
||||||
|
int ret;
|
||||||
|
struct inode *inode;
|
||||||
|
|
||||||
|
lockdep_assert_held(&cgroup_mutex);
|
||||||
|
|
||||||
|
inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
|
||||||
|
if (!inode)
|
||||||
|
return -ENOMEM;
|
||||||
|
|
||||||
|
ret = inode_permission(inode, MAY_WRITE);
|
||||||
|
iput(inode);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
|
static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
|
||||||
struct cgroup *dst_cgrp,
|
struct cgroup *dst_cgrp,
|
||||||
struct super_block *sb)
|
struct super_block *sb)
|
||||||
{
|
{
|
||||||
struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
|
struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
|
||||||
struct cgroup *com_cgrp = src_cgrp;
|
struct cgroup *com_cgrp = src_cgrp;
|
||||||
struct inode *inode;
|
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
lockdep_assert_held(&cgroup_mutex);
|
lockdep_assert_held(&cgroup_mutex);
|
||||||
|
@ -4690,12 +4698,7 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
|
||||||
com_cgrp = cgroup_parent(com_cgrp);
|
com_cgrp = cgroup_parent(com_cgrp);
|
||||||
|
|
||||||
/* %current should be authorized to migrate to the common ancestor */
|
/* %current should be authorized to migrate to the common ancestor */
|
||||||
inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
|
ret = cgroup_may_write(com_cgrp, sb);
|
||||||
if (!inode)
|
|
||||||
return -ENOMEM;
|
|
||||||
|
|
||||||
ret = inode_permission(inode, MAY_WRITE);
|
|
||||||
iput(inode);
|
|
||||||
if (ret)
|
if (ret)
|
||||||
return ret;
|
return ret;
|
||||||
|
|
||||||
|
@ -4711,6 +4714,26 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int cgroup_attach_permissions(struct cgroup *src_cgrp,
|
||||||
|
struct cgroup *dst_cgrp,
|
||||||
|
struct super_block *sb, bool threadgroup)
|
||||||
|
{
|
||||||
|
int ret = 0;
|
||||||
|
|
||||||
|
ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb);
|
||||||
|
if (ret)
|
||||||
|
return ret;
|
||||||
|
|
||||||
|
ret = cgroup_migrate_vet_dst(dst_cgrp);
|
||||||
|
if (ret)
|
||||||
|
return ret;
|
||||||
|
|
||||||
|
if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp))
|
||||||
|
ret = -EOPNOTSUPP;
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
|
static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
|
||||||
char *buf, size_t nbytes, loff_t off)
|
char *buf, size_t nbytes, loff_t off)
|
||||||
{
|
{
|
||||||
|
@ -4733,8 +4756,8 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
|
||||||
src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
|
src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
|
||||||
spin_unlock_irq(&css_set_lock);
|
spin_unlock_irq(&css_set_lock);
|
||||||
|
|
||||||
ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
|
ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
|
||||||
of->file->f_path.dentry->d_sb);
|
of->file->f_path.dentry->d_sb, true);
|
||||||
if (ret)
|
if (ret)
|
||||||
goto out_finish;
|
goto out_finish;
|
||||||
|
|
||||||
|
@ -4778,16 +4801,11 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
|
||||||
spin_unlock_irq(&css_set_lock);
|
spin_unlock_irq(&css_set_lock);
|
||||||
|
|
||||||
/* thread migrations follow the cgroup.procs delegation rule */
|
/* thread migrations follow the cgroup.procs delegation rule */
|
||||||
ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
|
ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
|
||||||
of->file->f_path.dentry->d_sb);
|
of->file->f_path.dentry->d_sb, false);
|
||||||
if (ret)
|
if (ret)
|
||||||
goto out_finish;
|
goto out_finish;
|
||||||
|
|
||||||
/* and must be contained in the same domain */
|
|
||||||
ret = -EOPNOTSUPP;
|
|
||||||
if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)
|
|
||||||
goto out_finish;
|
|
||||||
|
|
||||||
ret = cgroup_attach_task(dst_cgrp, task, false);
|
ret = cgroup_attach_task(dst_cgrp, task, false);
|
||||||
|
|
||||||
out_finish:
|
out_finish:
|
||||||
|
@ -5876,8 +5894,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
|
||||||
* @child: pointer to task_struct of forking parent process.
|
* @child: pointer to task_struct of forking parent process.
|
||||||
*
|
*
|
||||||
* A task is associated with the init_css_set until cgroup_post_fork()
|
* A task is associated with the init_css_set until cgroup_post_fork()
|
||||||
* attaches it to the parent's css_set. Empty cg_list indicates that
|
* attaches it to the target css_set.
|
||||||
* @child isn't holding reference to its css_set.
|
|
||||||
*/
|
*/
|
||||||
void cgroup_fork(struct task_struct *child)
|
void cgroup_fork(struct task_struct *child)
|
||||||
{
|
{
|
||||||
|
@ -5885,21 +5902,172 @@ void cgroup_fork(struct task_struct *child)
|
||||||
INIT_LIST_HEAD(&child->cg_list);
|
INIT_LIST_HEAD(&child->cg_list);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static struct cgroup *cgroup_get_from_file(struct file *f)
|
||||||
|
{
|
||||||
|
struct cgroup_subsys_state *css;
|
||||||
|
struct cgroup *cgrp;
|
||||||
|
|
||||||
|
css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
|
||||||
|
if (IS_ERR(css))
|
||||||
|
return ERR_CAST(css);
|
||||||
|
|
||||||
|
cgrp = css->cgroup;
|
||||||
|
if (!cgroup_on_dfl(cgrp)) {
|
||||||
|
cgroup_put(cgrp);
|
||||||
|
return ERR_PTR(-EBADF);
|
||||||
|
}
|
||||||
|
|
||||||
|
return cgrp;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* cgroup_css_set_fork - find or create a css_set for a child process
|
||||||
|
* @kargs: the arguments passed to create the child process
|
||||||
|
*
|
||||||
|
* This function finds or creates a new css_set which the child
|
||||||
|
* process will be attached to in cgroup_post_fork(). By default,
|
||||||
|
* the child process will be given the same css_set as its parent.
|
||||||
|
*
|
||||||
|
* If CLONE_INTO_CGROUP is specified this function will try to find an
|
||||||
|
* existing css_set which includes the requested cgroup and if not create
|
||||||
|
* a new css_set that the child will be attached to later. If this function
|
||||||
|
* succeeds it will hold cgroup_threadgroup_rwsem on return. If
|
||||||
|
* CLONE_INTO_CGROUP is requested this function will grab cgroup mutex
|
||||||
|
* before grabbing cgroup_threadgroup_rwsem and will hold a reference
|
||||||
|
* to the target cgroup.
|
||||||
|
*/
|
||||||
|
static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
|
||||||
|
__acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
|
||||||
|
{
|
||||||
|
int ret;
|
||||||
|
struct cgroup *dst_cgrp = NULL;
|
||||||
|
struct css_set *cset;
|
||||||
|
struct super_block *sb;
|
||||||
|
struct file *f;
|
||||||
|
|
||||||
|
if (kargs->flags & CLONE_INTO_CGROUP)
|
||||||
|
mutex_lock(&cgroup_mutex);
|
||||||
|
|
||||||
|
cgroup_threadgroup_change_begin(current);
|
||||||
|
|
||||||
|
spin_lock_irq(&css_set_lock);
|
||||||
|
cset = task_css_set(current);
|
||||||
|
get_css_set(cset);
|
||||||
|
spin_unlock_irq(&css_set_lock);
|
||||||
|
|
||||||
|
if (!(kargs->flags & CLONE_INTO_CGROUP)) {
|
||||||
|
kargs->cset = cset;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
f = fget_raw(kargs->cgroup);
|
||||||
|
if (!f) {
|
||||||
|
ret = -EBADF;
|
||||||
|
goto err;
|
||||||
|
}
|
||||||
|
sb = f->f_path.dentry->d_sb;
|
||||||
|
|
||||||
|
dst_cgrp = cgroup_get_from_file(f);
|
||||||
|
if (IS_ERR(dst_cgrp)) {
|
||||||
|
ret = PTR_ERR(dst_cgrp);
|
||||||
|
dst_cgrp = NULL;
|
||||||
|
goto err;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cgroup_is_dead(dst_cgrp)) {
|
||||||
|
ret = -ENODEV;
|
||||||
|
goto err;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Verify that the target cgroup is writable for us. This is
|
||||||
|
* usually done by the vfs layer but since we're not going through
|
||||||
|
* the vfs layer here we need to do it "manually".
|
||||||
|
*/
|
||||||
|
ret = cgroup_may_write(dst_cgrp, sb);
|
||||||
|
if (ret)
|
||||||
|
goto err;
|
||||||
|
|
||||||
|
ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
|
||||||
|
!(kargs->flags & CLONE_THREAD));
|
||||||
|
if (ret)
|
||||||
|
goto err;
|
||||||
|
|
||||||
|
kargs->cset = find_css_set(cset, dst_cgrp);
|
||||||
|
if (!kargs->cset) {
|
||||||
|
ret = -ENOMEM;
|
||||||
|
goto err;
|
||||||
|
}
|
||||||
|
|
||||||
|
put_css_set(cset);
|
||||||
|
fput(f);
|
||||||
|
kargs->cgrp = dst_cgrp;
|
||||||
|
return ret;
|
||||||
|
|
||||||
|
err:
|
||||||
|
cgroup_threadgroup_change_end(current);
|
||||||
|
mutex_unlock(&cgroup_mutex);
|
||||||
|
if (f)
|
||||||
|
fput(f);
|
||||||
|
if (dst_cgrp)
|
||||||
|
cgroup_put(dst_cgrp);
|
||||||
|
put_css_set(cset);
|
||||||
|
if (kargs->cset)
|
||||||
|
put_css_set(kargs->cset);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* cgroup_css_set_put_fork - drop references we took during fork
|
||||||
|
* @kargs: the arguments passed to create the child process
|
||||||
|
*
|
||||||
|
* Drop references to the prepared css_set and target cgroup if
|
||||||
|
* CLONE_INTO_CGROUP was requested.
|
||||||
|
*/
|
||||||
|
static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
|
||||||
|
__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
|
||||||
|
{
|
||||||
|
cgroup_threadgroup_change_end(current);
|
||||||
|
|
||||||
|
if (kargs->flags & CLONE_INTO_CGROUP) {
|
||||||
|
struct cgroup *cgrp = kargs->cgrp;
|
||||||
|
struct css_set *cset = kargs->cset;
|
||||||
|
|
||||||
|
mutex_unlock(&cgroup_mutex);
|
||||||
|
|
||||||
|
if (cset) {
|
||||||
|
put_css_set(cset);
|
||||||
|
kargs->cset = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cgrp) {
|
||||||
|
cgroup_put(cgrp);
|
||||||
|
kargs->cgrp = NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* cgroup_can_fork - called on a new task before the process is exposed
|
* cgroup_can_fork - called on a new task before the process is exposed
|
||||||
* @child: the task in question.
|
* @child: the child process
|
||||||
*
|
*
|
||||||
* This calls the subsystem can_fork() callbacks. If the can_fork() callback
|
* This prepares a new css_set for the child process which the child will
|
||||||
* returns an error, the fork aborts with that error code. This allows for
|
* be attached to in cgroup_post_fork().
|
||||||
* a cgroup subsystem to conditionally allow or deny new forks.
|
* This calls the subsystem can_fork() callbacks. If the cgroup_can_fork()
|
||||||
|
* callback returns an error, the fork aborts with that error code. This
|
||||||
|
* allows for a cgroup subsystem to conditionally allow or deny new forks.
|
||||||
*/
|
*/
|
||||||
int cgroup_can_fork(struct task_struct *child)
|
int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
|
||||||
{
|
{
|
||||||
struct cgroup_subsys *ss;
|
struct cgroup_subsys *ss;
|
||||||
int i, j, ret;
|
int i, j, ret;
|
||||||
|
|
||||||
|
ret = cgroup_css_set_fork(kargs);
|
||||||
|
if (ret)
|
||||||
|
return ret;
|
||||||
|
|
||||||
do_each_subsys_mask(ss, i, have_canfork_callback) {
|
do_each_subsys_mask(ss, i, have_canfork_callback) {
|
||||||
ret = ss->can_fork(child);
|
ret = ss->can_fork(child, kargs->cset);
|
||||||
if (ret)
|
if (ret)
|
||||||
goto out_revert;
|
goto out_revert;
|
||||||
} while_each_subsys_mask();
|
} while_each_subsys_mask();
|
||||||
|
@ -5911,54 +6079,64 @@ int cgroup_can_fork(struct task_struct *child)
|
||||||
if (j >= i)
|
if (j >= i)
|
||||||
break;
|
break;
|
||||||
if (ss->cancel_fork)
|
if (ss->cancel_fork)
|
||||||
ss->cancel_fork(child);
|
ss->cancel_fork(child, kargs->cset);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
cgroup_css_set_put_fork(kargs);
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
|
* cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
|
||||||
* @child: the task in question
|
* @child: the child process
|
||||||
|
* @kargs: the arguments passed to create the child process
|
||||||
*
|
*
|
||||||
* This calls the cancel_fork() callbacks if a fork failed *after*
|
* This calls the cancel_fork() callbacks if a fork failed *after*
|
||||||
* cgroup_can_fork() succeeded.
|
* cgroup_can_fork() succeeded and cleans up references we took to
|
||||||
|
* prepare a new css_set for the child process in cgroup_can_fork().
|
||||||
*/
|
*/
|
||||||
void cgroup_cancel_fork(struct task_struct *child)
|
void cgroup_cancel_fork(struct task_struct *child,
|
||||||
|
struct kernel_clone_args *kargs)
|
||||||
{
|
{
|
||||||
struct cgroup_subsys *ss;
|
struct cgroup_subsys *ss;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
for_each_subsys(ss, i)
|
for_each_subsys(ss, i)
|
||||||
if (ss->cancel_fork)
|
if (ss->cancel_fork)
|
||||||
ss->cancel_fork(child);
|
ss->cancel_fork(child, kargs->cset);
|
||||||
|
|
||||||
|
cgroup_css_set_put_fork(kargs);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* cgroup_post_fork - called on a new task after adding it to the task list
|
* cgroup_post_fork - finalize cgroup setup for the child process
|
||||||
* @child: the task in question
|
* @child: the child process
|
||||||
*
|
*
|
||||||
* Adds the task to the list running through its css_set if necessary and
|
* Attach the child process to its css_set calling the subsystem fork()
|
||||||
* call the subsystem fork() callbacks. Has to be after the task is
|
* callbacks.
|
||||||
* visible on the task list in case we race with the first call to
|
|
||||||
* cgroup_task_iter_start() - to guarantee that the new task ends up on its
|
|
||||||
* list.
|
|
||||||
*/
|
*/
|
||||||
void cgroup_post_fork(struct task_struct *child)
|
void cgroup_post_fork(struct task_struct *child,
|
||||||
|
struct kernel_clone_args *kargs)
|
||||||
|
__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
|
||||||
{
|
{
|
||||||
struct cgroup_subsys *ss;
|
struct cgroup_subsys *ss;
|
||||||
struct css_set *cset;
|
struct css_set *cset;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
|
cset = kargs->cset;
|
||||||
|
kargs->cset = NULL;
|
||||||
|
|
||||||
spin_lock_irq(&css_set_lock);
|
spin_lock_irq(&css_set_lock);
|
||||||
|
|
||||||
/* init tasks are special, only link regular threads */
|
/* init tasks are special, only link regular threads */
|
||||||
if (likely(child->pid)) {
|
if (likely(child->pid)) {
|
||||||
WARN_ON_ONCE(!list_empty(&child->cg_list));
|
WARN_ON_ONCE(!list_empty(&child->cg_list));
|
||||||
cset = task_css_set(current); /* current is @child's parent */
|
|
||||||
get_css_set(cset);
|
|
||||||
cset->nr_tasks++;
|
cset->nr_tasks++;
|
||||||
css_set_move_task(child, NULL, cset, false);
|
css_set_move_task(child, NULL, cset, false);
|
||||||
|
} else {
|
||||||
|
put_css_set(cset);
|
||||||
|
cset = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -5990,6 +6168,17 @@ void cgroup_post_fork(struct task_struct *child)
|
||||||
do_each_subsys_mask(ss, i, have_fork_callback) {
|
do_each_subsys_mask(ss, i, have_fork_callback) {
|
||||||
ss->fork(child);
|
ss->fork(child);
|
||||||
} while_each_subsys_mask();
|
} while_each_subsys_mask();
|
||||||
|
|
||||||
|
/* Make the new cset the root_cset of the new cgroup namespace. */
|
||||||
|
if (kargs->flags & CLONE_NEWCGROUP) {
|
||||||
|
struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;
|
||||||
|
|
||||||
|
get_css_set(cset);
|
||||||
|
child->nsproxy->cgroup_ns->root_cset = cset;
|
||||||
|
put_css_set(rcset);
|
||||||
|
}
|
||||||
|
|
||||||
|
cgroup_css_set_put_fork(kargs);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -6176,7 +6365,6 @@ EXPORT_SYMBOL_GPL(cgroup_get_from_path);
|
||||||
*/
|
*/
|
||||||
struct cgroup *cgroup_get_from_fd(int fd)
|
struct cgroup *cgroup_get_from_fd(int fd)
|
||||||
{
|
{
|
||||||
struct cgroup_subsys_state *css;
|
|
||||||
struct cgroup *cgrp;
|
struct cgroup *cgrp;
|
||||||
struct file *f;
|
struct file *f;
|
||||||
|
|
||||||
|
@ -6184,17 +6372,8 @@ struct cgroup *cgroup_get_from_fd(int fd)
|
||||||
if (!f)
|
if (!f)
|
||||||
return ERR_PTR(-EBADF);
|
return ERR_PTR(-EBADF);
|
||||||
|
|
||||||
css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
|
cgrp = cgroup_get_from_file(f);
|
||||||
fput(f);
|
fput(f);
|
||||||
if (IS_ERR(css))
|
|
||||||
return ERR_CAST(css);
|
|
||||||
|
|
||||||
cgrp = css->cgroup;
|
|
||||||
if (!cgroup_on_dfl(cgrp)) {
|
|
||||||
cgroup_put(cgrp);
|
|
||||||
return ERR_PTR(-EBADF);
|
|
||||||
}
|
|
||||||
|
|
||||||
return cgrp;
|
return cgrp;
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
|
EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
|
||||||
|
|
|
@ -358,8 +358,12 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
|
||||||
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
|
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Cgroup v2 behavior is used when on default hierarchy or the
|
* Cgroup v2 behavior is used on the "cpus" and "mems" control files when
|
||||||
* cgroup_v2_mode flag is set.
|
* on default hierarchy or when the cpuset_v2_mode flag is set by mounting
|
||||||
|
* the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option.
|
||||||
|
* With v2 behavior, "cpus" and "mems" are always what the users have
|
||||||
|
* requested and won't be changed by hotplug events. Only the effective
|
||||||
|
* cpus or mems will be affected.
|
||||||
*/
|
*/
|
||||||
static inline bool is_in_v2_mode(void)
|
static inline bool is_in_v2_mode(void)
|
||||||
{
|
{
|
||||||
|
|
|
@ -33,6 +33,7 @@
|
||||||
#include <linux/atomic.h>
|
#include <linux/atomic.h>
|
||||||
#include <linux/cgroup.h>
|
#include <linux/cgroup.h>
|
||||||
#include <linux/slab.h>
|
#include <linux/slab.h>
|
||||||
|
#include <linux/sched/task.h>
|
||||||
|
|
||||||
#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
|
#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
|
||||||
#define PIDS_MAX_STR "max"
|
#define PIDS_MAX_STR "max"
|
||||||
|
@ -214,13 +215,16 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
|
||||||
* task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
|
* task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
|
||||||
* on cgroup_threadgroup_change_begin() held by the copy_process().
|
* on cgroup_threadgroup_change_begin() held by the copy_process().
|
||||||
*/
|
*/
|
||||||
static int pids_can_fork(struct task_struct *task)
|
static int pids_can_fork(struct task_struct *task, struct css_set *cset)
|
||||||
{
|
{
|
||||||
struct cgroup_subsys_state *css;
|
struct cgroup_subsys_state *css;
|
||||||
struct pids_cgroup *pids;
|
struct pids_cgroup *pids;
|
||||||
int err;
|
int err;
|
||||||
|
|
||||||
css = task_css_check(current, pids_cgrp_id, true);
|
if (cset)
|
||||||
|
css = cset->subsys[pids_cgrp_id];
|
||||||
|
else
|
||||||
|
css = task_css_check(current, pids_cgrp_id, true);
|
||||||
pids = css_pids(css);
|
pids = css_pids(css);
|
||||||
err = pids_try_charge(pids, 1);
|
err = pids_try_charge(pids, 1);
|
||||||
if (err) {
|
if (err) {
|
||||||
|
@ -235,12 +239,15 @@ static int pids_can_fork(struct task_struct *task)
|
||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void pids_cancel_fork(struct task_struct *task)
|
static void pids_cancel_fork(struct task_struct *task, struct css_set *cset)
|
||||||
{
|
{
|
||||||
struct cgroup_subsys_state *css;
|
struct cgroup_subsys_state *css;
|
||||||
struct pids_cgroup *pids;
|
struct pids_cgroup *pids;
|
||||||
|
|
||||||
css = task_css_check(current, pids_cgrp_id, true);
|
if (cset)
|
||||||
|
css = cset->subsys[pids_cgrp_id];
|
||||||
|
else
|
||||||
|
css = task_css_check(current, pids_cgrp_id, true);
|
||||||
pids = css_pids(css);
|
pids = css_pids(css);
|
||||||
pids_uncharge(pids, 1);
|
pids_uncharge(pids, 1);
|
||||||
}
|
}
|
||||||
|
|
|
@ -2176,16 +2176,15 @@ static __latent_entropy struct task_struct *copy_process(
|
||||||
INIT_LIST_HEAD(&p->thread_group);
|
INIT_LIST_HEAD(&p->thread_group);
|
||||||
p->task_works = NULL;
|
p->task_works = NULL;
|
||||||
|
|
||||||
cgroup_threadgroup_change_begin(current);
|
|
||||||
/*
|
/*
|
||||||
* Ensure that the cgroup subsystem policies allow the new process to be
|
* Ensure that the cgroup subsystem policies allow the new process to be
|
||||||
* forked. It should be noted that the new process's css_set can be changed
|
* forked. It should be noted that the new process's css_set can be changed
|
||||||
* between here and cgroup_post_fork() if an organisation operation is in
|
* between here and cgroup_post_fork() if an organisation operation is in
|
||||||
* progress.
|
* progress.
|
||||||
*/
|
*/
|
||||||
retval = cgroup_can_fork(p);
|
retval = cgroup_can_fork(p, args);
|
||||||
if (retval)
|
if (retval)
|
||||||
goto bad_fork_cgroup_threadgroup_change_end;
|
goto bad_fork_put_pidfd;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* From this point on we must avoid any synchronous user-space
|
* From this point on we must avoid any synchronous user-space
|
||||||
|
@ -2290,8 +2289,7 @@ static __latent_entropy struct task_struct *copy_process(
|
||||||
write_unlock_irq(&tasklist_lock);
|
write_unlock_irq(&tasklist_lock);
|
||||||
|
|
||||||
proc_fork_connector(p);
|
proc_fork_connector(p);
|
||||||
cgroup_post_fork(p);
|
cgroup_post_fork(p, args);
|
||||||
cgroup_threadgroup_change_end(current);
|
|
||||||
perf_event_fork(p);
|
perf_event_fork(p);
|
||||||
|
|
||||||
trace_task_newtask(p, clone_flags);
|
trace_task_newtask(p, clone_flags);
|
||||||
|
@ -2302,9 +2300,7 @@ static __latent_entropy struct task_struct *copy_process(
|
||||||
bad_fork_cancel_cgroup:
|
bad_fork_cancel_cgroup:
|
||||||
spin_unlock(&current->sighand->siglock);
|
spin_unlock(&current->sighand->siglock);
|
||||||
write_unlock_irq(&tasklist_lock);
|
write_unlock_irq(&tasklist_lock);
|
||||||
cgroup_cancel_fork(p);
|
cgroup_cancel_fork(p, args);
|
||||||
bad_fork_cgroup_threadgroup_change_end:
|
|
||||||
cgroup_threadgroup_change_end(current);
|
|
||||||
bad_fork_put_pidfd:
|
bad_fork_put_pidfd:
|
||||||
if (clone_flags & CLONE_PIDFD) {
|
if (clone_flags & CLONE_PIDFD) {
|
||||||
fput(pidfile);
|
fput(pidfile);
|
||||||
|
@ -2633,6 +2629,9 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
|
||||||
!valid_signal(args.exit_signal)))
|
!valid_signal(args.exit_signal)))
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
|
|
||||||
|
if ((args.flags & CLONE_INTO_CGROUP) && args.cgroup < 0)
|
||||||
|
return -EINVAL;
|
||||||
|
|
||||||
*kargs = (struct kernel_clone_args){
|
*kargs = (struct kernel_clone_args){
|
||||||
.flags = args.flags,
|
.flags = args.flags,
|
||||||
.pidfd = u64_to_user_ptr(args.pidfd),
|
.pidfd = u64_to_user_ptr(args.pidfd),
|
||||||
|
@ -2643,6 +2642,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
|
||||||
.stack_size = args.stack_size,
|
.stack_size = args.stack_size,
|
||||||
.tls = args.tls,
|
.tls = args.tls,
|
||||||
.set_tid_size = args.set_tid_size,
|
.set_tid_size = args.set_tid_size,
|
||||||
|
.cgroup = args.cgroup,
|
||||||
};
|
};
|
||||||
|
|
||||||
if (args.set_tid &&
|
if (args.set_tid &&
|
||||||
|
@ -2686,7 +2686,8 @@ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
|
||||||
static bool clone3_args_valid(struct kernel_clone_args *kargs)
|
static bool clone3_args_valid(struct kernel_clone_args *kargs)
|
||||||
{
|
{
|
||||||
/* Verify that no unknown flags are passed along. */
|
/* Verify that no unknown flags are passed along. */
|
||||||
if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND))
|
if (kargs->flags &
|
||||||
|
~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -3243,7 +3243,7 @@ static int shmem_xattr_handler_set(const struct xattr_handler *handler,
|
||||||
struct shmem_inode_info *info = SHMEM_I(inode);
|
struct shmem_inode_info *info = SHMEM_I(inode);
|
||||||
|
|
||||||
name = xattr_full_name(handler, name);
|
name = xattr_full_name(handler, name);
|
||||||
return simple_xattr_set(&info->xattrs, name, value, size, flags);
|
return simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
static const struct xattr_handler shmem_security_xattr_handler = {
|
static const struct xattr_handler shmem_security_xattr_handler = {
|
||||||
|
|
|
@ -11,6 +11,6 @@ TEST_GEN_PROGS += test_freezer
|
||||||
|
|
||||||
include ../lib.mk
|
include ../lib.mk
|
||||||
|
|
||||||
$(OUTPUT)/test_memcontrol: cgroup_util.c
|
$(OUTPUT)/test_memcontrol: cgroup_util.c ../clone3/clone3_selftests.h
|
||||||
$(OUTPUT)/test_core: cgroup_util.c
|
$(OUTPUT)/test_core: cgroup_util.c ../clone3/clone3_selftests.h
|
||||||
$(OUTPUT)/test_freezer: cgroup_util.c
|
$(OUTPUT)/test_freezer: cgroup_util.c ../clone3/clone3_selftests.h
|
||||||
|
|
|
@ -15,6 +15,7 @@
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
|
||||||
#include "cgroup_util.h"
|
#include "cgroup_util.h"
|
||||||
|
#include "../clone3/clone3_selftests.h"
|
||||||
|
|
||||||
static ssize_t read_text(const char *path, char *buf, size_t max_len)
|
static ssize_t read_text(const char *path, char *buf, size_t max_len)
|
||||||
{
|
{
|
||||||
|
@ -331,12 +332,112 @@ int cg_run(const char *cgroup,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pid_t clone_into_cgroup(int cgroup_fd)
|
||||||
|
{
|
||||||
|
#ifdef CLONE_ARGS_SIZE_VER2
|
||||||
|
pid_t pid;
|
||||||
|
|
||||||
|
struct clone_args args = {
|
||||||
|
.flags = CLONE_INTO_CGROUP,
|
||||||
|
.exit_signal = SIGCHLD,
|
||||||
|
.cgroup = cgroup_fd,
|
||||||
|
};
|
||||||
|
|
||||||
|
pid = sys_clone3(&args, sizeof(struct clone_args));
|
||||||
|
/*
|
||||||
|
* Verify that this is a genuine test failure:
|
||||||
|
* ENOSYS -> clone3() not available
|
||||||
|
* E2BIG -> CLONE_INTO_CGROUP not available
|
||||||
|
*/
|
||||||
|
if (pid < 0 && (errno == ENOSYS || errno == E2BIG))
|
||||||
|
goto pretend_enosys;
|
||||||
|
|
||||||
|
return pid;
|
||||||
|
|
||||||
|
pretend_enosys:
|
||||||
|
#endif
|
||||||
|
errno = ENOSYS;
|
||||||
|
return -ENOSYS;
|
||||||
|
}
|
||||||
|
|
||||||
|
int clone_reap(pid_t pid, int options)
|
||||||
|
{
|
||||||
|
int ret;
|
||||||
|
siginfo_t info = {
|
||||||
|
.si_signo = 0,
|
||||||
|
};
|
||||||
|
|
||||||
|
again:
|
||||||
|
ret = waitid(P_PID, pid, &info, options | __WALL | __WNOTHREAD);
|
||||||
|
if (ret < 0) {
|
||||||
|
if (errno == EINTR)
|
||||||
|
goto again;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (options & WEXITED) {
|
||||||
|
if (WIFEXITED(info.si_status))
|
||||||
|
return WEXITSTATUS(info.si_status);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (options & WSTOPPED) {
|
||||||
|
if (WIFSTOPPED(info.si_status))
|
||||||
|
return WSTOPSIG(info.si_status);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (options & WCONTINUED) {
|
||||||
|
if (WIFCONTINUED(info.si_status))
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int dirfd_open_opath(const char *dir)
|
||||||
|
{
|
||||||
|
return open(dir, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW | O_PATH);
|
||||||
|
}
|
||||||
|
|
||||||
|
#define close_prot_errno(fd) \
|
||||||
|
if (fd >= 0) { \
|
||||||
|
int _e_ = errno; \
|
||||||
|
close(fd); \
|
||||||
|
errno = _e_; \
|
||||||
|
}
|
||||||
|
|
||||||
|
static int clone_into_cgroup_run_nowait(const char *cgroup,
|
||||||
|
int (*fn)(const char *cgroup, void *arg),
|
||||||
|
void *arg)
|
||||||
|
{
|
||||||
|
int cgroup_fd;
|
||||||
|
pid_t pid;
|
||||||
|
|
||||||
|
cgroup_fd = dirfd_open_opath(cgroup);
|
||||||
|
if (cgroup_fd < 0)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
pid = clone_into_cgroup(cgroup_fd);
|
||||||
|
close_prot_errno(cgroup_fd);
|
||||||
|
if (pid == 0)
|
||||||
|
exit(fn(cgroup, arg));
|
||||||
|
|
||||||
|
return pid;
|
||||||
|
}
|
||||||
|
|
||||||
int cg_run_nowait(const char *cgroup,
|
int cg_run_nowait(const char *cgroup,
|
||||||
int (*fn)(const char *cgroup, void *arg),
|
int (*fn)(const char *cgroup, void *arg),
|
||||||
void *arg)
|
void *arg)
|
||||||
{
|
{
|
||||||
int pid;
|
int pid;
|
||||||
|
|
||||||
|
pid = clone_into_cgroup_run_nowait(cgroup, fn, arg);
|
||||||
|
if (pid > 0)
|
||||||
|
return pid;
|
||||||
|
|
||||||
|
/* Genuine test failure. */
|
||||||
|
if (pid < 0 && errno != ENOSYS)
|
||||||
|
return -1;
|
||||||
|
|
||||||
pid = fork();
|
pid = fork();
|
||||||
if (pid == 0) {
|
if (pid == 0) {
|
||||||
char buf[64];
|
char buf[64];
|
||||||
|
@ -450,3 +551,28 @@ int proc_read_strstr(int pid, bool thread, const char *item, const char *needle)
|
||||||
|
|
||||||
return strstr(buf, needle) ? 0 : -1;
|
return strstr(buf, needle) ? 0 : -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int clone_into_cgroup_run_wait(const char *cgroup)
|
||||||
|
{
|
||||||
|
int cgroup_fd;
|
||||||
|
pid_t pid;
|
||||||
|
|
||||||
|
cgroup_fd = dirfd_open_opath(cgroup);
|
||||||
|
if (cgroup_fd < 0)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
pid = clone_into_cgroup(cgroup_fd);
|
||||||
|
close_prot_errno(cgroup_fd);
|
||||||
|
if (pid < 0)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
if (pid == 0)
|
||||||
|
exit(EXIT_SUCCESS);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We don't care whether this fails. We only care whether the initial
|
||||||
|
* clone succeeded.
|
||||||
|
*/
|
||||||
|
(void)clone_reap(pid, WEXITED);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
|
@ -50,3 +50,7 @@ extern int cg_wait_for_proc_count(const char *cgroup, int count);
|
||||||
extern int cg_killall(const char *cgroup);
|
extern int cg_killall(const char *cgroup);
|
||||||
extern ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size);
|
extern ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size);
|
||||||
extern int proc_read_strstr(int pid, bool thread, const char *item, const char *needle);
|
extern int proc_read_strstr(int pid, bool thread, const char *item, const char *needle);
|
||||||
|
extern pid_t clone_into_cgroup(int cgroup_fd);
|
||||||
|
extern int clone_reap(pid_t pid, int options);
|
||||||
|
extern int clone_into_cgroup_run_wait(const char *cgroup);
|
||||||
|
extern int dirfd_open_opath(const char *dir);
|
||||||
|
|
|
@ -2,7 +2,10 @@
|
||||||
|
|
||||||
#include <linux/limits.h>
|
#include <linux/limits.h>
|
||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
|
#include <sys/mman.h>
|
||||||
|
#include <sys/wait.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
#include <fcntl.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
#include <signal.h>
|
#include <signal.h>
|
||||||
|
@ -12,6 +15,115 @@
|
||||||
#include "../kselftest.h"
|
#include "../kselftest.h"
|
||||||
#include "cgroup_util.h"
|
#include "cgroup_util.h"
|
||||||
|
|
||||||
|
static int touch_anon(char *buf, size_t size)
|
||||||
|
{
|
||||||
|
int fd;
|
||||||
|
char *pos = buf;
|
||||||
|
|
||||||
|
fd = open("/dev/urandom", O_RDONLY);
|
||||||
|
if (fd < 0)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
while (size > 0) {
|
||||||
|
ssize_t ret = read(fd, pos, size);
|
||||||
|
|
||||||
|
if (ret < 0) {
|
||||||
|
if (errno != EINTR) {
|
||||||
|
close(fd);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
pos += ret;
|
||||||
|
size -= ret;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
close(fd);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int alloc_and_touch_anon_noexit(const char *cgroup, void *arg)
|
||||||
|
{
|
||||||
|
int ppid = getppid();
|
||||||
|
size_t size = (size_t)arg;
|
||||||
|
void *buf;
|
||||||
|
|
||||||
|
buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
|
||||||
|
0, 0);
|
||||||
|
if (buf == MAP_FAILED)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
if (touch_anon((char *)buf, size)) {
|
||||||
|
munmap(buf, size);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
while (getppid() == ppid)
|
||||||
|
sleep(1);
|
||||||
|
|
||||||
|
munmap(buf, size);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Create a child process that allocates and touches 100MB, then waits to be
|
||||||
|
* killed. Wait until the child is attached to the cgroup, kill all processes
|
||||||
|
* in that cgroup and wait until "cgroup.procs" is empty. At this point try to
|
||||||
|
* destroy the empty cgroup. The test helps detect race conditions between
|
||||||
|
* dying processes leaving the cgroup and cgroup destruction path.
|
||||||
|
*/
|
||||||
|
static int test_cgcore_destroy(const char *root)
|
||||||
|
{
|
||||||
|
int ret = KSFT_FAIL;
|
||||||
|
char *cg_test = NULL;
|
||||||
|
int child_pid;
|
||||||
|
char buf[PAGE_SIZE];
|
||||||
|
|
||||||
|
cg_test = cg_name(root, "cg_test");
|
||||||
|
|
||||||
|
if (!cg_test)
|
||||||
|
goto cleanup;
|
||||||
|
|
||||||
|
for (int i = 0; i < 10; i++) {
|
||||||
|
if (cg_create(cg_test))
|
||||||
|
goto cleanup;
|
||||||
|
|
||||||
|
child_pid = cg_run_nowait(cg_test, alloc_and_touch_anon_noexit,
|
||||||
|
(void *) MB(100));
|
||||||
|
|
||||||
|
if (child_pid < 0)
|
||||||
|
goto cleanup;
|
||||||
|
|
||||||
|
/* wait for the child to enter cgroup */
|
||||||
|
if (cg_wait_for_proc_count(cg_test, 1))
|
||||||
|
goto cleanup;
|
||||||
|
|
||||||
|
if (cg_killall(cg_test))
|
||||||
|
goto cleanup;
|
||||||
|
|
||||||
|
/* wait for cgroup to be empty */
|
||||||
|
while (1) {
|
||||||
|
if (cg_read(cg_test, "cgroup.procs", buf, sizeof(buf)))
|
||||||
|
goto cleanup;
|
||||||
|
if (buf[0] == '\0')
|
||||||
|
break;
|
||||||
|
usleep(1000);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (rmdir(cg_test))
|
||||||
|
goto cleanup;
|
||||||
|
|
||||||
|
if (waitpid(child_pid, NULL, 0) < 0)
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
ret = KSFT_PASS;
|
||||||
|
cleanup:
|
||||||
|
if (cg_test)
|
||||||
|
cg_destroy(cg_test);
|
||||||
|
free(cg_test);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* A(0) - B(0) - C(1)
|
* A(0) - B(0) - C(1)
|
||||||
* \ D(0)
|
* \ D(0)
|
||||||
|
@ -25,8 +137,11 @@
|
||||||
static int test_cgcore_populated(const char *root)
|
static int test_cgcore_populated(const char *root)
|
||||||
{
|
{
|
||||||
int ret = KSFT_FAIL;
|
int ret = KSFT_FAIL;
|
||||||
|
int err;
|
||||||
char *cg_test_a = NULL, *cg_test_b = NULL;
|
char *cg_test_a = NULL, *cg_test_b = NULL;
|
||||||
char *cg_test_c = NULL, *cg_test_d = NULL;
|
char *cg_test_c = NULL, *cg_test_d = NULL;
|
||||||
|
int cgroup_fd = -EBADF;
|
||||||
|
pid_t pid;
|
||||||
|
|
||||||
cg_test_a = cg_name(root, "cg_test_a");
|
cg_test_a = cg_name(root, "cg_test_a");
|
||||||
cg_test_b = cg_name(root, "cg_test_a/cg_test_b");
|
cg_test_b = cg_name(root, "cg_test_a/cg_test_b");
|
||||||
|
@ -78,6 +193,52 @@ static int test_cgcore_populated(const char *root)
|
||||||
if (cg_read_strcmp(cg_test_d, "cgroup.events", "populated 0\n"))
|
if (cg_read_strcmp(cg_test_d, "cgroup.events", "populated 0\n"))
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
|
|
||||||
|
/* Test that we can directly clone into a new cgroup. */
|
||||||
|
cgroup_fd = dirfd_open_opath(cg_test_d);
|
||||||
|
if (cgroup_fd < 0)
|
||||||
|
goto cleanup;
|
||||||
|
|
||||||
|
pid = clone_into_cgroup(cgroup_fd);
|
||||||
|
if (pid < 0) {
|
||||||
|
if (errno == ENOSYS)
|
||||||
|
goto cleanup_pass;
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (pid == 0) {
|
||||||
|
if (raise(SIGSTOP))
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
exit(EXIT_SUCCESS);
|
||||||
|
}
|
||||||
|
|
||||||
|
err = cg_read_strcmp(cg_test_d, "cgroup.events", "populated 1\n");
|
||||||
|
|
||||||
|
(void)clone_reap(pid, WSTOPPED);
|
||||||
|
(void)kill(pid, SIGCONT);
|
||||||
|
(void)clone_reap(pid, WEXITED);
|
||||||
|
|
||||||
|
if (err)
|
||||||
|
goto cleanup;
|
||||||
|
|
||||||
|
if (cg_read_strcmp(cg_test_d, "cgroup.events", "populated 0\n"))
|
||||||
|
goto cleanup;
|
||||||
|
|
||||||
|
/* Remove cgroup. */
|
||||||
|
if (cg_test_d) {
|
||||||
|
cg_destroy(cg_test_d);
|
||||||
|
free(cg_test_d);
|
||||||
|
cg_test_d = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
pid = clone_into_cgroup(cgroup_fd);
|
||||||
|
if (pid < 0)
|
||||||
|
goto cleanup_pass;
|
||||||
|
if (pid == 0)
|
||||||
|
exit(EXIT_SUCCESS);
|
||||||
|
(void)clone_reap(pid, WEXITED);
|
||||||
|
goto cleanup;
|
||||||
|
|
||||||
|
cleanup_pass:
|
||||||
ret = KSFT_PASS;
|
ret = KSFT_PASS;
|
||||||
|
|
||||||
cleanup:
|
cleanup:
|
||||||
|
@ -93,6 +254,8 @@ static int test_cgcore_populated(const char *root)
|
||||||
free(cg_test_c);
|
free(cg_test_c);
|
||||||
free(cg_test_b);
|
free(cg_test_b);
|
||||||
free(cg_test_a);
|
free(cg_test_a);
|
||||||
|
if (cgroup_fd >= 0)
|
||||||
|
close(cgroup_fd);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -136,6 +299,16 @@ static int test_cgcore_invalid_domain(const char *root)
|
||||||
if (errno != EOPNOTSUPP)
|
if (errno != EOPNOTSUPP)
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
|
|
||||||
|
if (!clone_into_cgroup_run_wait(child))
|
||||||
|
goto cleanup;
|
||||||
|
|
||||||
|
if (errno == ENOSYS)
|
||||||
|
goto cleanup_pass;
|
||||||
|
|
||||||
|
if (errno != EOPNOTSUPP)
|
||||||
|
goto cleanup;
|
||||||
|
|
||||||
|
cleanup_pass:
|
||||||
ret = KSFT_PASS;
|
ret = KSFT_PASS;
|
||||||
|
|
||||||
cleanup:
|
cleanup:
|
||||||
|
@ -345,6 +518,9 @@ static int test_cgcore_internal_process_constraint(const char *root)
|
||||||
if (!cg_enter_current(parent))
|
if (!cg_enter_current(parent))
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
|
|
||||||
|
if (!clone_into_cgroup_run_wait(parent))
|
||||||
|
goto cleanup;
|
||||||
|
|
||||||
ret = KSFT_PASS;
|
ret = KSFT_PASS;
|
||||||
|
|
||||||
cleanup:
|
cleanup:
|
||||||
|
@ -512,6 +688,7 @@ struct corecg_test {
|
||||||
T(test_cgcore_populated),
|
T(test_cgcore_populated),
|
||||||
T(test_cgcore_proc_migration),
|
T(test_cgcore_proc_migration),
|
||||||
T(test_cgcore_thread_migration),
|
T(test_cgcore_thread_migration),
|
||||||
|
T(test_cgcore_destroy),
|
||||||
};
|
};
|
||||||
#undef T
|
#undef T
|
||||||
|
|
||||||
|
|
|
@ -5,12 +5,24 @@
|
||||||
|
|
||||||
#define _GNU_SOURCE
|
#define _GNU_SOURCE
|
||||||
#include <sched.h>
|
#include <sched.h>
|
||||||
|
#include <linux/sched.h>
|
||||||
|
#include <linux/types.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <syscall.h>
|
#include <syscall.h>
|
||||||
#include <linux/types.h>
|
#include <sys/wait.h>
|
||||||
|
|
||||||
|
#include "../kselftest.h"
|
||||||
|
|
||||||
#define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr)))
|
#define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr)))
|
||||||
|
|
||||||
|
#ifndef CLONE_INTO_CGROUP
|
||||||
|
#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef CLONE_ARGS_SIZE_VER0
|
||||||
|
#define CLONE_ARGS_SIZE_VER0 64
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef __NR_clone3
|
#ifndef __NR_clone3
|
||||||
#define __NR_clone3 -1
|
#define __NR_clone3 -1
|
||||||
struct clone_args {
|
struct clone_args {
|
||||||
|
@ -22,10 +34,13 @@ struct clone_args {
|
||||||
__aligned_u64 stack;
|
__aligned_u64 stack;
|
||||||
__aligned_u64 stack_size;
|
__aligned_u64 stack_size;
|
||||||
__aligned_u64 tls;
|
__aligned_u64 tls;
|
||||||
|
#define CLONE_ARGS_SIZE_VER1 80
|
||||||
__aligned_u64 set_tid;
|
__aligned_u64 set_tid;
|
||||||
__aligned_u64 set_tid_size;
|
__aligned_u64 set_tid_size;
|
||||||
|
#define CLONE_ARGS_SIZE_VER2 88
|
||||||
|
__aligned_u64 cgroup;
|
||||||
};
|
};
|
||||||
#endif
|
#endif /* __NR_clone3 */
|
||||||
|
|
||||||
static pid_t sys_clone3(struct clone_args *args, size_t size)
|
static pid_t sys_clone3(struct clone_args *args, size_t size)
|
||||||
{
|
{
|
||||||
|
|
Loading…
Reference in New Issue
Block a user