kernel_optimize_test/fs/xattr.c

1002 lines
24 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/*
File: fs/xattr.c
Extended attribute handling.
Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org>
Copyright (C) 2001 SGI - Silicon Graphics, Inc <linux-xfs@oss.sgi.com>
Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/xattr.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/evm.h>
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/fsnotify.h>
#include <linux/audit.h>
#include <linux/vmalloc.h>
2012-02-08 10:52:57 +08:00
#include <linux/posix_acl_xattr.h>
#include <linux/uaccess.h>
static const char *
strcmp_prefix(const char *a, const char *a_prefix)
{
while (*a_prefix && *a == *a_prefix) {
a++;
a_prefix++;
}
return *a_prefix ? NULL : a;
}
/*
* In order to implement different sets of xattr operations for each xattr
* prefix, a filesystem should create a null-terminated array of struct
* xattr_handler (one for each prefix) and hang a pointer to it off of the
* s_xattr field of the superblock.
*/
#define for_each_xattr_handler(handlers, handler) \
if (handlers) \
for ((handler) = *(handlers)++; \
(handler) != NULL; \
(handler) = *(handlers)++)
/*
* Find the xattr_handler with the matching prefix.
*/
static const struct xattr_handler *
xattr_resolve_name(struct inode *inode, const char **name)
{
const struct xattr_handler **handlers = inode->i_sb->s_xattr;
const struct xattr_handler *handler;
if (!(inode->i_opflags & IOP_XATTR)) {
if (unlikely(is_bad_inode(inode)))
return ERR_PTR(-EIO);
return ERR_PTR(-EOPNOTSUPP);
}
for_each_xattr_handler(handlers, handler) {
const char *n;
n = strcmp_prefix(*name, xattr_prefix(handler));
if (n) {
if (!handler->prefix ^ !*n) {
if (*n)
continue;
return ERR_PTR(-EINVAL);
}
*name = n;
return handler;
}
}
return ERR_PTR(-EOPNOTSUPP);
}
/*
* Check permissions for extended attribute access. This is a bit complicated
* because different namespaces have very different rules.
*/
static int
xattr_permission(struct inode *inode, const char *name, int mask)
{
/*
* We can never set or remove an extended attribute on a read-only
* filesystem or on an immutable / append-only inode.
*/
if (mask & MAY_WRITE) {
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
return -EPERM;
vfs: Don't modify inodes with a uid or gid unknown to the vfs When a filesystem outside of init_user_ns is mounted it could have uids and gids stored in it that do not map to init_user_ns. The plan is to allow those filesystems to set i_uid to INVALID_UID and i_gid to INVALID_GID for unmapped uids and gids and then to handle that strange case in the vfs to ensure there is consistent robust handling of the weirdness. Upon a careful review of the vfs and filesystems about the only case where there is any possibility of confusion or trouble is when the inode is written back to disk. In that case filesystems typically read the inode->i_uid and inode->i_gid and write them to disk even when just an inode timestamp is being updated. Which leads to a rule that is very simple to implement and understand inodes whose i_uid or i_gid is not valid may not be written. In dealing with access times this means treat those inodes as if the inode flag S_NOATIME was set. Reads of the inodes appear safe and useful, but any write or modification is disallowed. The only inode write that is allowed is a chown that sets the uid and gid on the inode to valid values. After such a chown the inode is normal and may be treated as such. Denying all writes to inodes with uids or gids unknown to the vfs also prevents several oddball cases where corruption would have occurred because the vfs does not have complete information. One problem case that is prevented is attempting to use the gid of a directory for new inodes where the directories sgid bit is set but the directories gid is not mapped. Another problem case avoided is attempting to update the evm hash after setxattr, removexattr, and setattr. As the evm hash includeds the inode->i_uid or inode->i_gid not knowning the uid or gid prevents a correct evm hash from being computed. evm hash verification also fails when i_uid or i_gid is unknown but that is essentially harmless as it does not cause filesystem corruption. Acked-by: Seth Forshee <seth.forshee@canonical.com> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
2016-06-30 03:54:46 +08:00
/*
* Updating an xattr will likely cause i_uid and i_gid
* to be writen back improperly if their true value is
* unknown to the vfs.
*/
if (HAS_UNMAPPED_ID(inode))
return -EPERM;
}
/*
* No restriction for security.* and system.* from the VFS. Decision
* on these is left to the underlying filesystem / security module.
*/
if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) ||
!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
return 0;
/*
* The trusted.* namespace can only be accessed by privileged users.
*/
if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) {
if (!capable(CAP_SYS_ADMIN))
return (mask & MAY_WRITE) ? -EPERM : -ENODATA;
return 0;
}
/*
* In the user.* namespace, only regular files and directories can have
* extended attributes. For sticky directories, only the owner and
* privileged users can write attributes.
*/
if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) {
if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
return (mask & MAY_WRITE) ? -EPERM : -ENODATA;
if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
(mask & MAY_WRITE) && !inode_owner_or_capable(inode))
return -EPERM;
}
return inode_permission(inode, mask);
}
int
__vfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name,
const void *value, size_t size, int flags)
{
const struct xattr_handler *handler;
handler = xattr_resolve_name(inode, &name);
if (IS_ERR(handler))
return PTR_ERR(handler);
if (!handler->set)
return -EOPNOTSUPP;
if (size == 0)
value = ""; /* empty EA, do not remove */
return handler->set(handler, dentry, inode, name, value, size, flags);
}
EXPORT_SYMBOL(__vfs_setxattr);
/**
* __vfs_setxattr_noperm - perform setxattr operation without performing
* permission checks.
*
* @dentry - object to perform setxattr on
* @name - xattr name to set
* @value - value to set @name to
* @size - size of @value
* @flags - flags to pass into filesystem operations
*
* returns the result of the internal setxattr or setsecurity operations.
*
* This function requires the caller to lock the inode's i_mutex before it
* is executed. It also assumes that the caller will make the appropriate
* permission checks.
*/
int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
struct inode *inode = dentry->d_inode;
int error = -EAGAIN;
Cache xattr security drop check for write v2 Some recent benchmarking on btrfs showed that a major scaling bottleneck on large systems on btrfs is currently the xattr lookup on every write. Why xattr lookup on every write I hear you ask? write wants to drop suid and security related xattrs that could set o capabilities for executables. To do that it currently looks up security.capability on EVERY write (even for non executables) to decide whether to drop it or not. In btrfs this causes an additional tree walk, hitting some per file system locks and quite bad scalability. In a simple read workload on a 8S system I saw over 90% CPU time in spinlocks related to that. Chris Mason tells me this is also a problem in ext4, where it hits the global mbcache lock. This patch adds a simple per inode to avoid this problem. We only do the lookup once per file and then if there is no xattr cache the decision. All xattr changes clear the flag. I also used the same flag to avoid the suid check, although that one is pretty cheap. A file system can also set this flag when it creates the inode, if it has a cheap way to do so. This is done for some common file systems in followon patches. With this patch a major part of the lock contention disappears for btrfs. Some testing on smaller systems didn't show significant performance changes, but at least it helps the larger systems and is generally more efficient. v2: Rename is_sgid. add file system helper. Cc: chris.mason@oracle.com Cc: josef@redhat.com Cc: viro@zeniv.linux.org.uk Cc: agruen@linbit.com Cc: Serge E. Hallyn <serue@us.ibm.com> Signed-off-by: Andi Kleen <ak@linux.intel.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2011-05-28 23:25:51 +08:00
int issec = !strncmp(name, XATTR_SECURITY_PREFIX,
XATTR_SECURITY_PREFIX_LEN);
Cache xattr security drop check for write v2 Some recent benchmarking on btrfs showed that a major scaling bottleneck on large systems on btrfs is currently the xattr lookup on every write. Why xattr lookup on every write I hear you ask? write wants to drop suid and security related xattrs that could set o capabilities for executables. To do that it currently looks up security.capability on EVERY write (even for non executables) to decide whether to drop it or not. In btrfs this causes an additional tree walk, hitting some per file system locks and quite bad scalability. In a simple read workload on a 8S system I saw over 90% CPU time in spinlocks related to that. Chris Mason tells me this is also a problem in ext4, where it hits the global mbcache lock. This patch adds a simple per inode to avoid this problem. We only do the lookup once per file and then if there is no xattr cache the decision. All xattr changes clear the flag. I also used the same flag to avoid the suid check, although that one is pretty cheap. A file system can also set this flag when it creates the inode, if it has a cheap way to do so. This is done for some common file systems in followon patches. With this patch a major part of the lock contention disappears for btrfs. Some testing on smaller systems didn't show significant performance changes, but at least it helps the larger systems and is generally more efficient. v2: Rename is_sgid. add file system helper. Cc: chris.mason@oracle.com Cc: josef@redhat.com Cc: viro@zeniv.linux.org.uk Cc: agruen@linbit.com Cc: Serge E. Hallyn <serue@us.ibm.com> Signed-off-by: Andi Kleen <ak@linux.intel.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2011-05-28 23:25:51 +08:00
if (issec)
inode->i_flags &= ~S_NOSEC;
if (inode->i_opflags & IOP_XATTR) {
error = __vfs_setxattr(dentry, inode, name, value, size, flags);
if (!error) {
fsnotify_xattr(dentry);
security_inode_post_setxattr(dentry, name, value,
size, flags);
}
} else {
if (unlikely(is_bad_inode(inode)))
return -EIO;
}
if (error == -EAGAIN) {
error = -EOPNOTSUPP;
if (issec) {
const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
error = security_inode_setsecurity(inode, suffix, value,
size, flags);
if (!error)
fsnotify_xattr(dentry);
}
}
return error;
}
int
vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
size_t size, int flags)
{
struct inode *inode = dentry->d_inode;
int error;
error = xattr_permission(inode, name, MAY_WRITE);
if (error)
return error;
inode_lock(inode);
error = security_inode_setxattr(dentry, name, value, size, flags);
if (error)
goto out;
error = __vfs_setxattr_noperm(dentry, name, value, size, flags);
out:
inode_unlock(inode);
return error;
}
EXPORT_SYMBOL_GPL(vfs_setxattr);
static ssize_t
xattr_getsecurity(struct inode *inode, const char *name, void *value,
size_t size)
{
void *buffer = NULL;
ssize_t len;
if (!value || !size) {
len = security_inode_getsecurity(inode, name, &buffer, false);
goto out_noalloc;
}
len = security_inode_getsecurity(inode, name, &buffer, true);
if (len < 0)
return len;
if (size < len) {
len = -ERANGE;
goto out;
}
memcpy(value, buffer, len);
out:
kfree(buffer);
out_noalloc:
return len;
}
/*
* vfs_getxattr_alloc - allocate memory, if necessary, before calling getxattr
*
* Allocate memory, if not already allocated, or re-allocate correct size,
* before retrieving the extended attribute.
*
* Returns the result of alloc, if failed, or the getxattr operation.
*/
ssize_t
vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value,
size_t xattr_size, gfp_t flags)
{
const struct xattr_handler *handler;
struct inode *inode = dentry->d_inode;
char *value = *xattr_value;
int error;
error = xattr_permission(inode, name, MAY_READ);
if (error)
return error;
handler = xattr_resolve_name(inode, &name);
if (IS_ERR(handler))
return PTR_ERR(handler);
if (!handler->get)
return -EOPNOTSUPP;
error = handler->get(handler, dentry, inode, name, NULL, 0);
if (error < 0)
return error;
if (!value || (error > xattr_size)) {
value = krealloc(*xattr_value, error + 1, flags);
if (!value)
return -ENOMEM;
memset(value, 0, error + 1);
}
error = handler->get(handler, dentry, inode, name, value, error);
*xattr_value = value;
return error;
}
ssize_t
__vfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name,
void *value, size_t size)
{
const struct xattr_handler *handler;
handler = xattr_resolve_name(inode, &name);
if (IS_ERR(handler))
return PTR_ERR(handler);
if (!handler->get)
return -EOPNOTSUPP;
return handler->get(handler, dentry, inode, name, value, size);
}
EXPORT_SYMBOL(__vfs_getxattr);
ssize_t
vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size)
{
struct inode *inode = dentry->d_inode;
int error;
error = xattr_permission(inode, name, MAY_READ);
if (error)
return error;
error = security_inode_getxattr(dentry, name);
if (error)
return error;
if (!strncmp(name, XATTR_SECURITY_PREFIX,
XATTR_SECURITY_PREFIX_LEN)) {
const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
int ret = xattr_getsecurity(inode, suffix, value, size);
/*
* Only overwrite the return value if a security module
* is actually active.
*/
if (ret == -EOPNOTSUPP)
goto nolsm;
return ret;
}
nolsm:
return __vfs_getxattr(dentry, inode, name, value, size);
}
EXPORT_SYMBOL_GPL(vfs_getxattr);
ssize_t
vfs_listxattr(struct dentry *dentry, char *list, size_t size)
{
struct inode *inode = d_inode(dentry);
ssize_t error;
error = security_inode_listxattr(dentry);
if (error)
return error;
if (inode->i_op->listxattr && (inode->i_opflags & IOP_XATTR)) {
error = inode->i_op->listxattr(dentry, list, size);
} else {
error = security_inode_listsecurity(inode, list, size);
if (size && error > size)
error = -ERANGE;
}
return error;
}
EXPORT_SYMBOL_GPL(vfs_listxattr);
int
__vfs_removexattr(struct dentry *dentry, const char *name)
{
struct inode *inode = d_inode(dentry);
const struct xattr_handler *handler;
handler = xattr_resolve_name(inode, &name);
if (IS_ERR(handler))
return PTR_ERR(handler);
if (!handler->set)
return -EOPNOTSUPP;
return handler->set(handler, dentry, inode, name, NULL, 0, XATTR_REPLACE);
}
EXPORT_SYMBOL(__vfs_removexattr);
int
vfs_removexattr(struct dentry *dentry, const char *name)
{
struct inode *inode = dentry->d_inode;
int error;
error = xattr_permission(inode, name, MAY_WRITE);
if (error)
return error;
inode_lock(inode);
error = security_inode_removexattr(dentry, name);
if (error)
goto out;
error = __vfs_removexattr(dentry, name);
if (!error) {
fsnotify_xattr(dentry);
evm_inode_post_removexattr(dentry, name);
}
out:
inode_unlock(inode);
return error;
}
EXPORT_SYMBOL_GPL(vfs_removexattr);
/*
* Extended attribute SET operations
*/
static long
setxattr(struct dentry *d, const char __user *name, const void __user *value,
size_t size, int flags)
{
int error;
void *kvalue = NULL;
char kname[XATTR_NAME_MAX + 1];
if (flags & ~(XATTR_CREATE|XATTR_REPLACE))
return -EINVAL;
error = strncpy_from_user(kname, name, sizeof(kname));
if (error == 0 || error == sizeof(kname))
error = -ERANGE;
if (error < 0)
return error;
if (size) {
if (size > XATTR_SIZE_MAX)
return -E2BIG;
treewide: use kv[mz]alloc* rather than opencoded variants There are many code paths opencoding kvmalloc. Let's use the helper instead. The main difference to kvmalloc is that those users are usually not considering all the aspects of the memory allocator. E.g. allocation requests <= 32kB (with 4kB pages) are basically never failing and invoke OOM killer to satisfy the allocation. This sounds too disruptive for something that has a reasonable fallback - the vmalloc. On the other hand those requests might fallback to vmalloc even when the memory allocator would succeed after several more reclaim/compaction attempts previously. There is no guarantee something like that happens though. This patch converts many of those places to kv[mz]alloc* helpers because they are more conservative. Link: http://lkml.kernel.org/r/20170306103327.2766-2-mhocko@kernel.org Signed-off-by: Michal Hocko <mhocko@suse.com> Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com> # Xen bits Acked-by: Kees Cook <keescook@chromium.org> Acked-by: Vlastimil Babka <vbabka@suse.cz> Acked-by: Andreas Dilger <andreas.dilger@intel.com> # Lustre Acked-by: Christian Borntraeger <borntraeger@de.ibm.com> # KVM/s390 Acked-by: Dan Williams <dan.j.williams@intel.com> # nvdim Acked-by: David Sterba <dsterba@suse.com> # btrfs Acked-by: Ilya Dryomov <idryomov@gmail.com> # Ceph Acked-by: Tariq Toukan <tariqt@mellanox.com> # mlx4 Acked-by: Leon Romanovsky <leonro@mellanox.com> # mlx5 Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Herbert Xu <herbert@gondor.apana.org.au> Cc: Anton Vorontsov <anton@enomsg.org> Cc: Colin Cross <ccross@android.com> Cc: Tony Luck <tony.luck@intel.com> Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net> Cc: Ben Skeggs <bskeggs@redhat.com> Cc: Kent Overstreet <kent.overstreet@gmail.com> Cc: Santosh Raspatur <santosh@chelsio.com> Cc: Hariprasad S <hariprasad@chelsio.com> Cc: Yishai Hadas <yishaih@mellanox.com> Cc: Oleg Drokin <oleg.drokin@intel.com> Cc: "Yan, Zheng" <zyan@redhat.com> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Alexei Starovoitov <ast@kernel.org> Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: David Miller <davem@davemloft.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-05-09 06:57:27 +08:00
kvalue = kvmalloc(size, GFP_KERNEL);
if (!kvalue)
return -ENOMEM;
if (copy_from_user(kvalue, value, size)) {
error = -EFAULT;
goto out;
}
2012-02-08 10:52:57 +08:00
if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
(strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
posix_acl_fix_xattr_from_user(kvalue, size);
Introduce v3 namespaced file capabilities Root in a non-initial user ns cannot be trusted to write a traditional security.capability xattr. If it were allowed to do so, then any unprivileged user on the host could map his own uid to root in a private namespace, write the xattr, and execute the file with privilege on the host. However supporting file capabilities in a user namespace is very desirable. Not doing so means that any programs designed to run with limited privilege must continue to support other methods of gaining and dropping privilege. For instance a program installer must detect whether file capabilities can be assigned, and assign them if so but set setuid-root otherwise. The program in turn must know how to drop partial capabilities, and do so only if setuid-root. This patch introduces v3 of the security.capability xattr. It builds a vfs_ns_cap_data struct by appending a uid_t rootid to struct vfs_cap_data. This is the absolute uid_t (that is, the uid_t in user namespace which mounted the filesystem, usually init_user_ns) of the root id in whose namespaces the file capabilities may take effect. When a task asks to write a v2 security.capability xattr, if it is privileged with respect to the userns which mounted the filesystem, then nothing should change. Otherwise, the kernel will transparently rewrite the xattr as a v3 with the appropriate rootid. This is done during the execution of setxattr() to catch user-space-initiated capability writes. Subsequently, any task executing the file which has the noted kuid as its root uid, or which is in a descendent user_ns of such a user_ns, will run the file with capabilities. Similarly when asking to read file capabilities, a v3 capability will be presented as v2 if it applies to the caller's namespace. If a task writes a v3 security.capability, then it can provide a uid for the xattr so long as the uid is valid in its own user namespace, and it is privileged with CAP_SETFCAP over its namespace. The kernel will translate that rootid to an absolute uid, and write that to disk. After this, a task in the writer's namespace will not be able to use those capabilities (unless rootid was 0), but a task in a namespace where the given uid is root will. Only a single security.capability xattr may exist at a time for a given file. A task may overwrite an existing xattr so long as it is privileged over the inode. Note this is a departure from previous semantics, which required privilege to remove a security.capability xattr. This check can be re-added if deemed useful. This allows a simple setxattr to work, allows tar/untar to work, and allows us to tar in one namespace and untar in another while preserving the capability, without risking leaking privilege into a parent namespace. Example using tar: $ cp /bin/sleep sleepx $ mkdir b1 b2 $ lxc-usernsexec -m b:0:100000:1 -m b:1:$(id -u):1 -- chown 0:0 b1 $ lxc-usernsexec -m b:0:100001:1 -m b:1:$(id -u):1 -- chown 0:0 b2 $ lxc-usernsexec -m b:0:100000:1000 -- tar --xattrs-include=security.capability --xattrs -cf b1/sleepx.tar sleepx $ lxc-usernsexec -m b:0:100001:1000 -- tar --xattrs-include=security.capability --xattrs -C b2 -xf b1/sleepx.tar $ lxc-usernsexec -m b:0:100001:1000 -- getcap b2/sleepx b2/sleepx = cap_sys_admin+ep # /opt/ltp/testcases/bin/getv3xattr b2/sleepx v3 xattr, rootid is 100001 A patch to linux-test-project adding a new set of tests for this functionality is in the nsfscaps branch at github.com/hallyn/ltp Changelog: Nov 02 2016: fix invalid check at refuse_fcap_overwrite() Nov 07 2016: convert rootid from and to fs user_ns (From ebiederm: mar 28 2017) commoncap.c: fix typos - s/v4/v3 get_vfs_caps_from_disk: clarify the fs_ns root access check nsfscaps: change the code split for cap_inode_setxattr() Apr 09 2017: don't return v3 cap for caps owned by current root. return a v2 cap for a true v2 cap in non-init ns Apr 18 2017: . Change the flow of fscap writing to support s_user_ns writing. . Remove refuse_fcap_overwrite(). The value of the previous xattr doesn't matter. Apr 24 2017: . incorporate Eric's incremental diff . move cap_convert_nscap to setxattr and simplify its usage May 8, 2017: . fix leaking dentry refcount in cap_inode_getsecurity Signed-off-by: Serge Hallyn <serge@hallyn.com> Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
2017-05-09 02:11:56 +08:00
else if (strcmp(kname, XATTR_NAME_CAPS) == 0) {
error = cap_convert_nscap(d, &kvalue, size);
if (error < 0)
goto out;
size = error;
}
}
error = vfs_setxattr(d, kname, kvalue, size, flags);
out:
kvfree(kvalue);
return error;
}
static int path_setxattr(const char __user *pathname,
const char __user *name, const void __user *value,
size_t size, int flags, unsigned int lookup_flags)
{
struct path path;
int error;
retry:
error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
if (error)
return error;
error = mnt_want_write(path.mnt);
if (!error) {
error = setxattr(path.dentry, name, value, size, flags);
mnt_drop_write(path.mnt);
}
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
return error;
}
SYSCALL_DEFINE5(setxattr, const char __user *, pathname,
const char __user *, name, const void __user *, value,
size_t, size, int, flags)
{
return path_setxattr(pathname, name, value, size, flags, LOOKUP_FOLLOW);
}
SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
const char __user *, name, const void __user *, value,
size_t, size, int, flags)
{
return path_setxattr(pathname, name, value, size, flags, 0);
}
SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
const void __user *,value, size_t, size, int, flags)
{
struct fd f = fdget(fd);
int error = -EBADF;
if (!f.file)
return error;
audit_file(f.file);
error = mnt_want_write_file(f.file);
if (!error) {
error = setxattr(f.file->f_path.dentry, name, value, size, flags);
mnt_drop_write_file(f.file);
}
fdput(f);
return error;
}
/*
* Extended attribute GET operations
*/
static ssize_t
getxattr(struct dentry *d, const char __user *name, void __user *value,
size_t size)
{
ssize_t error;
void *kvalue = NULL;
char kname[XATTR_NAME_MAX + 1];
error = strncpy_from_user(kname, name, sizeof(kname));
if (error == 0 || error == sizeof(kname))
error = -ERANGE;
if (error < 0)
return error;
if (size) {
if (size > XATTR_SIZE_MAX)
size = XATTR_SIZE_MAX;
treewide: use kv[mz]alloc* rather than opencoded variants There are many code paths opencoding kvmalloc. Let's use the helper instead. The main difference to kvmalloc is that those users are usually not considering all the aspects of the memory allocator. E.g. allocation requests <= 32kB (with 4kB pages) are basically never failing and invoke OOM killer to satisfy the allocation. This sounds too disruptive for something that has a reasonable fallback - the vmalloc. On the other hand those requests might fallback to vmalloc even when the memory allocator would succeed after several more reclaim/compaction attempts previously. There is no guarantee something like that happens though. This patch converts many of those places to kv[mz]alloc* helpers because they are more conservative. Link: http://lkml.kernel.org/r/20170306103327.2766-2-mhocko@kernel.org Signed-off-by: Michal Hocko <mhocko@suse.com> Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com> # Xen bits Acked-by: Kees Cook <keescook@chromium.org> Acked-by: Vlastimil Babka <vbabka@suse.cz> Acked-by: Andreas Dilger <andreas.dilger@intel.com> # Lustre Acked-by: Christian Borntraeger <borntraeger@de.ibm.com> # KVM/s390 Acked-by: Dan Williams <dan.j.williams@intel.com> # nvdim Acked-by: David Sterba <dsterba@suse.com> # btrfs Acked-by: Ilya Dryomov <idryomov@gmail.com> # Ceph Acked-by: Tariq Toukan <tariqt@mellanox.com> # mlx4 Acked-by: Leon Romanovsky <leonro@mellanox.com> # mlx5 Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Herbert Xu <herbert@gondor.apana.org.au> Cc: Anton Vorontsov <anton@enomsg.org> Cc: Colin Cross <ccross@android.com> Cc: Tony Luck <tony.luck@intel.com> Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net> Cc: Ben Skeggs <bskeggs@redhat.com> Cc: Kent Overstreet <kent.overstreet@gmail.com> Cc: Santosh Raspatur <santosh@chelsio.com> Cc: Hariprasad S <hariprasad@chelsio.com> Cc: Yishai Hadas <yishaih@mellanox.com> Cc: Oleg Drokin <oleg.drokin@intel.com> Cc: "Yan, Zheng" <zyan@redhat.com> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Alexei Starovoitov <ast@kernel.org> Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: David Miller <davem@davemloft.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-05-09 06:57:27 +08:00
kvalue = kvzalloc(size, GFP_KERNEL);
if (!kvalue)
return -ENOMEM;
}
error = vfs_getxattr(d, kname, kvalue, size);
if (error > 0) {
2012-02-08 10:52:57 +08:00
if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
(strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
getxattr: use correct xattr length When running in a container with a user namespace, if you call getxattr with name = "system.posix_acl_access" and size % 8 != 4, then getxattr silently skips the user namespace fixup that it normally does resulting in un-fixed-up data being returned. This is caused by posix_acl_fix_xattr_to_user() being passed the total buffer size and not the actual size of the xattr as returned by vfs_getxattr(). This commit passes the actual length of the xattr as returned by vfs_getxattr() down. A reproducer for the issue is: touch acl_posix setfacl -m user:0:rwx acl_posix and the compile: #define _GNU_SOURCE #include <errno.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/types.h> #include <unistd.h> #include <attr/xattr.h> /* Run in user namespace with nsuid 0 mapped to uid != 0 on the host. */ int main(int argc, void **argv) { ssize_t ret1, ret2; char buf1[128], buf2[132]; int fret = EXIT_SUCCESS; char *file; if (argc < 2) { fprintf(stderr, "Please specify a file with " "\"system.posix_acl_access\" permissions set\n"); _exit(EXIT_FAILURE); } file = argv[1]; ret1 = getxattr(file, "system.posix_acl_access", buf1, sizeof(buf1)); if (ret1 < 0) { fprintf(stderr, "%s - Failed to retrieve " "\"system.posix_acl_access\" " "from \"%s\"\n", strerror(errno), file); _exit(EXIT_FAILURE); } ret2 = getxattr(file, "system.posix_acl_access", buf2, sizeof(buf2)); if (ret2 < 0) { fprintf(stderr, "%s - Failed to retrieve " "\"system.posix_acl_access\" " "from \"%s\"\n", strerror(errno), file); _exit(EXIT_FAILURE); } if (ret1 != ret2) { fprintf(stderr, "The value of \"system.posix_acl_" "access\" for file \"%s\" changed " "between two successive calls\n", file); _exit(EXIT_FAILURE); } for (ssize_t i = 0; i < ret2; i++) { if (buf1[i] == buf2[i]) continue; fprintf(stderr, "Unexpected different in byte %zd: " "%02x != %02x\n", i, buf1[i], buf2[i]); fret = EXIT_FAILURE; } if (fret == EXIT_SUCCESS) fprintf(stderr, "Test passed\n"); else fprintf(stderr, "Test failed\n"); _exit(fret); } and run: ./tester acl_posix On a non-fixed up kernel this should return something like: root@c1:/# ./t Unexpected different in byte 16: ffffffa0 != 00 Unexpected different in byte 17: ffffff86 != 00 Unexpected different in byte 18: 01 != 00 and on a fixed kernel: root@c1:~# ./t Test passed Cc: stable@vger.kernel.org Fixes: 2f6f0654ab61 ("userns: Convert vfs posix_acl support to use kuids and kgids") Link: https://bugzilla.kernel.org/show_bug.cgi?id=199945 Reported-by: Colin Watson <cjwatson@ubuntu.com> Signed-off-by: Christian Brauner <christian@brauner.io> Acked-by: Serge Hallyn <serge@hallyn.com> Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
2018-06-07 19:43:48 +08:00
posix_acl_fix_xattr_to_user(kvalue, error);
if (size && copy_to_user(value, kvalue, error))
error = -EFAULT;
} else if (error == -ERANGE && size >= XATTR_SIZE_MAX) {
/* The file system tried to returned a value bigger
than XATTR_SIZE_MAX bytes. Not possible. */
error = -E2BIG;
}
kvfree(kvalue);
return error;
}
static ssize_t path_getxattr(const char __user *pathname,
const char __user *name, void __user *value,
size_t size, unsigned int lookup_flags)
{
struct path path;
ssize_t error;
retry:
error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
if (error)
return error;
error = getxattr(path.dentry, name, value, size);
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
return error;
}
SYSCALL_DEFINE4(getxattr, const char __user *, pathname,
const char __user *, name, void __user *, value, size_t, size)
{
return path_getxattr(pathname, name, value, size, LOOKUP_FOLLOW);
}
SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname,
const char __user *, name, void __user *, value, size_t, size)
{
return path_getxattr(pathname, name, value, size, 0);
}
SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
void __user *, value, size_t, size)
{
struct fd f = fdget(fd);
ssize_t error = -EBADF;
if (!f.file)
return error;
audit_file(f.file);
error = getxattr(f.file->f_path.dentry, name, value, size);
fdput(f);
return error;
}
/*
* Extended attribute LIST operations
*/
static ssize_t
listxattr(struct dentry *d, char __user *list, size_t size)
{
ssize_t error;
char *klist = NULL;
if (size) {
if (size > XATTR_LIST_MAX)
size = XATTR_LIST_MAX;
treewide: use kv[mz]alloc* rather than opencoded variants There are many code paths opencoding kvmalloc. Let's use the helper instead. The main difference to kvmalloc is that those users are usually not considering all the aspects of the memory allocator. E.g. allocation requests <= 32kB (with 4kB pages) are basically never failing and invoke OOM killer to satisfy the allocation. This sounds too disruptive for something that has a reasonable fallback - the vmalloc. On the other hand those requests might fallback to vmalloc even when the memory allocator would succeed after several more reclaim/compaction attempts previously. There is no guarantee something like that happens though. This patch converts many of those places to kv[mz]alloc* helpers because they are more conservative. Link: http://lkml.kernel.org/r/20170306103327.2766-2-mhocko@kernel.org Signed-off-by: Michal Hocko <mhocko@suse.com> Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com> # Xen bits Acked-by: Kees Cook <keescook@chromium.org> Acked-by: Vlastimil Babka <vbabka@suse.cz> Acked-by: Andreas Dilger <andreas.dilger@intel.com> # Lustre Acked-by: Christian Borntraeger <borntraeger@de.ibm.com> # KVM/s390 Acked-by: Dan Williams <dan.j.williams@intel.com> # nvdim Acked-by: David Sterba <dsterba@suse.com> # btrfs Acked-by: Ilya Dryomov <idryomov@gmail.com> # Ceph Acked-by: Tariq Toukan <tariqt@mellanox.com> # mlx4 Acked-by: Leon Romanovsky <leonro@mellanox.com> # mlx5 Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Herbert Xu <herbert@gondor.apana.org.au> Cc: Anton Vorontsov <anton@enomsg.org> Cc: Colin Cross <ccross@android.com> Cc: Tony Luck <tony.luck@intel.com> Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net> Cc: Ben Skeggs <bskeggs@redhat.com> Cc: Kent Overstreet <kent.overstreet@gmail.com> Cc: Santosh Raspatur <santosh@chelsio.com> Cc: Hariprasad S <hariprasad@chelsio.com> Cc: Yishai Hadas <yishaih@mellanox.com> Cc: Oleg Drokin <oleg.drokin@intel.com> Cc: "Yan, Zheng" <zyan@redhat.com> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Alexei Starovoitov <ast@kernel.org> Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: David Miller <davem@davemloft.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-05-09 06:57:27 +08:00
klist = kvmalloc(size, GFP_KERNEL);
if (!klist)
return -ENOMEM;
}
error = vfs_listxattr(d, klist, size);
if (error > 0) {
if (size && copy_to_user(list, klist, error))
error = -EFAULT;
} else if (error == -ERANGE && size >= XATTR_LIST_MAX) {
/* The file system tried to returned a list bigger
than XATTR_LIST_MAX bytes. Not possible. */
error = -E2BIG;
}
kvfree(klist);
return error;
}
static ssize_t path_listxattr(const char __user *pathname, char __user *list,
size_t size, unsigned int lookup_flags)
{
struct path path;
ssize_t error;
retry:
error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
if (error)
return error;
error = listxattr(path.dentry, list, size);
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
return error;
}
SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list,
size_t, size)
{
return path_listxattr(pathname, list, size, LOOKUP_FOLLOW);
}
SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list,
size_t, size)
{
return path_listxattr(pathname, list, size, 0);
}
SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
{
struct fd f = fdget(fd);
ssize_t error = -EBADF;
if (!f.file)
return error;
audit_file(f.file);
error = listxattr(f.file->f_path.dentry, list, size);
fdput(f);
return error;
}
/*
* Extended attribute REMOVE operations
*/
static long
removexattr(struct dentry *d, const char __user *name)
{
int error;
char kname[XATTR_NAME_MAX + 1];
error = strncpy_from_user(kname, name, sizeof(kname));
if (error == 0 || error == sizeof(kname))
error = -ERANGE;
if (error < 0)
return error;
return vfs_removexattr(d, kname);
}
static int path_removexattr(const char __user *pathname,
const char __user *name, unsigned int lookup_flags)
{
struct path path;
int error;
retry:
error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
if (error)
return error;
error = mnt_want_write(path.mnt);
if (!error) {
error = removexattr(path.dentry, name);
mnt_drop_write(path.mnt);
}
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
return error;
}
SYSCALL_DEFINE2(removexattr, const char __user *, pathname,
const char __user *, name)
{
return path_removexattr(pathname, name, LOOKUP_FOLLOW);
}
SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
const char __user *, name)
{
return path_removexattr(pathname, name, 0);
}
SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
{
struct fd f = fdget(fd);
int error = -EBADF;
if (!f.file)
return error;
audit_file(f.file);
error = mnt_want_write_file(f.file);
if (!error) {
error = removexattr(f.file->f_path.dentry, name);
mnt_drop_write_file(f.file);
}
fdput(f);
return error;
}
/*
* Combine the results of the list() operation from every xattr_handler in the
* list.
*/
ssize_t
generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
{
const struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr;
unsigned int size = 0;
if (!buffer) {
for_each_xattr_handler(handlers, handler) {
if (!handler->name ||
(handler->list && !handler->list(dentry)))
continue;
size += strlen(handler->name) + 1;
}
} else {
char *buf = buffer;
size_t len;
for_each_xattr_handler(handlers, handler) {
if (!handler->name ||
(handler->list && !handler->list(dentry)))
continue;
len = strlen(handler->name);
if (len + 1 > buffer_size)
return -ERANGE;
memcpy(buf, handler->name, len + 1);
buf += len + 1;
buffer_size -= len + 1;
}
size = buf - buffer;
}
return size;
}
EXPORT_SYMBOL(generic_listxattr);
/**
* xattr_full_name - Compute full attribute name from suffix
*
* @handler: handler of the xattr_handler operation
* @name: name passed to the xattr_handler operation
*
* The get and set xattr handler operations are called with the remainder of
* the attribute name after skipping the handler's prefix: for example, "foo"
* is passed to the get operation of a handler with prefix "user." to get
* attribute "user.foo". The full name is still "there" in the name though.
*
* Note: the list xattr handler operation when called from the vfs is passed a
* NULL name; some file systems use this operation internally, with varying
* semantics.
*/
const char *xattr_full_name(const struct xattr_handler *handler,
const char *name)
{
size_t prefix_len = strlen(xattr_prefix(handler));
return name - prefix_len;
}
EXPORT_SYMBOL(xattr_full_name);
/*
* Allocate new xattr and copy in the value; but leave the name to callers.
*/
struct simple_xattr *simple_xattr_alloc(const void *value, size_t size)
{
struct simple_xattr *new_xattr;
size_t len;
/* wrap around? */
len = sizeof(*new_xattr) + size;
if (len < sizeof(*new_xattr))
return NULL;
new_xattr = kvmalloc(len, GFP_KERNEL);
if (!new_xattr)
return NULL;
new_xattr->size = size;
memcpy(new_xattr->value, value, size);
return new_xattr;
}
/*
* xattr GET operation for in-memory/pseudo filesystems
*/
int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
void *buffer, size_t size)
{
struct simple_xattr *xattr;
int ret = -ENODATA;
spin_lock(&xattrs->lock);
list_for_each_entry(xattr, &xattrs->head, list) {
if (strcmp(name, xattr->name))
continue;
ret = xattr->size;
if (buffer) {
if (size < xattr->size)
ret = -ERANGE;
else
memcpy(buffer, xattr->value, xattr->size);
}
break;
}
spin_unlock(&xattrs->lock);
return ret;
}
/**
* simple_xattr_set - xattr SET operation for in-memory/pseudo filesystems
* @xattrs: target simple_xattr list
* @name: name of the extended attribute
* @value: value of the xattr. If %NULL, will remove the attribute.
* @size: size of the new xattr
* @flags: %XATTR_{CREATE|REPLACE}
* @removed_size: returns size of the removed xattr, -1 if none removed
*
* %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails
* with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist;
* otherwise, fails with -ENODATA.
*
* Returns 0 on success, -errno on failure.
*/
int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
const void *value, size_t size, int flags,
ssize_t *removed_size)
{
struct simple_xattr *xattr;
struct simple_xattr *new_xattr = NULL;
int err = 0;
if (removed_size)
*removed_size = -1;
/* value == NULL means remove */
if (value) {
new_xattr = simple_xattr_alloc(value, size);
if (!new_xattr)
return -ENOMEM;
new_xattr->name = kstrdup(name, GFP_KERNEL);
if (!new_xattr->name) {
kvfree(new_xattr);
return -ENOMEM;
}
}
spin_lock(&xattrs->lock);
list_for_each_entry(xattr, &xattrs->head, list) {
if (!strcmp(name, xattr->name)) {
if (flags & XATTR_CREATE) {
xattr = new_xattr;
err = -EEXIST;
} else if (new_xattr) {
list_replace(&xattr->list, &new_xattr->list);
if (removed_size)
*removed_size = xattr->size;
} else {
list_del(&xattr->list);
if (removed_size)
*removed_size = xattr->size;
}
goto out;
}
}
if (flags & XATTR_REPLACE) {
xattr = new_xattr;
err = -ENODATA;
} else {
list_add(&new_xattr->list, &xattrs->head);
xattr = NULL;
}
out:
spin_unlock(&xattrs->lock);
if (xattr) {
kfree(xattr->name);
kvfree(xattr);
}
return err;
}
static bool xattr_is_trusted(const char *name)
{
return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
}
static int xattr_list_one(char **buffer, ssize_t *remaining_size,
const char *name)
{
size_t len = strlen(name) + 1;
if (*buffer) {
if (*remaining_size < len)
return -ERANGE;
memcpy(*buffer, name, len);
*buffer += len;
}
*remaining_size -= len;
return 0;
}
/*
* xattr LIST operation for in-memory/pseudo filesystems
*/
ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
char *buffer, size_t size)
{
bool trusted = capable(CAP_SYS_ADMIN);
struct simple_xattr *xattr;
ssize_t remaining_size = size;
int err = 0;
#ifdef CONFIG_FS_POSIX_ACL
if (IS_POSIXACL(inode)) {
if (inode->i_acl) {
err = xattr_list_one(&buffer, &remaining_size,
XATTR_NAME_POSIX_ACL_ACCESS);
if (err)
return err;
}
if (inode->i_default_acl) {
err = xattr_list_one(&buffer, &remaining_size,
XATTR_NAME_POSIX_ACL_DEFAULT);
if (err)
return err;
}
}
#endif
spin_lock(&xattrs->lock);
list_for_each_entry(xattr, &xattrs->head, list) {
/* skip "trusted." attributes for unprivileged callers */
if (!trusted && xattr_is_trusted(xattr->name))
continue;
err = xattr_list_one(&buffer, &remaining_size, xattr->name);
if (err)
break;
}
spin_unlock(&xattrs->lock);
return err ? err : size - remaining_size;
}
/*
* Adds an extended attribute to the list
*/
void simple_xattr_list_add(struct simple_xattrs *xattrs,
struct simple_xattr *new_xattr)
{
spin_lock(&xattrs->lock);
list_add(&new_xattr->list, &xattrs->head);
spin_unlock(&xattrs->lock);
}