kernel_optimize_test/include/linux/binfmts.h
David Drysdale 51f39a1f0c syscalls: implement execveat() system call
This patchset adds execveat(2) for x86, and is derived from Meredydd
Luff's patch from Sept 2012 (https://lkml.org/lkml/2012/9/11/528).

The primary aim of adding an execveat syscall is to allow an
implementation of fexecve(3) that does not rely on the /proc filesystem,
at least for executables (rather than scripts).  The current glibc version
of fexecve(3) is implemented via /proc, which causes problems in sandboxed
or otherwise restricted environments.

Given the desire for a /proc-free fexecve() implementation, HPA suggested
(https://lkml.org/lkml/2006/7/11/556) that an execveat(2) syscall would be
an appropriate generalization.

Also, having a new syscall means that it can take a flags argument without
back-compatibility concerns.  The current implementation just defines the
AT_EMPTY_PATH and AT_SYMLINK_NOFOLLOW flags, but other flags could be
added in future -- for example, flags for new namespaces (as suggested at
https://lkml.org/lkml/2006/7/11/474).

Related history:
 - https://lkml.org/lkml/2006/12/27/123 is an example of someone
   realizing that fexecve() is likely to fail in a chroot environment.
 - http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=514043 covered
   documenting the /proc requirement of fexecve(3) in its manpage, to
   "prevent other people from wasting their time".
 - https://bugzilla.redhat.com/show_bug.cgi?id=241609 described a
   problem where a process that did setuid() could not fexecve()
   because it no longer had access to /proc/self/fd; this has since
   been fixed.

This patch (of 4):

Add a new execveat(2) system call.  execveat() is to execve() as openat()
is to open(): it takes a file descriptor that refers to a directory, and
resolves the filename relative to that.

In addition, if the filename is empty and AT_EMPTY_PATH is specified,
execveat() executes the file to which the file descriptor refers.  This
replicates the functionality of fexecve(), which is a system call in other
UNIXen, but in Linux glibc it depends on opening "/proc/self/fd/<fd>" (and
so relies on /proc being mounted).

The filename fed to the executed program as argv[0] (or the name of the
script fed to a script interpreter) will be of the form "/dev/fd/<fd>"
(for an empty filename) or "/dev/fd/<fd>/<filename>", effectively
reflecting how the executable was found.  This does however mean that
execution of a script in a /proc-less environment won't work; also, script
execution via an O_CLOEXEC file descriptor fails (as the file will not be
accessible after exec).

Based on patches by Meredydd Luff.

Signed-off-by: David Drysdale <drysdale@google.com>
Cc: Meredydd Luff <meredydd@senatehouse.org>
Cc: Shuah Khan <shuah.kh@samsung.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Rich Felker <dalias@aerifal.cx>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-13 12:42:51 -08:00

124 lines
4.0 KiB
C

#ifndef _LINUX_BINFMTS_H
#define _LINUX_BINFMTS_H
#include <linux/sched.h>
#include <linux/unistd.h>
#include <asm/exec.h>
#include <uapi/linux/binfmts.h>
#define CORENAME_MAX_SIZE 128
/*
* This structure is used to hold the arguments that are used when loading binaries.
*/
struct linux_binprm {
char buf[BINPRM_BUF_SIZE];
#ifdef CONFIG_MMU
struct vm_area_struct *vma;
unsigned long vma_pages;
#else
# define MAX_ARG_PAGES 32
struct page *page[MAX_ARG_PAGES];
#endif
struct mm_struct *mm;
unsigned long p; /* current top of mem */
unsigned int
cred_prepared:1,/* true if creds already prepared (multiple
* preps happen for interpreters) */
cap_effective:1;/* true if has elevated effective capabilities,
* false if not; except for init which inherits
* its parent's caps anyway */
#ifdef __alpha__
unsigned int taso:1;
#endif
unsigned int recursion_depth; /* only for search_binary_handler() */
struct file * file;
struct cred *cred; /* new credentials */
int unsafe; /* how unsafe this exec is (mask of LSM_UNSAFE_*) */
unsigned int per_clear; /* bits to clear in current->personality */
int argc, envc;
const char * filename; /* Name of binary as seen by procps */
const char * interp; /* Name of the binary really executed. Most
of the time same as filename, but could be
different for binfmt_{misc,script} */
unsigned interp_flags;
unsigned interp_data;
unsigned long loader, exec;
};
#define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0
#define BINPRM_FLAGS_ENFORCE_NONDUMP (1 << BINPRM_FLAGS_ENFORCE_NONDUMP_BIT)
/* fd of the binary should be passed to the interpreter */
#define BINPRM_FLAGS_EXECFD_BIT 1
#define BINPRM_FLAGS_EXECFD (1 << BINPRM_FLAGS_EXECFD_BIT)
/* filename of the binary will be inaccessible after exec */
#define BINPRM_FLAGS_PATH_INACCESSIBLE_BIT 2
#define BINPRM_FLAGS_PATH_INACCESSIBLE (1 << BINPRM_FLAGS_PATH_INACCESSIBLE_BIT)
/* Function parameter for binfmt->coredump */
struct coredump_params {
const siginfo_t *siginfo;
struct pt_regs *regs;
struct file *file;
unsigned long limit;
unsigned long mm_flags;
loff_t written;
};
/*
* This structure defines the functions that are used to load the binary formats that
* linux accepts.
*/
struct linux_binfmt {
struct list_head lh;
struct module *module;
int (*load_binary)(struct linux_binprm *);
int (*load_shlib)(struct file *);
int (*core_dump)(struct coredump_params *cprm);
unsigned long min_coredump; /* minimal dump size */
};
extern void __register_binfmt(struct linux_binfmt *fmt, int insert);
/* Registration of default binfmt handlers */
static inline void register_binfmt(struct linux_binfmt *fmt)
{
__register_binfmt(fmt, 0);
}
/* Same as above, but adds a new binfmt at the top of the list */
static inline void insert_binfmt(struct linux_binfmt *fmt)
{
__register_binfmt(fmt, 1);
}
extern void unregister_binfmt(struct linux_binfmt *);
extern int prepare_binprm(struct linux_binprm *);
extern int __must_check remove_arg_zero(struct linux_binprm *);
extern int search_binary_handler(struct linux_binprm *);
extern int flush_old_exec(struct linux_binprm * bprm);
extern void setup_new_exec(struct linux_binprm * bprm);
extern void would_dump(struct linux_binprm *, struct file *);
extern int suid_dumpable;
/* Stack area protections */
#define EXSTACK_DEFAULT 0 /* Whatever the arch defaults to */
#define EXSTACK_DISABLE_X 1 /* Disable executable stacks */
#define EXSTACK_ENABLE_X 2 /* Enable executable stacks */
extern int setup_arg_pages(struct linux_binprm * bprm,
unsigned long stack_top,
int executable_stack);
extern int bprm_change_interp(char *interp, struct linux_binprm *bprm);
extern int copy_strings_kernel(int argc, const char *const *argv,
struct linux_binprm *bprm);
extern int prepare_bprm_creds(struct linux_binprm *bprm);
extern void install_exec_creds(struct linux_binprm *bprm);
extern void set_binfmt(struct linux_binfmt *new);
extern ssize_t read_code(struct file *, unsigned long, loff_t, size_t);
#endif /* _LINUX_BINFMTS_H */