kernel_optimize_test/tools/perf/examples/bpf/augmented_raw_syscalls.c
Arnaldo Carvalho de Melo 693bd3949b perf trace: Beautify 'fspick' arguments
Use existing beautifiers for the first 2 args (dfd, path) and wire up
the recently introduced fspick flags table generator.

Now it should be possible to just use:

   perf trace -e fspick

As root and see all fspick syscalls with their args beautified, either
using the vfs_getname perf probe method or using the
augmented_raw_syscalls.c eBPF helper to get the pathnames. The other
args should work in all cases, i.e. all that is needed can be obtained
directly from the raw_syscalls:sys_enter tracepoint args.

  # cat sys_fspick.c
  #define _GNU_SOURCE        /* See feature_test_macros(7) */
  #include <unistd.h>
  #include <sys/syscall.h>   /* For SYS_xxx definitions */
  #include <fcntl.h>

  #define __NR_fspick 433

  #define FSPICK_CLOEXEC          0x00000001
  #define FSPICK_SYMLINK_NOFOLLOW 0x00000002
  #define FSPICK_NO_AUTOMOUNT     0x00000004
  #define FSPICK_EMPTY_PATH       0x00000008

  /*
   * Thin wrapper that issues the fspick syscall by number.
   *
   * @fd:    first syscall argument (directory file descriptor).
   * @path:  second syscall argument (pathname).
   * @flags: third syscall argument (FSPICK_* bits).
   *
   * Returns the value produced by syscall(). The result must be returned
   * explicitly: main() uses it as the exit status, and reading the value of
   * a non-void function that fell off its end is undefined behaviour.
   */
  static inline int sys_fspick(int fd, const char *path, int flags)
  {
  	return syscall(__NR_fspick, fd, path, flags);
  }

  /*
   * Issues five fspick calls on "/foo1".."/foo5". 'fd' is post-incremented on
   * every call and one more FSPICK_* bit is OR-ed into 'flags' before each
   * call after the first, so every traced line shows a different argument
   * set. The exit status is the value of the last sys_fspick() call.
   */
  int main(int argc, char *argv[])
  {
  	int flags = 0, fd = 0;

  	/* The result of this open() is not used. */
  	open("/foo", 0);
  	sys_fspick(fd++, "/foo1", flags);
  	flags |= FSPICK_CLOEXEC;
  	sys_fspick(fd++, "/foo2", flags);
  	flags |= FSPICK_SYMLINK_NOFOLLOW;
  	sys_fspick(fd++, "/foo3", flags);
  	flags |= FSPICK_NO_AUTOMOUNT;
  	sys_fspick(fd++, "/foo4", flags);
  	flags |= FSPICK_EMPTY_PATH;
  	return sys_fspick(fd++, "/foo5", flags);
  }
  # perf trace -e fspick ./sys_fspick
  LLVM: dumping /home/acme/git/perf/tools/perf/examples/bpf/augmented_raw_syscalls.o
  fspick(0, "/foo1", 0)                   = -1 ENOENT (No such file or directory)
  fspick(1, "/foo2", FSPICK_CLOEXEC)      = -1 ENOENT (No such file or directory)
  fspick(2, "/foo3", FSPICK_CLOEXEC|FSPICK_SYMLINK_NOFOLLOW) = -1 ENOENT (No such file or directory)
  fspick(3, "/foo4", FSPICK_CLOEXEC|FSPICK_SYMLINK_NOFOLLOW|FSPICK_NO_AUTOMOUNT) = -1 ENOENT (No such file or directory)
  fspick(4, "/foo5", FSPICK_CLOEXEC|FSPICK_SYMLINK_NOFOLLOW|FSPICK_NO_AUTOMOUNT|FSPICK_EMPTY_PATH) = -1 ENOENT (No such file or directory)
  #

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Luis Cláudio Gonçalves <lclaudio@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: https://lkml.kernel.org/n/tip-erau5xjtt8wvgnhvdbchstuk@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2019-05-28 18:37:42 -03:00

324 lines
9.5 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Augment the raw_syscalls tracepoints with the contents of the pointer arguments.
*
* Test it with:
*
* perf trace -e tools/perf/examples/bpf/augmented_raw_syscalls.c cat /etc/passwd > /dev/null
*
* This exactly matches what is marshalled into the raw_syscall:sys_enter
* payload expected by the 'perf trace' beautifiers.
*
* For now it just uses the existing tracepoint augmentation code in 'perf
* trace', in the next csets we'll hook up these with the sys_enter/sys_exit
* code that will combine entry/exit in a strace like way.
*/
#include <unistd.h>
#include <linux/limits.h>
#include <pid_filter.h>
/*
 * bpf-output associated map: the PERF_EVENT_ARRAY that sys_enter passes to
 * perf_event_output() when emitting the (possibly augmented) payload.
 */
bpf_map(__augmented_syscalls__, PERF_EVENT_ARRAY, int, u32, __NR_CPUS__);
/* Value type of the 'syscalls' map: one entry per syscall number. */
struct syscall {
	bool	enabled;	/* handlers return 0 for a syscall unless this is set */
};
/*
 * Per-syscall table, keyed by syscall number: sys_enter and sys_exit look
 * the number up here and return 0 unless the entry exists and is enabled.
 * NOTE(review): presumably populated from userspace -- confirm.
 */
bpf_map(syscalls, ARRAY, int, struct syscall, 512);
/*
 * Argument type of the raw_syscalls:sys_enter handler; sys_enter copies it
 * verbatim into the head of the record it emits.
 */
struct syscall_enter_args {
	unsigned long long	common_tp_fields;
	long			syscall_nr;	/* key into the 'syscalls' map */
	unsigned long		args[6];	/* args[0]/args[1] may be read as string pointers */
};
/* Argument type of the raw_syscalls:sys_exit handler. */
struct syscall_exit_args {
	unsigned long long	common_tp_fields;
	long			syscall_nr;	/* key into the 'syscalls' map */
	long			ret;
};
/* One copied string argument, appended after the syscall entry payload. */
struct augmented_filename {
	unsigned int	size;			/* set from probe_read_str()'s return value */
	int		reserved;		/* written as 0 by sys_enter */
	char		value[PATH_MAX];	/* destination buffer for the copied string */
};
/*
 * Hard-coded syscall numbers used by the switch in sys_enter to decide which
 * argument, if any, is copied as a string.
 * NOTE(review): these look like x86_64 syscall numbers -- confirm; another
 * architecture would need a different table.
 *
 * SYS_EXECVE, SYS_RENAME and SYS_REQUEST_KEY appear in both lists with the
 * same value (a benign identical redefinition); sys_enter only handles them
 * in the first-argument group.
 */
/* syscalls where the first arg is a string */
#define SYS_OPEN 2
#define SYS_STAT 4
#define SYS_LSTAT 6
#define SYS_ACCESS 21
#define SYS_EXECVE 59
#define SYS_TRUNCATE 76
#define SYS_CHDIR 80
#define SYS_RENAME 82
#define SYS_MKDIR 83
#define SYS_RMDIR 84
#define SYS_CREAT 85
#define SYS_LINK 86
#define SYS_UNLINK 87
#define SYS_SYMLINK 88
#define SYS_READLINK 89
#define SYS_CHMOD 90
#define SYS_CHOWN 92
#define SYS_LCHOWN 94
#define SYS_MKNOD 133
#define SYS_STATFS 137
#define SYS_PIVOT_ROOT 155
#define SYS_CHROOT 161
#define SYS_ACCT 163
#define SYS_SWAPON 167
#define SYS_SWAPOFF 168
#define SYS_DELETE_MODULE 176
#define SYS_SETXATTR 188
#define SYS_LSETXATTR 189
#define SYS_GETXATTR 191
#define SYS_LGETXATTR 192
#define SYS_LISTXATTR 194
#define SYS_LLISTXATTR 195
#define SYS_REMOVEXATTR 197
#define SYS_LREMOVEXATTR 198
#define SYS_MQ_OPEN 240
#define SYS_MQ_UNLINK 241
#define SYS_ADD_KEY 248
#define SYS_REQUEST_KEY 249
#define SYS_SYMLINKAT 266
#define SYS_MEMFD_CREATE 319
/*
 * syscalls where the second arg is a string
 *
 * SYS_MOVE_MOUNT is defined here but its case label in sys_enter is
 * commented out, so its string argument is not copied.
 */
#define SYS_PWRITE64 18
#define SYS_EXECVE 59
#define SYS_RENAME 82
#define SYS_QUOTACTL 179
#define SYS_FSETXATTR 190
#define SYS_FGETXATTR 193
#define SYS_FREMOVEXATTR 199
#define SYS_MQ_TIMEDSEND 242
#define SYS_REQUEST_KEY 249
#define SYS_INOTIFY_ADD_WATCH 254
#define SYS_OPENAT 257
#define SYS_MKDIRAT 258
#define SYS_MKNODAT 259
#define SYS_FCHOWNAT 260
#define SYS_FUTIMESAT 261
#define SYS_NEWFSTATAT 262
#define SYS_UNLINKAT 263
#define SYS_RENAMEAT 264
#define SYS_LINKAT 265
#define SYS_READLINKAT 267
#define SYS_FCHMODAT 268
#define SYS_FACCESSAT 269
#define SYS_UTIMENSAT 280
#define SYS_NAME_TO_HANDLE_AT 303
#define SYS_FINIT_MODULE 313
#define SYS_RENAMEAT2 316
#define SYS_EXECVEAT 322
#define SYS_STATX 332
#define SYS_MOVE_MOUNT 429
#define SYS_FSPICK 433
/*
 * Set of pids to ignore: sys_enter and sys_exit return 0 when
 * pid_filter__has() reports that getpid() is in this filter.
 */
pid_filter(pids_filtered);
/*
 * Record emitted by sys_enter: a copy of the syscall entry payload followed
 * by the optional string argument.
 */
struct augmented_args_filename {
	struct syscall_enter_args	args;		/* copy of the tracepoint payload */
	struct augmented_filename	filename;	/* filled only when a string arg is copied */
};
/*
 * Single-entry per-CPU array used by sys_enter (always with key 0) as the
 * scratch record in which the output payload is assembled.
 * NOTE(review): presumably kept in a map because a PATH_MAX-sized buffer
 * does not fit on the BPF stack -- confirm.
 */
bpf_map(augmented_filename_map, PERCPU_ARRAY, int, struct augmented_args_filename, 1);
/*
 * sys_enter - raw_syscalls:sys_enter handler that emits the syscall entry
 * payload, optionally followed by a copy of one string argument.
 *
 * @args: the tracepoint payload (syscall number plus six raw arguments).
 *
 * Steps: fetch the per-CPU scratch record, skip filtered pids, copy the
 * payload into the scratch record, skip syscalls that are not enabled in the
 * 'syscalls' map, pick the string argument (first or second, by syscall
 * number), copy it, and emit the record through perf_event_output().
 *
 * Returns 1 when the scratch record lookup fails, 0 when the pid is filtered
 * or the syscall is not enabled, otherwise whatever perf_event_output()
 * returns. Per the comment at the final return, a non-zero value gets the
 * event recorded unaugmented.
 * NOTE(review): 0 presumably means the raw tracepoint event is dropped --
 * confirm against the perf BPF loader.
 */
SEC("raw_syscalls:sys_enter")
int sys_enter(struct syscall_enter_args *args)
{
struct augmented_args_filename *augmented_args;
/* Number of bytes to emit: starts as the whole record, trimmed further down. */
unsigned int len = sizeof(*augmented_args);
/* Stays NULL when the syscall has no string argument handled here. */
const void *filename_arg = NULL;
struct syscall *syscall;
/* The scratch map has a single entry, so the key is always 0. */
int key = 0;
augmented_args = bpf_map_lookup_elem(&augmented_filename_map, &key);
if (augmented_args == NULL)
return 1;
if (pid_filter__has(&pids_filtered, getpid()))
return 0;
/* Copy the tracepoint payload into the head of the scratch record. */
probe_read(&augmented_args->args, sizeof(augmented_args->args), args);
syscall = bpf_map_lookup_elem(&syscalls, &augmented_args->args.syscall_nr);
if (syscall == NULL || !syscall->enabled)
return 0;
/*
 * Yonghong and Edward Cree say:
 *
 * https://www.spinics.net/lists/netdev/msg531645.html
 *
 * >> R0=inv(id=0) R1=inv2 R6=ctx(id=0,off=0,imm=0) R7=inv64 R10=fp0,call_-1
 * >> 10: (bf) r1 = r6
 * >> 11: (07) r1 += 16
 * >> 12: (05) goto pc+2
 * >> 15: (79) r3 = *(u64 *)(r1 +0)
 * >> dereference of modified ctx ptr R1 off=16 disallowed
 * > Aha, we at least got a different error message this time.
 * > And indeed llvm has done that optimisation, rather than the more obvious
 * > 11: r3 = *(u64 *)(r1 +16)
 * > because it wants to have lots of reads share a single insn. You may be able
 * > to defeat that optimisation by adding compiler barriers, idk. Maybe someone
 * > with llvm knowledge can figure out how to stop it (ideally, llvm would know
 * > when it's generating for bpf backend and not do that). -O0? ¯\_(ツ)_/¯
 *
 * The optimization looks mostly like this:
 *
 *	br1:
 *	...
 *	r1 += 16
 *	goto merge
 *	br2:
 *	...
 *	r1 += 20
 *	goto merge
 *	merge:
 *	*(u64 *)(r1 + 0)
 *
 * The compiler tries to merge common loads. There is no easy way to
 * stop this compiler optimization without turning off a lot of other
 * optimizations. The easiest way is to add barriers:
 *
 *	__asm__ __volatile__("": : :"memory")
 *
 * after the ctx memory accesses to prevent their downstream merging.
 */
/*
 * This table of which args are strings will be provided by userspace,
 * in the syscalls map, i.e. we already have to do the lookup to see if
 * this specific syscall is filtered, so we can as well get more info
 * about which syscall args are strings or pointers, how many bytes to
 * copy per arg, etc.
 *
 * For now hard code it, until all the basic mechanisms are in place to
 * automate everything and make the kernel part be completely driven by
 * information obtained in userspace for each kernel version and
 * processor architecture, making the kernel part the same no matter what
 * kernel version or processor architecture it runs on.
 */
switch (augmented_args->args.syscall_nr) {
/* Syscalls whose first argument is the string to copy. */
case SYS_ACCT:
case SYS_ADD_KEY:
case SYS_CHDIR:
case SYS_CHMOD:
case SYS_CHOWN:
case SYS_CHROOT:
case SYS_CREAT:
case SYS_DELETE_MODULE:
case SYS_EXECVE:
case SYS_GETXATTR:
case SYS_LCHOWN:
case SYS_LGETXATTR:
case SYS_LINK:
case SYS_LISTXATTR:
case SYS_LLISTXATTR:
case SYS_LREMOVEXATTR:
case SYS_LSETXATTR:
case SYS_LSTAT:
case SYS_MEMFD_CREATE:
case SYS_MKDIR:
case SYS_MKNOD:
case SYS_MQ_OPEN:
case SYS_MQ_UNLINK:
case SYS_PIVOT_ROOT:
case SYS_READLINK:
case SYS_REMOVEXATTR:
case SYS_RENAME:
case SYS_REQUEST_KEY:
case SYS_RMDIR:
case SYS_SETXATTR:
case SYS_STAT:
case SYS_STATFS:
case SYS_SWAPOFF:
case SYS_SWAPON:
case SYS_SYMLINK:
case SYS_SYMLINKAT:
case SYS_TRUNCATE:
case SYS_UNLINK:
case SYS_ACCESS:
case SYS_OPEN: filename_arg = (const void *)args->args[0];
/* Barrier after the ctx read, see the long comment above the switch. */
__asm__ __volatile__("": : :"memory");
break;
/* Syscalls whose second argument is the string to copy. */
case SYS_EXECVEAT:
case SYS_FACCESSAT:
case SYS_FCHMODAT:
case SYS_FCHOWNAT:
case SYS_FGETXATTR:
case SYS_FINIT_MODULE:
case SYS_FREMOVEXATTR:
case SYS_FSETXATTR:
case SYS_FSPICK:
case SYS_FUTIMESAT:
case SYS_INOTIFY_ADD_WATCH:
case SYS_LINKAT:
case SYS_MKDIRAT:
case SYS_MKNODAT:
// case SYS_MOVE_MOUNT:
// For now don't copy move_mount's first string arg: it has two, and
// 'perf trace's syscall_arg__scnprintf_filename() would use the one
// copied here, the first, for both args, duplicating the first and
// ignoring the second.
//
// We need to copy both here and make syscall_arg__scnprintf_filename
// skip the first when reading the second, using the size of the first, etc.
// Shouldn't be difficult, but now it's perf/urgent time, let's wait for
// the next devel window.
case SYS_MQ_TIMEDSEND:
case SYS_NAME_TO_HANDLE_AT:
case SYS_NEWFSTATAT:
case SYS_PWRITE64:
case SYS_QUOTACTL:
case SYS_READLINKAT:
case SYS_RENAMEAT:
case SYS_RENAMEAT2:
case SYS_STATX:
case SYS_UNLINKAT:
case SYS_UTIMENSAT:
/*
 * NOTE(review): unlike the args[0] read above, no barrier follows this
 * ctx read -- confirm that the single barrier is sufficient.
 */
case SYS_OPENAT: filename_arg = (const void *)args->args[1];
break;
}
if (filename_arg != NULL) {
augmented_args->filename.reserved = 0;
/*
 * NOTE(review): 'size' is unsigned; if probe_read_str() can return a
 * negative error (as the bpf_probe_read_str helper does), it is stored
 * as a huge value, the trim below is skipped and the whole buffer,
 * including whatever an earlier event left in it, is emitted -- confirm.
 */
augmented_args->filename.size = probe_read_str(&augmented_args->filename.value,
sizeof(augmented_args->filename.value),
filename_arg);
if (augmented_args->filename.size < sizeof(augmented_args->filename.value)) {
/* Drop the unused tail of filename.value from the emitted length. */
len -= sizeof(augmented_args->filename.value) - augmented_args->filename.size;
/*
 * Mask len so it is bounded by sizeof(filename.value) - 1.
 * NOTE(review): presumably there to give the BPF verifier an upper
 * bound; assuming PATH_MAX is a power of two this also wraps len
 * around for strings long enough that header + string reaches
 * PATH_MAX -- confirm.
 */
len &= sizeof(augmented_args->filename.value) - 1;
}
} else {
/* No string argument: emit only the copied syscall entry payload. */
len = sizeof(augmented_args->args);
}
/* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */
return perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, augmented_args, len);
}
/*
 * sys_exit - raw_syscalls:sys_exit handler; it only decides whether the
 * event is kept, nothing is appended to it.
 *
 * @args: the tracepoint payload for the syscall exit.
 *
 * Returns 0 when the current pid is in 'pids_filtered' or the syscall has
 * no enabled entry in the 'syscalls' map, 1 otherwise.
 */
SEC("raw_syscalls:sys_exit")
int sys_exit(struct syscall_exit_args *args)
{
	struct syscall_exit_args payload;
	struct syscall *entry;

	if (pid_filter__has(&pids_filtered, getpid()))
		return 0;

	/* Work on a local copy of the tracepoint payload. */
	probe_read(&payload, sizeof(payload), args);

	entry = bpf_map_lookup_elem(&syscalls, &payload.syscall_nr);
	if (entry != NULL && entry->enabled)
		return 1;

	return 0;
}
license(GPL);