bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY

Add the ability to memory-map the contents of a BPF array map. This is extremely
useful for working with BPF global data from userspace programs. It avoids the
typical bpf_map_{lookup,update}_elem operations, improving both performance
and usability.

Special care had to be taken with map freezing, to avoid having a
writable memory view into a frozen map. To solve this issue, map freezing and
mmap-ing now both happen under a mutex:
  - if map is already frozen, no writable mapping is allowed;
  - if map has writable memory mappings active (accounted in map->writecnt),
    map freezing will keep failing with -EBUSY;
  - once number of writable memory mappings drops to zero, map freezing can be
    performed again.

Only non-per-CPU plain arrays are supported right now. Maps with spinlocks
can't be memory mapped either.

For BPF_F_MMAPABLE array, memory allocation has to be done through vmalloc()
to be mmap()'able. We also need to make sure that array data memory is
page-sized and page-aligned, so we over-allocate memory in such a way that
struct bpf_array is at the end of a single page of memory with array->value
being aligned with the start of the second page. On deallocation we need to
accommodate this memory arrangement to free vmalloc()'ed memory correctly.

One important consideration concerns how the memory-mapping subsystem works.
The memory-mapping subsystem provides a few optional callbacks, among them open()
and close().  close() is called for each memory region that is unmapped, so
that users can decrease their reference counters and free up resources, if
necessary. open() is *almost* symmetrical: it's called for each memory region
that is being mapped, **except** the very first one. So bpf_map_mmap does
initial refcnt bump, while open() will do any extra ones after that. Thus
number of close() calls is equal to number of open() calls plus one more.

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Liu <songliubraving@fb.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Link: https://lore.kernel.org/bpf/20191117172806.2195367-4-andriin@fb.com
This commit is contained in:
Andrii Nakryiko 2019-11-17 09:28:04 -08:00 committed by Daniel Borkmann
parent 85192dbf4d
commit fc9702273e
7 changed files with 183 additions and 12 deletions

View File

@ -12,6 +12,7 @@
#include <linux/err.h> #include <linux/err.h>
#include <linux/rbtree_latch.h> #include <linux/rbtree_latch.h>
#include <linux/numa.h> #include <linux/numa.h>
#include <linux/mm_types.h>
#include <linux/wait.h> #include <linux/wait.h>
#include <linux/u64_stats_sync.h> #include <linux/u64_stats_sync.h>
#include <linux/refcount.h> #include <linux/refcount.h>
@ -68,6 +69,7 @@ struct bpf_map_ops {
u64 *imm, u32 off); u64 *imm, u32 off);
int (*map_direct_value_meta)(const struct bpf_map *map, int (*map_direct_value_meta)(const struct bpf_map *map,
u64 imm, u32 *off); u64 imm, u32 *off);
int (*map_mmap)(struct bpf_map *map, struct vm_area_struct *vma);
}; };
struct bpf_map_memory { struct bpf_map_memory {
@ -96,9 +98,10 @@ struct bpf_map {
u32 btf_value_type_id; u32 btf_value_type_id;
struct btf *btf; struct btf *btf;
struct bpf_map_memory memory; struct bpf_map_memory memory;
char name[BPF_OBJ_NAME_LEN];
bool unpriv_array; bool unpriv_array;
bool frozen; /* write-once */ bool frozen; /* write-once; write-protected by freeze_mutex */
/* 48 bytes hole */ /* 22 bytes hole */
/* The 3rd and 4th cacheline with misc members to avoid false sharing /* The 3rd and 4th cacheline with misc members to avoid false sharing
* particularly with refcounting. * particularly with refcounting.
@ -106,7 +109,8 @@ struct bpf_map {
atomic64_t refcnt ____cacheline_aligned; atomic64_t refcnt ____cacheline_aligned;
atomic64_t usercnt; atomic64_t usercnt;
struct work_struct work; struct work_struct work;
char name[BPF_OBJ_NAME_LEN]; struct mutex freeze_mutex;
u64 writecnt; /* writable mmap cnt; protected by freeze_mutex */
}; };
static inline bool map_value_has_spin_lock(const struct bpf_map *map) static inline bool map_value_has_spin_lock(const struct bpf_map *map)
@ -795,6 +799,7 @@ void bpf_map_charge_finish(struct bpf_map_memory *mem);
void bpf_map_charge_move(struct bpf_map_memory *dst, void bpf_map_charge_move(struct bpf_map_memory *dst,
struct bpf_map_memory *src); struct bpf_map_memory *src);
void *bpf_map_area_alloc(size_t size, int numa_node); void *bpf_map_area_alloc(size_t size, int numa_node);
void *bpf_map_area_mmapable_alloc(size_t size, int numa_node);
void bpf_map_area_free(void *base); void bpf_map_area_free(void *base);
void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);

View File

@ -93,6 +93,7 @@ extern void *vzalloc(unsigned long size);
extern void *vmalloc_user(unsigned long size); extern void *vmalloc_user(unsigned long size);
extern void *vmalloc_node(unsigned long size, int node); extern void *vmalloc_node(unsigned long size, int node);
extern void *vzalloc_node(unsigned long size, int node); extern void *vzalloc_node(unsigned long size, int node);
extern void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags);
extern void *vmalloc_exec(unsigned long size); extern void *vmalloc_exec(unsigned long size);
extern void *vmalloc_32(unsigned long size); extern void *vmalloc_32(unsigned long size);
extern void *vmalloc_32_user(unsigned long size); extern void *vmalloc_32_user(unsigned long size);

View File

@ -348,6 +348,9 @@ enum bpf_attach_type {
/* Clone map from listener for newly accepted socket */ /* Clone map from listener for newly accepted socket */
#define BPF_F_CLONE (1U << 9) #define BPF_F_CLONE (1U << 9)
/* Enable memory-mapping BPF map */
#define BPF_F_MMAPABLE (1U << 10)
/* flags for BPF_PROG_QUERY */ /* flags for BPF_PROG_QUERY */
#define BPF_F_QUERY_EFFECTIVE (1U << 0) #define BPF_F_QUERY_EFFECTIVE (1U << 0)

View File

@ -14,7 +14,7 @@
#include "map_in_map.h" #include "map_in_map.h"
#define ARRAY_CREATE_FLAG_MASK \ #define ARRAY_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK)
static void bpf_array_free_percpu(struct bpf_array *array) static void bpf_array_free_percpu(struct bpf_array *array)
{ {
@ -59,6 +59,10 @@ int array_map_alloc_check(union bpf_attr *attr)
(percpu && numa_node != NUMA_NO_NODE)) (percpu && numa_node != NUMA_NO_NODE))
return -EINVAL; return -EINVAL;
if (attr->map_type != BPF_MAP_TYPE_ARRAY &&
attr->map_flags & BPF_F_MMAPABLE)
return -EINVAL;
if (attr->value_size > KMALLOC_MAX_SIZE) if (attr->value_size > KMALLOC_MAX_SIZE)
/* if value_size is bigger, the user space won't be able to /* if value_size is bigger, the user space won't be able to
* access the elements. * access the elements.
@ -102,10 +106,19 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
} }
array_size = sizeof(*array); array_size = sizeof(*array);
if (percpu) if (percpu) {
array_size += (u64) max_entries * sizeof(void *); array_size += (u64) max_entries * sizeof(void *);
else } else {
/* rely on vmalloc() to return page-aligned memory and
* ensure array->value is exactly page-aligned
*/
if (attr->map_flags & BPF_F_MMAPABLE) {
array_size = PAGE_ALIGN(array_size);
array_size += PAGE_ALIGN((u64) max_entries * elem_size);
} else {
array_size += (u64) max_entries * elem_size; array_size += (u64) max_entries * elem_size;
}
}
/* make sure there is no u32 overflow later in round_up() */ /* make sure there is no u32 overflow later in round_up() */
cost = array_size; cost = array_size;
@ -117,7 +130,20 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
return ERR_PTR(ret); return ERR_PTR(ret);
/* allocate all map elements and zero-initialize them */ /* allocate all map elements and zero-initialize them */
if (attr->map_flags & BPF_F_MMAPABLE) {
void *data;
/* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */
data = bpf_map_area_mmapable_alloc(array_size, numa_node);
if (!data) {
bpf_map_charge_finish(&mem);
return ERR_PTR(-ENOMEM);
}
array = data + PAGE_ALIGN(sizeof(struct bpf_array))
- offsetof(struct bpf_array, value);
} else {
array = bpf_map_area_alloc(array_size, numa_node); array = bpf_map_area_alloc(array_size, numa_node);
}
if (!array) { if (!array) {
bpf_map_charge_finish(&mem); bpf_map_charge_finish(&mem);
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
@ -350,6 +376,11 @@ static int array_map_delete_elem(struct bpf_map *map, void *key)
return -EINVAL; return -EINVAL;
} }
/* Translate a pointer to an mmap-able struct bpf_array back to the address
 * handed to bpf_map_area_free()/remap_vmalloc_range(), by rounding it down
 * to the enclosing page boundary.
 *
 * NOTE(review): this relies on struct bpf_array living entirely inside the
 * first page of the allocation, as the commit message describes -- confirm
 * that holds for every supported PAGE_SIZE.
 */
static void *array_map_vmalloc_addr(struct bpf_array *array)
{
	unsigned long addr = (unsigned long)array;

	return (void *)round_down(addr, PAGE_SIZE);
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void array_map_free(struct bpf_map *map) static void array_map_free(struct bpf_map *map)
{ {
@ -365,6 +396,9 @@ static void array_map_free(struct bpf_map *map)
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
bpf_array_free_percpu(array); bpf_array_free_percpu(array);
if (array->map.map_flags & BPF_F_MMAPABLE)
bpf_map_area_free(array_map_vmalloc_addr(array));
else
bpf_map_area_free(array); bpf_map_area_free(array);
} }
@ -444,6 +478,17 @@ static int array_map_check_btf(const struct bpf_map *map,
return 0; return 0;
} }
int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
pgoff_t pgoff = PAGE_ALIGN(sizeof(*array)) >> PAGE_SHIFT;
if (!(map->map_flags & BPF_F_MMAPABLE))
return -EINVAL;
return remap_vmalloc_range(vma, array_map_vmalloc_addr(array), pgoff);
}
const struct bpf_map_ops array_map_ops = { const struct bpf_map_ops array_map_ops = {
.map_alloc_check = array_map_alloc_check, .map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc, .map_alloc = array_map_alloc,
@ -455,6 +500,7 @@ const struct bpf_map_ops array_map_ops = {
.map_gen_lookup = array_map_gen_lookup, .map_gen_lookup = array_map_gen_lookup,
.map_direct_value_addr = array_map_direct_value_addr, .map_direct_value_addr = array_map_direct_value_addr,
.map_direct_value_meta = array_map_direct_value_meta, .map_direct_value_meta = array_map_direct_value_meta,
.map_mmap = array_map_mmap,
.map_seq_show_elem = array_map_seq_show_elem, .map_seq_show_elem = array_map_seq_show_elem,
.map_check_btf = array_map_check_btf, .map_check_btf = array_map_check_btf,
}; };

View File

@ -127,7 +127,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
return map; return map;
} }
void *bpf_map_area_alloc(size_t size, int numa_node) static void *__bpf_map_area_alloc(size_t size, int numa_node, bool mmapable)
{ {
/* We really just want to fail instead of triggering OOM killer /* We really just want to fail instead of triggering OOM killer
* under memory pressure, therefore we set __GFP_NORETRY to kmalloc, * under memory pressure, therefore we set __GFP_NORETRY to kmalloc,
@ -142,18 +142,33 @@ void *bpf_map_area_alloc(size_t size, int numa_node)
const gfp_t flags = __GFP_NOWARN | __GFP_ZERO; const gfp_t flags = __GFP_NOWARN | __GFP_ZERO;
void *area; void *area;
if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { /* kmalloc()'ed memory can't be mmap()'ed */
if (!mmapable && size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags, area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags,
numa_node); numa_node);
if (area != NULL) if (area != NULL)
return area; return area;
} }
if (mmapable) {
BUG_ON(!PAGE_ALIGNED(size));
return vmalloc_user_node_flags(size, numa_node, GFP_KERNEL |
__GFP_RETRY_MAYFAIL | flags);
}
return __vmalloc_node_flags_caller(size, numa_node, return __vmalloc_node_flags_caller(size, numa_node,
GFP_KERNEL | __GFP_RETRY_MAYFAIL | GFP_KERNEL | __GFP_RETRY_MAYFAIL |
flags, __builtin_return_address(0)); flags, __builtin_return_address(0));
} }
/* Allocate a map area that does not need to be mmap()'able.
 *
 * Forwards to __bpf_map_area_alloc() with mmapable == false, so the area may
 * come from kmalloc_node() or from vmalloc; release it with
 * bpf_map_area_free().
 */
void *bpf_map_area_alloc(size_t size, int numa_node)
{
return __bpf_map_area_alloc(size, numa_node, false);
}
/* Allocate a map area that can later be mmap()'ed.
 *
 * Forwards to __bpf_map_area_alloc() with mmapable == true, which skips
 * kmalloc and BUG()s if @size is not page-aligned, so callers must pass a
 * page-aligned size.
 */
void *bpf_map_area_mmapable_alloc(size_t size, int numa_node)
{
return __bpf_map_area_alloc(size, numa_node, true);
}
void bpf_map_area_free(void *area) void bpf_map_area_free(void *area)
{ {
kvfree(area); kvfree(area);
@ -425,6 +440,74 @@ static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
return -EINVAL; return -EINVAL;
} }
/* called for any extra memory-mapped regions (except initial) */
static void bpf_map_mmap_open(struct vm_area_struct *vma)
{
struct bpf_map *map = vma->vm_file->private_data;
bpf_map_inc_with_uref(map);
if (vma->vm_flags & VM_WRITE) {
mutex_lock(&map->freeze_mutex);
map->writecnt++;
mutex_unlock(&map->freeze_mutex);
}
}
/* called for all unmapped memory region (including initial) */
static void bpf_map_mmap_close(struct vm_area_struct *vma)
{
struct bpf_map *map = vma->vm_file->private_data;
if (vma->vm_flags & VM_WRITE) {
mutex_lock(&map->freeze_mutex);
map->writecnt--;
mutex_unlock(&map->freeze_mutex);
}
bpf_map_put_with_uref(map);
}
static const struct vm_operations_struct bpf_map_default_vmops = {
.open = bpf_map_mmap_open,
.close = bpf_map_mmap_close,
};
static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct bpf_map *map = filp->private_data;
int err;
if (!map->ops->map_mmap || map_value_has_spin_lock(map))
return -ENOTSUPP;
if (!(vma->vm_flags & VM_SHARED))
return -EINVAL;
mutex_lock(&map->freeze_mutex);
if ((vma->vm_flags & VM_WRITE) && map->frozen) {
err = -EPERM;
goto out;
}
/* set default open/close callbacks */
vma->vm_ops = &bpf_map_default_vmops;
vma->vm_private_data = map;
err = map->ops->map_mmap(map, vma);
if (err)
goto out;
bpf_map_inc_with_uref(map);
if (vma->vm_flags & VM_WRITE)
map->writecnt++;
out:
mutex_unlock(&map->freeze_mutex);
return err;
}
const struct file_operations bpf_map_fops = { const struct file_operations bpf_map_fops = {
#ifdef CONFIG_PROC_FS #ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_map_show_fdinfo, .show_fdinfo = bpf_map_show_fdinfo,
@ -432,6 +515,7 @@ const struct file_operations bpf_map_fops = {
.release = bpf_map_release, .release = bpf_map_release,
.read = bpf_dummy_read, .read = bpf_dummy_read,
.write = bpf_dummy_write, .write = bpf_dummy_write,
.mmap = bpf_map_mmap,
}; };
int bpf_map_new_fd(struct bpf_map *map, int flags) int bpf_map_new_fd(struct bpf_map *map, int flags)
@ -577,6 +661,7 @@ static int map_create(union bpf_attr *attr)
atomic64_set(&map->refcnt, 1); atomic64_set(&map->refcnt, 1);
atomic64_set(&map->usercnt, 1); atomic64_set(&map->usercnt, 1);
mutex_init(&map->freeze_mutex);
if (attr->btf_key_type_id || attr->btf_value_type_id) { if (attr->btf_key_type_id || attr->btf_value_type_id) {
struct btf *btf; struct btf *btf;
@ -1163,6 +1248,13 @@ static int map_freeze(const union bpf_attr *attr)
map = __bpf_map_get(f); map = __bpf_map_get(f);
if (IS_ERR(map)) if (IS_ERR(map))
return PTR_ERR(map); return PTR_ERR(map);
mutex_lock(&map->freeze_mutex);
if (map->writecnt) {
err = -EBUSY;
goto err_put;
}
if (READ_ONCE(map->frozen)) { if (READ_ONCE(map->frozen)) {
err = -EBUSY; err = -EBUSY;
goto err_put; goto err_put;
@ -1174,6 +1266,7 @@ static int map_freeze(const union bpf_attr *attr)
WRITE_ONCE(map->frozen, true); WRITE_ONCE(map->frozen, true);
err_put: err_put:
mutex_unlock(&map->freeze_mutex);
fdput(f); fdput(f);
return err; return err;
} }

View File

@ -2671,6 +2671,26 @@ void *vzalloc_node(unsigned long size, int node)
} }
EXPORT_SYMBOL(vzalloc_node); EXPORT_SYMBOL(vzalloc_node);
/**
 * vmalloc_user_node_flags - zeroed, user-mappable vmalloc on a given node
 * @size: number of bytes to allocate
 * @node: NUMA node to allocate on
 * @flags: page-allocator flags; __GFP_ZERO is always added
 *
 * The area is requested with VM_USERMAP and SHMLBA alignment. Because the
 * memory is zeroed it can be mapped to userspace without leaking data.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags)
{
	const gfp_t zeroing_flags = flags | __GFP_ZERO;
	const void *caller = __builtin_return_address(0);

	return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
				    zeroing_flags, PAGE_KERNEL, VM_USERMAP,
				    node, caller);
}
EXPORT_SYMBOL(vmalloc_user_node_flags);
/** /**
* vmalloc_exec - allocate virtually contiguous, executable memory * vmalloc_exec - allocate virtually contiguous, executable memory
* @size: allocation size * @size: allocation size

View File

@ -348,6 +348,9 @@ enum bpf_attach_type {
/* Clone map from listener for newly accepted socket */ /* Clone map from listener for newly accepted socket */
#define BPF_F_CLONE (1U << 9) #define BPF_F_CLONE (1U << 9)
/* Enable memory-mapping BPF map */
#define BPF_F_MMAPABLE (1U << 10)
/* flags for BPF_PROG_QUERY */ /* flags for BPF_PROG_QUERY */
#define BPF_F_QUERY_EFFECTIVE (1U << 0) #define BPF_F_QUERY_EFFECTIVE (1U << 0)