io_uring-5.10-2020-10-24

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAl+UQh8QHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpl7WEADOTslFOof1RUPMb0Qvj4GO4cjvoFLW7KLt
 B83PmlW3WJpZrSiqZlrSPwcDELVphw67RL/2hp0jAfT1t00OdCOYQDmh7+kg9lnI
 fzu4NzfTKbriRWEtodIqZCiDoGXjzJGxNffhxPEt33YxRErI/fvuD/TzxwGGUInW
 OZ3Aze9Nj2DQ/eXhio48n4letTK6xNsjGDWvzwinthHWeBbID01isLlTei20PKU5
 Dk1buueUuEr/vNjJwEeRd8yDXZeLZ/br3gw/3B71MJoi2PUaXvuS8DV4LmXg2SS5
 yN0udSNk4AP/UlrVqN9bEqdbSTBSf2JIEW3k3/SEUjcjw6hMnbLeoW2vZx6Xvk6T
 vvAVHesLpCu8oEdWAkFm6Rb6ptJ1XpRrWWYxi1J1SB2Y8cGyGS1GoZWWPknM5M3I
 b1dNj18Bb+MmFvuKr7YYrb77tECuywxTHVGj6WwBOIlYrg44XQOumYYH9OmvZFz1
 6vWaXjLPOIM8fpAKX5Tx5sAy/FMl17H8I5AD2bZVvD0h0MqzLnvHEYahcAfOfb9y
 qpkdGnbAWo6IIkCrDcSOV4q6dmWu3as9eSs1j/6Xl4WoJ2MT9C//Gpv7iNMxxozy
 CznEPcbA8N9QazQmoebtB3gTBVyGUUKVDdVNzleMj9KD6yPlKFZ6+FZdikX59I9M
 t9QGh3+gow==
 =xidc
 -----END PGP SIGNATURE-----

Merge tag 'io_uring-5.10-2020-10-24' of git://git.kernel.dk/linux-block

Pull io_uring fixes from Jens Axboe:

 - fsize was missed in the previous unification of work flags

 - A few fixes cleaning up the creds cases of the work flags unification
   (Pavel)

 - Fix io-wq NUMA affinities for a node that was completely unplugged and
   then replugged

 - Two fallout fixes from the set_fs changes. One local to io_uring, one
   for the splice entry point that io_uring uses.

 - Linked timeout fixes (Pavel)

 - Removal of ->flush() ->files work-around that we don't need anymore
   with referenced files (Pavel)

 - Various cleanups (Pavel)

* tag 'io_uring-5.10-2020-10-24' of git://git.kernel.dk/linux-block:
  splice: change exported internal do_splice() helper to take kernel offset
  io_uring: make loop_rw_iter() use original user supplied pointers
  io_uring: remove req cancel in ->flush()
  io-wq: re-set NUMA node affinities if CPUs come online
  io_uring: don't reuse linked_timeout
  io_uring: unify fsize with def->work_flags
  io_uring: fix racy REQ_F_LINK_TIMEOUT clearing
  io_uring: do poll's hash_node init in common code
  io_uring: inline io_poll_task_handler()
  io_uring: remove extra ->file check in poll prep
  io_uring: make cached_cq_overflow non atomic_t
  io_uring: inline io_fail_links()
  io_uring: kill ref get/drop in personality init
  io_uring: flags-based creds init in queue
This commit is contained in:
Linus Torvalds 2020-10-24 12:40:18 -07:00
commit af0041875c
5 changed files with 193 additions and 120 deletions

View File

@ -19,7 +19,9 @@
#include <linux/task_work.h> #include <linux/task_work.h>
#include <linux/blk-cgroup.h> #include <linux/blk-cgroup.h>
#include <linux/audit.h> #include <linux/audit.h>
#include <linux/cpu.h>
#include "../kernel/sched/sched.h"
#include "io-wq.h" #include "io-wq.h"
#define WORKER_IDLE_TIMEOUT (5 * HZ) #define WORKER_IDLE_TIMEOUT (5 * HZ)
@ -123,9 +125,13 @@ struct io_wq {
refcount_t refs; refcount_t refs;
struct completion done; struct completion done;
struct hlist_node cpuhp_node;
refcount_t use_refs; refcount_t use_refs;
}; };
static enum cpuhp_state io_wq_online;
static bool io_worker_get(struct io_worker *worker) static bool io_worker_get(struct io_worker *worker)
{ {
return refcount_inc_not_zero(&worker->ref); return refcount_inc_not_zero(&worker->ref);
@ -187,7 +193,8 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
worker->blkcg_css = NULL; worker->blkcg_css = NULL;
} }
#endif #endif
if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY)
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
return dropped_lock; return dropped_lock;
} }
@ -483,7 +490,10 @@ static void io_impersonate_work(struct io_worker *worker,
if ((work->flags & IO_WQ_WORK_CREDS) && if ((work->flags & IO_WQ_WORK_CREDS) &&
worker->cur_creds != work->identity->creds) worker->cur_creds != work->identity->creds)
io_wq_switch_creds(worker, work); io_wq_switch_creds(worker, work);
if (work->flags & IO_WQ_WORK_FSIZE)
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->identity->fsize; current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->identity->fsize;
else if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY)
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
io_wq_switch_blkcg(worker, work); io_wq_switch_blkcg(worker, work);
#ifdef CONFIG_AUDIT #ifdef CONFIG_AUDIT
current->loginuid = work->identity->loginuid; current->loginuid = work->identity->loginuid;
@ -1087,10 +1097,12 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
wq->wqes = kcalloc(nr_node_ids, sizeof(struct io_wqe *), GFP_KERNEL); wq->wqes = kcalloc(nr_node_ids, sizeof(struct io_wqe *), GFP_KERNEL);
if (!wq->wqes) { if (!wq->wqes)
kfree(wq); goto err_wq;
return ERR_PTR(-ENOMEM);
} ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node);
if (ret)
goto err_wqes;
wq->free_work = data->free_work; wq->free_work = data->free_work;
wq->do_work = data->do_work; wq->do_work = data->do_work;
@ -1098,6 +1110,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
/* caller must already hold a reference to this */ /* caller must already hold a reference to this */
wq->user = data->user; wq->user = data->user;
ret = -ENOMEM;
for_each_node(node) { for_each_node(node) {
struct io_wqe *wqe; struct io_wqe *wqe;
int alloc_node = node; int alloc_node = node;
@ -1141,9 +1154,12 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
ret = PTR_ERR(wq->manager); ret = PTR_ERR(wq->manager);
complete(&wq->done); complete(&wq->done);
err: err:
cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
for_each_node(node) for_each_node(node)
kfree(wq->wqes[node]); kfree(wq->wqes[node]);
err_wqes:
kfree(wq->wqes); kfree(wq->wqes);
err_wq:
kfree(wq); kfree(wq);
return ERR_PTR(ret); return ERR_PTR(ret);
} }
@ -1160,6 +1176,8 @@ static void __io_wq_destroy(struct io_wq *wq)
{ {
int node; int node;
cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
set_bit(IO_WQ_BIT_EXIT, &wq->state); set_bit(IO_WQ_BIT_EXIT, &wq->state);
if (wq->manager) if (wq->manager)
kthread_stop(wq->manager); kthread_stop(wq->manager);
@ -1187,3 +1205,41 @@ struct task_struct *io_wq_get_task(struct io_wq *wq)
{ {
return wq->manager; return wq->manager;
} }
/*
 * Per-worker callback used by io_wq_cpu_online(): restrict the worker's task
 * to the CPUs of the node its wqe is bound to (cpumask_of_node() of
 * worker->wqe->node).
 *
 * @worker: worker whose task gets its allowed-CPU mask rewritten
 * @data: unused
 *
 * Always returns false. NOTE(review): presumably false tells
 * io_wq_for_each_worker() to keep iterating -- confirm against that helper.
 */
static bool io_wq_worker_affinity(struct io_worker *worker, void *data)
{
struct task_struct *task = worker->task;
struct rq_flags rf;
struct rq *rq;
/* the mask and flag updates below are done with the task's rq locked */
rq = task_rq_lock(task, &rf);
do_set_cpus_allowed(task, cpumask_of_node(worker->wqe->node));
/* NOTE(review): presumably blocks later affinity changes from userspace -- confirm */
task->flags |= PF_NO_SETAFFINITY;
task_rq_unlock(rq, task, &rf);
return false;
}
/*
 * CPU hotplug callback (registered in io_wq_init()) invoked for one io_wq
 * instance: re-applies node affinity to every worker of every node's wqe by
 * running io_wq_worker_affinity() on each of them.
 *
 * @cpu: unused
 * @node: the cpuhp_node embedded in the io_wq to process
 *
 * Always returns 0.
 */
static int io_wq_cpu_online(unsigned int cpu, struct hlist_node *node)
{
struct io_wq *wq = hlist_entry_safe(node, struct io_wq, cpuhp_node);
int i;
/* the worker walk runs inside an RCU read-side critical section */
rcu_read_lock();
for_each_node(i)
io_wq_for_each_worker(wq->wqes[i], io_wq_worker_affinity, NULL);
rcu_read_unlock();
return 0;
}
/*
 * Init-time setup, run via subsys_initcall(): registers a multi-instance CPU
 * hotplug state named "io-wq/online" with io_wq_cpu_online() as its callback
 * and no second (teardown) callback, then remembers the state in io_wq_online.
 * io_wq_create() and __io_wq_destroy() add/remove per-io_wq instances on
 * that state.
 *
 * Returns 0 on success, or the negative value returned by
 * cpuhp_setup_state_multi() on failure.
 */
static __init int io_wq_init(void)
{
int ret;
ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "io-wq/online",
io_wq_cpu_online, NULL);
if (ret < 0)
return ret;
/*
 * NOTE(review): for CPUHP_AP_ONLINE_DYN a non-negative return is presumably
 * the dynamically allocated state id -- confirm against the cpuhp API docs.
 */
io_wq_online = ret;
return 0;
}
subsys_initcall(io_wq_init);

View File

@ -17,6 +17,7 @@ enum {
IO_WQ_WORK_MM = 128, IO_WQ_WORK_MM = 128,
IO_WQ_WORK_CREDS = 256, IO_WQ_WORK_CREDS = 256,
IO_WQ_WORK_BLKCG = 512, IO_WQ_WORK_BLKCG = 512,
IO_WQ_WORK_FSIZE = 1024,
IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */ IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
}; };

View File

@ -277,7 +277,7 @@ struct io_ring_ctx {
unsigned sq_mask; unsigned sq_mask;
unsigned sq_thread_idle; unsigned sq_thread_idle;
unsigned cached_sq_dropped; unsigned cached_sq_dropped;
atomic_t cached_cq_overflow; unsigned cached_cq_overflow;
unsigned long sq_check_overflow; unsigned long sq_check_overflow;
struct list_head defer_list; struct list_head defer_list;
@ -585,6 +585,7 @@ enum {
REQ_F_BUFFER_SELECTED_BIT, REQ_F_BUFFER_SELECTED_BIT,
REQ_F_NO_FILE_TABLE_BIT, REQ_F_NO_FILE_TABLE_BIT,
REQ_F_WORK_INITIALIZED_BIT, REQ_F_WORK_INITIALIZED_BIT,
REQ_F_LTIMEOUT_ACTIVE_BIT,
/* not a real bit, just to check we're not overflowing the space */ /* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT, __REQ_F_LAST_BIT,
@ -614,7 +615,7 @@ enum {
REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
/* must not punt to workers */ /* must not punt to workers */
REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
/* has linked timeout */ /* has or had linked timeout */
REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
/* regular file */ /* regular file */
REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
@ -628,6 +629,8 @@ enum {
REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT), REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT),
/* io_wq_work is initialized */ /* io_wq_work is initialized */
REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT), REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT),
/* linked timeout is active, i.e. prepared by link's head */
REQ_F_LTIMEOUT_ACTIVE = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
}; };
struct async_poll { struct async_poll {
@ -750,8 +753,6 @@ struct io_op_def {
unsigned pollout : 1; unsigned pollout : 1;
/* op supports buffer selection */ /* op supports buffer selection */
unsigned buffer_select : 1; unsigned buffer_select : 1;
/* needs rlimit(RLIMIT_FSIZE) assigned */
unsigned needs_fsize : 1;
/* must always have async data allocated */ /* must always have async data allocated */
unsigned needs_async_data : 1; unsigned needs_async_data : 1;
/* size of async data needed, if any */ /* size of async data needed, if any */
@ -775,10 +776,10 @@ static const struct io_op_def io_op_defs[] = {
.hash_reg_file = 1, .hash_reg_file = 1,
.unbound_nonreg_file = 1, .unbound_nonreg_file = 1,
.pollout = 1, .pollout = 1,
.needs_fsize = 1,
.needs_async_data = 1, .needs_async_data = 1,
.async_size = sizeof(struct io_async_rw), .async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
IO_WQ_WORK_FSIZE,
}, },
[IORING_OP_FSYNC] = { [IORING_OP_FSYNC] = {
.needs_file = 1, .needs_file = 1,
@ -789,16 +790,16 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1, .unbound_nonreg_file = 1,
.pollin = 1, .pollin = 1,
.async_size = sizeof(struct io_async_rw), .async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_BLKCG, .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
}, },
[IORING_OP_WRITE_FIXED] = { [IORING_OP_WRITE_FIXED] = {
.needs_file = 1, .needs_file = 1,
.hash_reg_file = 1, .hash_reg_file = 1,
.unbound_nonreg_file = 1, .unbound_nonreg_file = 1,
.pollout = 1, .pollout = 1,
.needs_fsize = 1,
.async_size = sizeof(struct io_async_rw), .async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_BLKCG, .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE |
IO_WQ_WORK_MM,
}, },
[IORING_OP_POLL_ADD] = { [IORING_OP_POLL_ADD] = {
.needs_file = 1, .needs_file = 1,
@ -856,8 +857,7 @@ static const struct io_op_def io_op_defs[] = {
}, },
[IORING_OP_FALLOCATE] = { [IORING_OP_FALLOCATE] = {
.needs_file = 1, .needs_file = 1,
.needs_fsize = 1, .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE,
.work_flags = IO_WQ_WORK_BLKCG,
}, },
[IORING_OP_OPENAT] = { [IORING_OP_OPENAT] = {
.work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG | .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG |
@ -887,9 +887,9 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1, .needs_file = 1,
.unbound_nonreg_file = 1, .unbound_nonreg_file = 1,
.pollout = 1, .pollout = 1,
.needs_fsize = 1,
.async_size = sizeof(struct io_async_rw), .async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
IO_WQ_WORK_FSIZE,
}, },
[IORING_OP_FADVISE] = { [IORING_OP_FADVISE] = {
.needs_file = 1, .needs_file = 1,
@ -1070,6 +1070,12 @@ static void io_init_identity(struct io_identity *id)
refcount_set(&id->count, 1); refcount_set(&id->count, 1);
} }
static inline void __io_req_init_async(struct io_kiocb *req)
{
memset(&req->work, 0, sizeof(req->work));
req->flags |= REQ_F_WORK_INITIALIZED;
}
/* /*
* Note: must call io_req_init_async() for the first time you * Note: must call io_req_init_async() for the first time you
* touch any members of io_wq_work. * touch any members of io_wq_work.
@ -1081,8 +1087,7 @@ static inline void io_req_init_async(struct io_kiocb *req)
if (req->flags & REQ_F_WORK_INITIALIZED) if (req->flags & REQ_F_WORK_INITIALIZED)
return; return;
memset(&req->work, 0, sizeof(req->work)); __io_req_init_async(req);
req->flags |= REQ_F_WORK_INITIALIZED;
/* Grab a ref if this isn't our static identity */ /* Grab a ref if this isn't our static identity */
req->work.identity = tctx->identity; req->work.identity = tctx->identity;
@ -1174,7 +1179,7 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
struct io_ring_ctx *ctx = req->ctx; struct io_ring_ctx *ctx = req->ctx;
return seq != ctx->cached_cq_tail return seq != ctx->cached_cq_tail
+ atomic_read(&ctx->cached_cq_overflow); + READ_ONCE(ctx->cached_cq_overflow);
} }
return false; return false;
@ -1285,8 +1290,11 @@ static bool io_grab_identity(struct io_kiocb *req)
struct io_identity *id = req->work.identity; struct io_identity *id = req->work.identity;
struct io_ring_ctx *ctx = req->ctx; struct io_ring_ctx *ctx = req->ctx;
if (def->needs_fsize && id->fsize != rlimit(RLIMIT_FSIZE)) if (def->work_flags & IO_WQ_WORK_FSIZE) {
if (id->fsize != rlimit(RLIMIT_FSIZE))
return false; return false;
req->work.flags |= IO_WQ_WORK_FSIZE;
}
if (!(req->work.flags & IO_WQ_WORK_FILES) && if (!(req->work.flags & IO_WQ_WORK_FILES) &&
(def->work_flags & IO_WQ_WORK_FILES) && (def->work_flags & IO_WQ_WORK_FILES) &&
@ -1619,8 +1627,9 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
WRITE_ONCE(cqe->res, req->result); WRITE_ONCE(cqe->res, req->result);
WRITE_ONCE(cqe->flags, req->compl.cflags); WRITE_ONCE(cqe->flags, req->compl.cflags);
} else { } else {
ctx->cached_cq_overflow++;
WRITE_ONCE(ctx->rings->cq_overflow, WRITE_ONCE(ctx->rings->cq_overflow,
atomic_inc_return(&ctx->cached_cq_overflow)); ctx->cached_cq_overflow);
} }
} }
@ -1662,8 +1671,8 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
* then we cannot store the request for later flushing, we need * then we cannot store the request for later flushing, we need
* to drop it on the floor. * to drop it on the floor.
*/ */
WRITE_ONCE(ctx->rings->cq_overflow, ctx->cached_cq_overflow++;
atomic_inc_return(&ctx->cached_cq_overflow)); WRITE_ONCE(ctx->rings->cq_overflow, ctx->cached_cq_overflow);
} else { } else {
if (list_empty(&ctx->cq_overflow_list)) { if (list_empty(&ctx->cq_overflow_list)) {
set_bit(0, &ctx->sq_check_overflow); set_bit(0, &ctx->sq_check_overflow);
@ -1865,6 +1874,12 @@ static bool __io_kill_linked_timeout(struct io_kiocb *req)
link = list_first_entry(&req->link_list, struct io_kiocb, link_list); link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
if (link->opcode != IORING_OP_LINK_TIMEOUT) if (link->opcode != IORING_OP_LINK_TIMEOUT)
return false; return false;
/*
* Can happen if a linked timeout fired and link had been like
* req -> link t-out -> link t-out [-> ...]
*/
if (!(link->flags & REQ_F_LTIMEOUT_ACTIVE))
return false;
list_del_init(&link->link_list); list_del_init(&link->link_list);
wake_ev = io_link_cancel_timeout(link); wake_ev = io_link_cancel_timeout(link);
@ -1908,10 +1923,12 @@ static struct io_kiocb *io_req_link_next(struct io_kiocb *req)
/* /*
* Called if REQ_F_LINK_HEAD is set, and we fail the head request * Called if REQ_F_LINK_HEAD is set, and we fail the head request
*/ */
static void __io_fail_links(struct io_kiocb *req) static void io_fail_links(struct io_kiocb *req)
{ {
struct io_ring_ctx *ctx = req->ctx; struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
spin_lock_irqsave(&ctx->completion_lock, flags);
while (!list_empty(&req->link_list)) { while (!list_empty(&req->link_list)) {
struct io_kiocb *link = list_first_entry(&req->link_list, struct io_kiocb *link = list_first_entry(&req->link_list,
struct io_kiocb, link_list); struct io_kiocb, link_list);
@ -1933,15 +1950,6 @@ static void __io_fail_links(struct io_kiocb *req)
} }
io_commit_cqring(ctx); io_commit_cqring(ctx);
}
static void io_fail_links(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
spin_lock_irqsave(&ctx->completion_lock, flags);
__io_fail_links(req);
spin_unlock_irqrestore(&ctx->completion_lock, flags); spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx); io_cqring_ev_posted(ctx);
@ -3109,9 +3117,10 @@ static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
* For files that don't have ->read_iter() and ->write_iter(), handle them * For files that don't have ->read_iter() and ->write_iter(), handle them
* by looping over ->read() or ->write() manually. * by looping over ->read() or ->write() manually.
*/ */
static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
struct iov_iter *iter)
{ {
struct kiocb *kiocb = &req->rw.kiocb;
struct file *file = req->file;
ssize_t ret = 0; ssize_t ret = 0;
/* /*
@ -3131,11 +3140,8 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
if (!iov_iter_is_bvec(iter)) { if (!iov_iter_is_bvec(iter)) {
iovec = iov_iter_iovec(iter); iovec = iov_iter_iovec(iter);
} else { } else {
/* fixed buffers import bvec */ iovec.iov_base = u64_to_user_ptr(req->rw.addr);
iovec.iov_base = kmap(iter->bvec->bv_page) iovec.iov_len = req->rw.len;
+ iter->iov_offset;
iovec.iov_len = min(iter->count,
iter->bvec->bv_len - iter->iov_offset);
} }
if (rw == READ) { if (rw == READ) {
@ -3146,9 +3152,6 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
iovec.iov_len, io_kiocb_ppos(kiocb)); iovec.iov_len, io_kiocb_ppos(kiocb));
} }
if (iov_iter_is_bvec(iter))
kunmap(iter->bvec->bv_page);
if (nr < 0) { if (nr < 0) {
if (!ret) if (!ret)
ret = nr; ret = nr;
@ -3157,6 +3160,8 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
ret += nr; ret += nr;
if (nr != iovec.iov_len) if (nr != iovec.iov_len)
break; break;
req->rw.len -= nr;
req->rw.addr += nr;
iov_iter_advance(iter, nr); iov_iter_advance(iter, nr);
} }
@ -3346,7 +3351,7 @@ static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
if (req->file->f_op->read_iter) if (req->file->f_op->read_iter)
return call_read_iter(req->file, &req->rw.kiocb, iter); return call_read_iter(req->file, &req->rw.kiocb, iter);
else if (req->file->f_op->read) else if (req->file->f_op->read)
return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter); return loop_rw_iter(READ, req, iter);
else else
return -EINVAL; return -EINVAL;
} }
@ -3537,7 +3542,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
if (req->file->f_op->write_iter) if (req->file->f_op->write_iter)
ret2 = call_write_iter(req->file, kiocb, iter); ret2 = call_write_iter(req->file, kiocb, iter);
else if (req->file->f_op->write) else if (req->file->f_op->write)
ret2 = loop_rw_iter(WRITE, req->file, kiocb, iter); ret2 = loop_rw_iter(WRITE, req, iter);
else else
ret2 = -EINVAL; ret2 = -EINVAL;
@ -4927,32 +4932,25 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
io_commit_cqring(ctx); io_commit_cqring(ctx);
} }
static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
{
struct io_ring_ctx *ctx = req->ctx;
if (io_poll_rewait(req, &req->poll)) {
spin_unlock_irq(&ctx->completion_lock);
return;
}
hash_del(&req->hash_node);
io_poll_complete(req, req->result, 0);
spin_unlock_irq(&ctx->completion_lock);
*nxt = io_put_req_find_next(req);
io_cqring_ev_posted(ctx);
}
static void io_poll_task_func(struct callback_head *cb) static void io_poll_task_func(struct callback_head *cb)
{ {
struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
struct io_ring_ctx *ctx = req->ctx; struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *nxt = NULL; struct io_kiocb *nxt;
io_poll_task_handler(req, &nxt); if (io_poll_rewait(req, &req->poll)) {
spin_unlock_irq(&ctx->completion_lock);
} else {
hash_del(&req->hash_node);
io_poll_complete(req, req->result, 0);
spin_unlock_irq(&ctx->completion_lock);
nxt = io_put_req_find_next(req);
io_cqring_ev_posted(ctx);
if (nxt) if (nxt)
__io_req_task_submit(nxt); __io_req_task_submit(nxt);
}
percpu_ref_put(&ctx->refs); percpu_ref_put(&ctx->refs);
} }
@ -5106,6 +5104,7 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
struct io_ring_ctx *ctx = req->ctx; struct io_ring_ctx *ctx = req->ctx;
bool cancel = false; bool cancel = false;
INIT_HLIST_NODE(&req->hash_node);
io_init_poll_iocb(poll, mask, wake_func); io_init_poll_iocb(poll, mask, wake_func);
poll->file = req->file; poll->file = req->file;
poll->wait.private = req; poll->wait.private = req;
@ -5167,7 +5166,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
req->flags |= REQ_F_POLLED; req->flags |= REQ_F_POLLED;
req->apoll = apoll; req->apoll = apoll;
INIT_HLIST_NODE(&req->hash_node);
mask = 0; mask = 0;
if (def->pollin) if (def->pollin)
@ -5349,8 +5347,6 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
return -EINVAL; return -EINVAL;
if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index) if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
return -EINVAL; return -EINVAL;
if (!poll->file)
return -EBADF;
events = READ_ONCE(sqe->poll32_events); events = READ_ONCE(sqe->poll32_events);
#ifdef __BIG_ENDIAN #ifdef __BIG_ENDIAN
@ -5368,7 +5364,6 @@ static int io_poll_add(struct io_kiocb *req)
struct io_poll_table ipt; struct io_poll_table ipt;
__poll_t mask; __poll_t mask;
INIT_HLIST_NODE(&req->hash_node);
ipt.pt._qproc = io_poll_queue_proc; ipt.pt._qproc = io_poll_queue_proc;
mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events, mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
@ -6118,10 +6113,9 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
if (!list_empty(&req->link_list)) { if (!list_empty(&req->link_list)) {
prev = list_entry(req->link_list.prev, struct io_kiocb, prev = list_entry(req->link_list.prev, struct io_kiocb,
link_list); link_list);
if (refcount_inc_not_zero(&prev->refs)) { if (refcount_inc_not_zero(&prev->refs))
list_del_init(&req->link_list); list_del_init(&req->link_list);
prev->flags &= ~REQ_F_LINK_TIMEOUT; else
} else
prev = NULL; prev = NULL;
} }
@ -6178,6 +6172,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT) if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT)
return NULL; return NULL;
nxt->flags |= REQ_F_LTIMEOUT_ACTIVE;
req->flags |= REQ_F_LINK_TIMEOUT; req->flags |= REQ_F_LINK_TIMEOUT;
return nxt; return nxt;
} }
@ -6192,7 +6187,8 @@ static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs)
again: again:
linked_timeout = io_prep_linked_timeout(req); linked_timeout = io_prep_linked_timeout(req);
if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.identity->creds && if ((req->flags & REQ_F_WORK_INITIALIZED) &&
(req->work.flags & IO_WQ_WORK_CREDS) &&
req->work.identity->creds != current_cred()) { req->work.identity->creds != current_cred()) {
if (old_creds) if (old_creds)
revert_creds(old_creds); revert_creds(old_creds);
@ -6200,7 +6196,6 @@ static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs)
old_creds = NULL; /* restored original creds */ old_creds = NULL; /* restored original creds */
else else
old_creds = override_creds(req->work.identity->creds); old_creds = override_creds(req->work.identity->creds);
req->work.flags |= IO_WQ_WORK_CREDS;
} }
ret = io_issue_sqe(req, true, cs); ret = io_issue_sqe(req, true, cs);
@ -6241,8 +6236,10 @@ static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs)
if (nxt) { if (nxt) {
req = nxt; req = nxt;
if (req->flags & REQ_F_FORCE_ASYNC) if (req->flags & REQ_F_FORCE_ASYNC) {
linked_timeout = NULL;
goto punt; goto punt;
}
goto again; goto again;
} }
exit: exit:
@ -6505,12 +6502,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
if (id) { if (id) {
struct io_identity *iod; struct io_identity *iod;
io_req_init_async(req);
iod = idr_find(&ctx->personality_idr, id); iod = idr_find(&ctx->personality_idr, id);
if (unlikely(!iod)) if (unlikely(!iod))
return -EINVAL; return -EINVAL;
refcount_inc(&iod->count); refcount_inc(&iod->count);
io_put_identity(current->io_uring, req);
__io_req_init_async(req);
get_cred(iod->creds); get_cred(iod->creds);
req->work.identity = iod; req->work.identity = iod;
req->work.flags |= IO_WQ_WORK_CREDS; req->work.flags |= IO_WQ_WORK_CREDS;
@ -8686,19 +8683,11 @@ static void io_uring_del_task_file(struct file *file)
fput(file); fput(file);
} }
static void __io_uring_attempt_task_drop(struct file *file)
{
struct file *old = xa_load(&current->io_uring->xa, (unsigned long)file);
if (old == file)
io_uring_del_task_file(file);
}
/* /*
* Drop task note for this file if we're the only ones that hold it after * Drop task note for this file if we're the only ones that hold it after
* pending fput() * pending fput()
*/ */
static void io_uring_attempt_task_drop(struct file *file, bool exiting) static void io_uring_attempt_task_drop(struct file *file)
{ {
if (!current->io_uring) if (!current->io_uring)
return; return;
@ -8706,10 +8695,9 @@ static void io_uring_attempt_task_drop(struct file *file, bool exiting)
* fput() is pending, will be 2 if the only other ref is our potential * fput() is pending, will be 2 if the only other ref is our potential
* task file note. If the task is exiting, drop regardless of count. * task file note. If the task is exiting, drop regardless of count.
*/ */
if (!exiting && atomic_long_read(&file->f_count) != 2) if (fatal_signal_pending(current) || (current->flags & PF_EXITING) ||
return; atomic_long_read(&file->f_count) == 2)
io_uring_del_task_file(file);
__io_uring_attempt_task_drop(file);
} }
void __io_uring_files_cancel(struct files_struct *files) void __io_uring_files_cancel(struct files_struct *files)
@ -8767,16 +8755,7 @@ void __io_uring_task_cancel(void)
static int io_uring_flush(struct file *file, void *data) static int io_uring_flush(struct file *file, void *data)
{ {
struct io_ring_ctx *ctx = file->private_data; io_uring_attempt_task_drop(file);
/*
* If the task is going away, cancel work it may have pending
*/
if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
data = NULL;
io_uring_cancel_task_requests(ctx, data);
io_uring_attempt_task_drop(file, !data);
return 0; return 0;
} }

View File

@ -1005,9 +1005,8 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
/* /*
* Determine where to splice to/from. * Determine where to splice to/from.
*/ */
long do_splice(struct file *in, loff_t __user *off_in, long do_splice(struct file *in, loff_t *off_in, struct file *out,
struct file *out, loff_t __user *off_out, loff_t *off_out, size_t len, unsigned int flags)
size_t len, unsigned int flags)
{ {
struct pipe_inode_info *ipipe; struct pipe_inode_info *ipipe;
struct pipe_inode_info *opipe; struct pipe_inode_info *opipe;
@ -1041,8 +1040,7 @@ long do_splice(struct file *in, loff_t __user *off_in,
if (off_out) { if (off_out) {
if (!(out->f_mode & FMODE_PWRITE)) if (!(out->f_mode & FMODE_PWRITE))
return -EINVAL; return -EINVAL;
if (copy_from_user(&offset, off_out, sizeof(loff_t))) offset = *off_out;
return -EFAULT;
} else { } else {
offset = out->f_pos; offset = out->f_pos;
} }
@ -1063,8 +1061,8 @@ long do_splice(struct file *in, loff_t __user *off_in,
if (!off_out) if (!off_out)
out->f_pos = offset; out->f_pos = offset;
else if (copy_to_user(off_out, &offset, sizeof(loff_t))) else
ret = -EFAULT; *off_out = offset;
return ret; return ret;
} }
@ -1075,8 +1073,7 @@ long do_splice(struct file *in, loff_t __user *off_in,
if (off_in) { if (off_in) {
if (!(in->f_mode & FMODE_PREAD)) if (!(in->f_mode & FMODE_PREAD))
return -EINVAL; return -EINVAL;
if (copy_from_user(&offset, off_in, sizeof(loff_t))) offset = *off_in;
return -EFAULT;
} else { } else {
offset = in->f_pos; offset = in->f_pos;
} }
@ -1100,8 +1097,8 @@ long do_splice(struct file *in, loff_t __user *off_in,
wakeup_pipe_readers(opipe); wakeup_pipe_readers(opipe);
if (!off_in) if (!off_in)
in->f_pos = offset; in->f_pos = offset;
else if (copy_to_user(off_in, &offset, sizeof(loff_t))) else
ret = -EFAULT; *off_in = offset;
return ret; return ret;
} }
@ -1109,6 +1106,46 @@ long do_splice(struct file *in, loff_t __user *off_in,
return -EINVAL; return -EINVAL;
} }
/*
 * User-pointer front end for do_splice(), called from the splice() syscall.
 * Copies the optional offsets in from userspace, calls do_splice() with
 * kernel pointers, and copies the offsets back out on success.
 *
 * @in, @out: source and destination files
 * @off_in, @off_out: optional user pointers to file offsets (NULL if absent)
 * @len, @flags: passed through to do_splice() unchanged
 *
 * Returns do_splice()'s result; -ESPIPE if an offset is supplied for an end
 * for which get_pipe_info() returned a pipe; -EFAULT if a user offset cannot
 * be read or written back.
 */
static long __do_splice(struct file *in, loff_t __user *off_in,
struct file *out, loff_t __user *off_out,
size_t len, unsigned int flags)
{
struct pipe_inode_info *ipipe;
struct pipe_inode_info *opipe;
loff_t offset, *__off_in = NULL, *__off_out = NULL;
long ret;
ipipe = get_pipe_info(in, true);
opipe = get_pipe_info(out, true);
/* an explicit offset is rejected for a pipe end */
if (ipipe && off_in)
return -ESPIPE;
if (opipe && off_out)
return -ESPIPE;
/*
 * Both directions share the single 'offset' local, so if both user
 * pointers are given the second copy overwrites the first.
 * NOTE(review): this relies on do_splice() rejecting the case where
 * neither end is a pipe -- confirm against do_splice().
 */
if (off_out) {
if (copy_from_user(&offset, off_out, sizeof(loff_t)))
return -EFAULT;
__off_out = &offset;
}
if (off_in) {
if (copy_from_user(&offset, off_in, sizeof(loff_t)))
return -EFAULT;
__off_in = &offset;
}
ret = do_splice(in, __off_in, out, __off_out, len, flags);
/* on error nothing is written back to userspace */
if (ret < 0)
return ret;
/* a failed write-back turns an otherwise successful splice into -EFAULT */
if (__off_out && copy_to_user(off_out, __off_out, sizeof(loff_t)))
return -EFAULT;
if (__off_in && copy_to_user(off_in, __off_in, sizeof(loff_t)))
return -EFAULT;
return ret;
}
static int iter_to_pipe(struct iov_iter *from, static int iter_to_pipe(struct iov_iter *from,
struct pipe_inode_info *pipe, struct pipe_inode_info *pipe,
unsigned flags) unsigned flags)
@ -1303,7 +1340,7 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
if (in.file) { if (in.file) {
out = fdget(fd_out); out = fdget(fd_out);
if (out.file) { if (out.file) {
error = do_splice(in.file, off_in, out.file, off_out, error = __do_splice(in.file, off_in, out.file, off_out,
len, flags); len, flags);
fdput(out); fdput(out);
} }

View File

@ -78,8 +78,8 @@ extern ssize_t add_to_pipe(struct pipe_inode_info *,
struct pipe_buffer *); struct pipe_buffer *);
extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *,
splice_direct_actor *); splice_direct_actor *);
extern long do_splice(struct file *in, loff_t __user *off_in, extern long do_splice(struct file *in, loff_t *off_in,
struct file *out, loff_t __user *off_out, struct file *out, loff_t *off_out,
size_t len, unsigned int flags); size_t len, unsigned int flags);
extern long do_tee(struct file *in, struct file *out, size_t len, extern long do_tee(struct file *in, struct file *out, size_t len,