LCOV - Real - fs/io

LCOV - code coverage report

Current view:	top level - fs - io_uring.c (source / functions)		Hit	Total	Coverage
Test:	Real	Lines:	5	1490	0.3 %
Date:	2020-10-17 15:46:16	Functions:	0	107	0.0 %
Legend:	Neither, QEMU, Real, Both	Branches:	0	0	-

           Branch data     Line data    Source code

       1                 :            : // SPDX-License-Identifier: GPL-2.0
       2                 :            : /*
       3                 :            :  * Shared application/kernel submission and completion ring pairs, for
       4                 :            :  * supporting fast/efficient IO.
       5                 :            :  *
       6                 :            :  * A note on the read/write ordering memory barriers that are matched between
       7                 :            :  * the application and kernel side.
       8                 :            :  *
       9                 :            :  * After the application reads the CQ ring tail, it must use an
      10                 :            :  * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
      11                 :            :  * before writing the tail (using smp_load_acquire to read the tail will
      12                 :            :  * do). It also needs a smp_mb() before updating CQ head (ordering the
      13                 :            :  * entry load(s) with the head store), pairing with an implicit barrier
      14                 :            :  * through a control-dependency in io_get_cqring (smp_store_release to
      15                 :            :  * store head will do). Failure to do so could lead to reading invalid
      16                 :            :  * CQ entries.
      17                 :            :  *
      18                 :            :  * Likewise, the application must use an appropriate smp_wmb() before
      19                 :            :  * writing the SQ tail (ordering SQ entry stores with the tail store),
      20                 :            :  * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
      21                 :            :  * to store the tail will do). And it needs a barrier ordering the SQ
      22                 :            :  * head load before writing new SQ entries (smp_load_acquire to read
      23                 :            :  * head will do).
      24                 :            :  *
      25                 :            :  * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
      26                 :            :  * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
      27                 :            :  * updating the SQ tail; a full memory barrier smp_mb() is needed
      28                 :            :  * between.
      29                 :            :  *
      30                 :            :  * Also see the examples in the liburing library:
      31                 :            :  *
      32                 :            :  *      git://git.kernel.dk/liburing
      33                 :            :  *
      34                 :            :  * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
      35                 :            :  * from data shared between the kernel and application. This is done both
      36                 :            :  * for ordering purposes and to ensure that once a value is loaded from
      37                 :            :  * data that the application could potentially modify, it remains stable.
      38                 :            :  *
      39                 :            :  * Copyright (C) 2018-2019 Jens Axboe
      40                 :            :  * Copyright (c) 2018-2019 Christoph Hellwig
      41                 :            :  */
      42                 :            : #include <linux/kernel.h>
      43                 :            : #include <linux/init.h>
      44                 :            : #include <linux/errno.h>
      45                 :            : #include <linux/syscalls.h>
      46                 :            : #include <linux/compat.h>
      47                 :            : #include <linux/refcount.h>
      48                 :            : #include <linux/uio.h>
      49                 :            : 
      50                 :            : #include <linux/sched/signal.h>
      51                 :            : #include <linux/fs.h>
      52                 :            : #include <linux/file.h>
      53                 :            : #include <linux/fdtable.h>
      54                 :            : #include <linux/mm.h>
      55                 :            : #include <linux/mman.h>
      56                 :            : #include <linux/mmu_context.h>
      57                 :            : #include <linux/percpu.h>
      58                 :            : #include <linux/slab.h>
      59                 :            : #include <linux/workqueue.h>
      60                 :            : #include <linux/kthread.h>
      61                 :            : #include <linux/blkdev.h>
      62                 :            : #include <linux/bvec.h>
      63                 :            : #include <linux/net.h>
      64                 :            : #include <net/sock.h>
      65                 :            : #include <net/af_unix.h>
      66                 :            : #include <net/scm.h>
      67                 :            : #include <linux/anon_inodes.h>
      68                 :            : #include <linux/sched/mm.h>
      69                 :            : #include <linux/uaccess.h>
      70                 :            : #include <linux/nospec.h>
      71                 :            : #include <linux/sizes.h>
      72                 :            : #include <linux/hugetlb.h>
      73                 :            : #include <linux/highmem.h>
      74                 :            : #include <linux/fs_struct.h>
      75                 :            : 
      76                 :            : #include <uapi/linux/io_uring.h>
      77                 :            : 
      78                 :            : #include "internal.h"
      79                 :            : 
      80                 :            : #define IORING_MAX_ENTRIES      32768
      81                 :            : #define IORING_MAX_FIXED_FILES  1024
      82                 :            : 
      83                 :            : struct io_uring {
      84                 :            :         u32 head ____cacheline_aligned_in_smp;
      85                 :            :         u32 tail ____cacheline_aligned_in_smp;
      86                 :            : };
      87                 :            : 
      88                 :            : /*
      89                 :            :  * This data is shared with the application through the mmap at offsets
      90                 :            :  * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
      91                 :            :  *
      92                 :            :  * The offsets to the member fields are published through struct
      93                 :            :  * io_sqring_offsets when calling io_uring_setup.
      94                 :            :  */
      95                 :            : struct io_rings {
      96                 :            :         /*
      97                 :            :          * Head and tail offsets into the ring; the offsets need to be
      98                 :            :          * masked to get valid indices.
      99                 :            :          *
     100                 :            :          * The kernel controls head of the sq ring and the tail of the cq ring,
     101                 :            :          * and the application controls tail of the sq ring and the head of the
     102                 :            :          * cq ring.
     103                 :            :          */
     104                 :            :         struct io_uring         sq, cq;
     105                 :            :         /*
     106                 :            :          * Bitmasks to apply to head and tail offsets (constant, equals
     107                 :            :          * ring_entries - 1)
     108                 :            :          */
     109                 :            :         u32                     sq_ring_mask, cq_ring_mask;
     110                 :            :         /* Ring sizes (constant, power of 2) */
     111                 :            :         u32                     sq_ring_entries, cq_ring_entries;
     112                 :            :         /*
     113                 :            :          * Number of invalid entries dropped by the kernel due to
     114                 :            :          * invalid index stored in array
     115                 :            :          *
     116                 :            :          * Written by the kernel, shouldn't be modified by the
     117                 :            :          * application (i.e. get number of "new events" by comparing to
     118                 :            :          * cached value).
     119                 :            :          *
     120                 :            :          * After a new SQ head value was read by the application this
     121                 :            :          * counter includes all submissions that were dropped reaching
     122                 :            :          * the new SQ head (and possibly more).
     123                 :            :          */
     124                 :            :         u32                     sq_dropped;
     125                 :            :         /*
     126                 :            :          * Runtime flags
     127                 :            :          *
     128                 :            :          * Written by the kernel, shouldn't be modified by the
     129                 :            :          * application.
     130                 :            :          *
     131                 :            :          * The application needs a full memory barrier before checking
     132                 :            :          * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
     133                 :            :          */
     134                 :            :         u32                     sq_flags;
     135                 :            :         /*
     136                 :            :          * Number of completion events lost because the queue was full;
     137                 :            :          * this should be avoided by the application by making sure
     138                 :            :          * there are not more requests pending than there is space in
     139                 :            :          * the completion queue.
     140                 :            :          *
     141                 :            :          * Written by the kernel, shouldn't be modified by the
     142                 :            :          * application (i.e. get number of "new events" by comparing to
     143                 :            :          * cached value).
     144                 :            :          *
     145                 :            :          * As completion events come in out of order this counter is not
     146                 :            :          * ordered with any other data.
     147                 :            :          */
     148                 :            :         u32                     cq_overflow;
     149                 :            :         /*
     150                 :            :          * Ring buffer of completion events.
     151                 :            :          *
     152                 :            :          * The kernel writes completion events fresh every time they are
     153                 :            :          * produced, so the application is allowed to modify pending
     154                 :            :          * entries.
     155                 :            :          */
     156                 :            :         struct io_uring_cqe     cqes[] ____cacheline_aligned_in_smp;
     157                 :            : };
     158                 :            : 
     159                 :            : struct io_mapped_ubuf {
     160                 :            :         u64             ubuf;
     161                 :            :         size_t          len;
     162                 :            :         struct          bio_vec *bvec;
     163                 :            :         unsigned int    nr_bvecs;
     164                 :            : };
     165                 :            : 
     166                 :            : struct async_list {
     167                 :            :         spinlock_t              lock;
     168                 :            :         atomic_t                cnt;
     169                 :            :         struct list_head        list;
     170                 :            : 
     171                 :            :         struct file             *file;
     172                 :            :         off_t                   io_start;
     173                 :            :         size_t                  io_len;
     174                 :            : };
     175                 :            : 
     176                 :            : struct io_ring_ctx {
     177                 :            :         struct {
     178                 :            :                 struct percpu_ref       refs;
     179                 :            :         } ____cacheline_aligned_in_smp;
     180                 :            : 
     181                 :            :         struct {
     182                 :            :                 unsigned int            flags;
     183                 :            :                 bool                    compat;
     184                 :            :                 bool                    account_mem;
     185                 :            : 
     186                 :            :                 /*
     187                 :            :                  * Ring buffer of indices into array of io_uring_sqe, which is
     188                 :            :                  * mmapped by the application using the IORING_OFF_SQES offset.
     189                 :            :                  *
     190                 :            :                  * This indirection could e.g. be used to assign fixed
     191                 :            :                  * io_uring_sqe entries to operations and only submit them to
     192                 :            :                  * the queue when needed.
     193                 :            :                  *
     194                 :            :                  * The kernel modifies neither the indices array nor the entries
     195                 :            :                  * array.
     196                 :            :                  */
     197                 :            :                 u32                     *sq_array;
     198                 :            :                 unsigned                cached_sq_head;
     199                 :            :                 unsigned                sq_entries;
     200                 :            :                 unsigned                sq_mask;
     201                 :            :                 unsigned                sq_thread_idle;
     202                 :            :                 unsigned                cached_sq_dropped;
     203                 :            :                 struct io_uring_sqe     *sq_sqes;
     204                 :            : 
     205                 :            :                 struct list_head        defer_list;
     206                 :            :                 struct list_head        timeout_list;
     207                 :            :         } ____cacheline_aligned_in_smp;
     208                 :            : 
     209                 :            :         /* IO offload */
     210                 :            :         struct workqueue_struct *sqo_wq[2];
     211                 :            :         struct task_struct      *sqo_thread;    /* if using sq thread polling */
     212                 :            :         struct mm_struct        *sqo_mm;
     213                 :            :         wait_queue_head_t       sqo_wait;
     214                 :            :         struct completion       sqo_thread_started;
     215                 :            : 
     216                 :            :         struct {
     217                 :            :                 unsigned                cached_cq_tail;
     218                 :            :                 atomic_t                cached_cq_overflow;
     219                 :            :                 unsigned                cq_entries;
     220                 :            :                 unsigned                cq_mask;
     221                 :            :                 struct wait_queue_head  cq_wait;
     222                 :            :                 struct fasync_struct    *cq_fasync;
     223                 :            :                 struct eventfd_ctx      *cq_ev_fd;
     224                 :            :                 atomic_t                cq_timeouts;
     225                 :            :         } ____cacheline_aligned_in_smp;
     226                 :            : 
     227                 :            :         struct io_rings *rings;
     228                 :            : 
     229                 :            :         /*
     230                 :            :          * If used, fixed file set. Writers must ensure that ->refs is dead,
     231                 :            :          * readers must ensure that ->refs is alive as long as the file* is
     232                 :            :          * used. Only updated through io_uring_register(2).
     233                 :            :          */
     234                 :            :         struct file             **user_files;
     235                 :            :         unsigned                nr_user_files;
     236                 :            : 
     237                 :            :         /* if used, fixed mapped user buffers */
     238                 :            :         unsigned                nr_user_bufs;
     239                 :            :         struct io_mapped_ubuf   *user_bufs;
     240                 :            : 
     241                 :            :         struct user_struct      *user;
     242                 :            : 
     243                 :            :         const struct cred       *creds;
     244                 :            : 
     245                 :            :         struct completion       ctx_done;
     246                 :            : 
     247                 :            :         struct {
     248                 :            :                 struct mutex            uring_lock;
     249                 :            :                 wait_queue_head_t       wait;
     250                 :            :         } ____cacheline_aligned_in_smp;
     251                 :            : 
     252                 :            :         struct {
     253                 :            :                 spinlock_t              completion_lock;
     254                 :            :                 bool                    poll_multi_file;
     255                 :            :                 /*
     256                 :            :                  * ->poll_list is protected by the ctx->uring_lock for
     257                 :            :                  * io_uring instances that don't use IORING_SETUP_SQPOLL.
     258                 :            :                  * For SQPOLL, only the single threaded io_sq_thread() will
     259                 :            :                  * manipulate the list, hence no extra locking is needed there.
     260                 :            :                  */
     261                 :            :                 struct list_head        poll_list;
     262                 :            :                 struct list_head        cancel_list;
     263                 :            :         } ____cacheline_aligned_in_smp;
     264                 :            : 
     265                 :            :         struct async_list       pending_async[2];
     266                 :            : 
     267                 :            : #if defined(CONFIG_UNIX)
     268                 :            :         struct socket           *ring_sock;
     269                 :            : #endif
     270                 :            : 
     271                 :            :         struct list_head        task_list;
     272                 :            :         spinlock_t              task_lock;
     273                 :            : };
     274                 :            : 
     275                 :            : struct sqe_submit {
     276                 :            :         const struct io_uring_sqe       *sqe;
     277                 :            :         unsigned short                  index;
     278                 :            :         u32                             sequence;
     279                 :            :         bool                            has_user;
     280                 :            :         bool                            needs_lock;
     281                 :            :         bool                            needs_fixed_file;
     282                 :            :         u8                              opcode;
     283                 :            : };
     284                 :            : 
     285                 :            : /*
     286                 :            :  * First field must be the file pointer in all the
     287                 :            :  * iocb unions! See also 'struct kiocb' in <linux/fs.h>
     288                 :            :  */
     289                 :            : struct io_poll_iocb {
     290                 :            :         struct file                     *file;
     291                 :            :         struct wait_queue_head          *head;
     292                 :            :         __poll_t                        events;
     293                 :            :         bool                            done;
     294                 :            :         bool                            canceled;
     295                 :            :         struct wait_queue_entry         wait;
     296                 :            : };
     297                 :            : 
     298                 :            : struct io_timeout {
     299                 :            :         struct file                     *file;
     300                 :            :         struct hrtimer                  timer;
     301                 :            : };
     302                 :            : 
     303                 :            : /*
     304                 :            :  * NOTE! Each of the iocb union members has the file pointer
     305                 :            :  * as the first entry in their struct definition. So you can
     306                 :            :  * access the file pointer through any of the sub-structs,
     307                 :            :  * or directly as just 'file' in this struct.
     308                 :            :  */
     309                 :            : struct io_kiocb {
     310                 :            :         union {
     311                 :            :                 struct file             *file;
     312                 :            :                 struct kiocb            rw;
     313                 :            :                 struct io_poll_iocb     poll;
     314                 :            :                 struct io_timeout       timeout;
     315                 :            :         };
     316                 :            : 
     317                 :            :         struct sqe_submit       submit;
     318                 :            : 
     319                 :            :         struct io_ring_ctx      *ctx;
     320                 :            :         struct list_head        list;
     321                 :            :         struct list_head        link_list;
     322                 :            :         unsigned int            flags;
     323                 :            :         refcount_t              refs;
     324                 :            : #define REQ_F_NOWAIT            1       /* must not punt to workers */
     325                 :            : #define REQ_F_IOPOLL_COMPLETED  2       /* polled IO has completed */
     326                 :            : #define REQ_F_FIXED_FILE        4       /* ctx owns file */
     327                 :            : #define REQ_F_SEQ_PREV          8       /* sequential with previous */
     328                 :            : #define REQ_F_IO_DRAIN          16      /* drain existing IO first */
     329                 :            : #define REQ_F_IO_DRAINED        32      /* drain done */
     330                 :            : #define REQ_F_LINK              64      /* linked sqes */
     331                 :            : #define REQ_F_LINK_DONE         128     /* linked sqes done */
     332                 :            : #define REQ_F_FAIL_LINK         256     /* fail rest of links */
     333                 :            : #define REQ_F_SHADOW_DRAIN      512     /* link-drain shadow req */
     334                 :            : #define REQ_F_TIMEOUT           1024    /* timeout request */
     335                 :            : #define REQ_F_ISREG             2048    /* regular file */
     336                 :            : #define REQ_F_MUST_PUNT         4096    /* must be punted even for NONBLOCK */
     337                 :            : #define REQ_F_TIMEOUT_NOSEQ     8192    /* no timeout sequence */
     338                 :            : #define REQ_F_CANCEL            16384   /* cancel request */
     339                 :            :         unsigned long           fsize;
     340                 :            :         u64                     user_data;
     341                 :            :         u32                     result;
     342                 :            :         u32                     sequence;
     343                 :            :         struct task_struct      *task;
     344                 :            : 
     345                 :            :         struct fs_struct        *fs;
     346                 :            : 
     347                 :            :         struct work_struct      work;
     348                 :            :         struct task_struct      *work_task;
     349                 :            :         struct list_head        task_list;
     350                 :            : };
     351                 :            : 
     352                 :            : #define IO_PLUG_THRESHOLD               2
     353                 :            : #define IO_IOPOLL_BATCH                 8
     354                 :            : 
     355                 :            : struct io_submit_state {
     356                 :            :         struct blk_plug         plug;
     357                 :            : 
     358                 :            :         /*
     359                 :            :          * io_kiocb alloc cache
     360                 :            :          */
     361                 :            :         void                    *reqs[IO_IOPOLL_BATCH];
     362                 :            :         unsigned                int free_reqs;
     363                 :            :         unsigned                int cur_req;
     364                 :            : 
     365                 :            :         /*
     366                 :            :          * File reference cache
     367                 :            :          */
     368                 :            :         struct file             *file;
     369                 :            :         unsigned int            fd;
     370                 :            :         unsigned int            has_refs;
     371                 :            :         unsigned int            used_refs;
     372                 :            :         unsigned int            ios_left;
     373                 :            : };
     374                 :            : 
     375                 :            : static void io_sq_wq_submit_work(struct work_struct *work);
     376                 :            : static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
     377                 :            :                                  long res);
     378                 :            : static void __io_free_req(struct io_kiocb *req);
     379                 :            : 
     380                 :            : static struct kmem_cache *req_cachep;
     381                 :            : 
     382                 :            : static const struct file_operations io_uring_fops;
     383                 :            : 
     384                 :          3 : struct sock *io_uring_get_socket(struct file *file)
     385                 :            : {
     386                 :            : #if defined(CONFIG_UNIX)
     387                 :          3 :         if (file->f_op == &io_uring_fops) {
     388                 :          0 :                 struct io_ring_ctx *ctx = file->private_data;
     389                 :            : 
     390                 :          0 :                 return ctx->ring_sock->sk;
     391                 :            :         }
     392                 :            : #endif
     393                 :            :         return NULL;
     394                 :            : }
     395                 :            : EXPORT_SYMBOL(io_uring_get_socket);
     396                 :            : 
     397                 :          0 : static void io_ring_ctx_ref_free(struct percpu_ref *ref)
     398                 :            : {
     399                 :            :         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
     400                 :            : 
     401                 :          0 :         complete(&ctx->ctx_done);
     402                 :          0 : }
     403                 :            : 
     404                 :          0 : static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
     405                 :            : {
     406                 :            :         struct io_ring_ctx *ctx;
     407                 :            :         int i;
     408                 :            : 
     409                 :          0 :         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
     410                 :          0 :         if (!ctx)
     411                 :            :                 return NULL;
     412                 :            : 
     413                 :          0 :         if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
     414                 :            :                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
     415                 :          0 :                 kfree(ctx);
     416                 :          0 :                 return NULL;
     417                 :            :         }
     418                 :            : 
     419                 :          0 :         ctx->flags = p->flags;
     420                 :          0 :         init_waitqueue_head(&ctx->sqo_wait);
     421                 :          0 :         init_waitqueue_head(&ctx->cq_wait);
     422                 :            :         init_completion(&ctx->ctx_done);
     423                 :            :         init_completion(&ctx->sqo_thread_started);
     424                 :          0 :         mutex_init(&ctx->uring_lock);
     425                 :          0 :         init_waitqueue_head(&ctx->wait);
     426                 :          0 :         for (i = 0; i < ARRAY_SIZE(ctx->pending_async); i++) {
     427                 :          0 :                 spin_lock_init(&ctx->pending_async[i].lock);
     428                 :          0 :                 INIT_LIST_HEAD(&ctx->pending_async[i].list);
     429                 :          0 :                 atomic_set(&ctx->pending_async[i].cnt, 0);
     430                 :            :         }
     431                 :          0 :         spin_lock_init(&ctx->completion_lock);
     432                 :          0 :         INIT_LIST_HEAD(&ctx->poll_list);
     433                 :          0 :         INIT_LIST_HEAD(&ctx->cancel_list);
     434                 :          0 :         INIT_LIST_HEAD(&ctx->defer_list);
     435                 :          0 :         INIT_LIST_HEAD(&ctx->timeout_list);
     436                 :          0 :         INIT_LIST_HEAD(&ctx->task_list);
     437                 :          0 :         spin_lock_init(&ctx->task_lock);
     438                 :          0 :         return ctx;
     439                 :            : }
     440                 :            : 
     441                 :            : static inline bool __io_sequence_defer(struct io_ring_ctx *ctx,
     442                 :            :                                        struct io_kiocb *req)
     443                 :            : {
     444                 :          0 :         return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped
     445                 :          0 :                                         + atomic_read(&ctx->cached_cq_overflow);
     446                 :            : }
     447                 :            : 
     448                 :            : static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
     449                 :            :                                      struct io_kiocb *req)
     450                 :            : {
     451                 :          0 :         if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
     452                 :            :                 return false;
     453                 :            : 
     454                 :            :         return __io_sequence_defer(ctx, req);
     455                 :            : }
     456                 :            : 
     457                 :          0 : static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
     458                 :            : {
     459                 :            :         struct io_kiocb *req;
     460                 :            : 
     461                 :          0 :         req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list);
     462                 :          0 :         if (req && !io_sequence_defer(ctx, req)) {
     463                 :          0 :                 list_del_init(&req->list);
     464                 :          0 :                 return req;
     465                 :            :         }
     466                 :            : 
     467                 :            :         return NULL;
     468                 :            : }
     469                 :            : 
     470                 :          0 : static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
     471                 :            : {
     472                 :            :         struct io_kiocb *req;
     473                 :            : 
     474                 :          0 :         req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list);
     475                 :          0 :         if (req) {
     476                 :          0 :                 if (req->flags & REQ_F_TIMEOUT_NOSEQ)
     477                 :            :                         return NULL;
     478                 :          0 :                 if (!__io_sequence_defer(ctx, req)) {
     479                 :          0 :                         list_del_init(&req->list);
     480                 :          0 :                         return req;
     481                 :            :                 }
     482                 :            :         }
     483                 :            : 
     484                 :            :         return NULL;
     485                 :            : }
     486                 :            : 
     487                 :          0 : static void __io_commit_cqring(struct io_ring_ctx *ctx)
     488                 :            : {
     489                 :          0 :         struct io_rings *rings = ctx->rings;
     490                 :            : 
     491                 :          0 :         if (ctx->cached_cq_tail != READ_ONCE(rings->cq.tail)) {
     492                 :            :                 /* order cqe stores with ring update */
     493                 :          0 :                 smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
     494                 :            : 
     495                 :          0 :                 if (wq_has_sleeper(&ctx->cq_wait)) {
     496                 :          0 :                         wake_up_interruptible(&ctx->cq_wait);
     497                 :          0 :                         kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
     498                 :            :                 }
     499                 :            :         }
     500                 :          0 : }
     501                 :            : 
     502                 :          0 : static inline void io_queue_async_work(struct io_ring_ctx *ctx,
     503                 :            :                                        struct io_kiocb *req)
     504                 :            : {
     505                 :            :         unsigned long flags;
     506                 :            :         int rw = 0;
     507                 :            : 
     508                 :          0 :         if (req->submit.sqe) {
     509                 :          0 :                 switch (req->submit.opcode) {
     510                 :            :                 case IORING_OP_WRITEV:
     511                 :            :                 case IORING_OP_WRITE_FIXED:
     512                 :          0 :                         rw = !(req->rw.ki_flags & IOCB_DIRECT);
     513                 :          0 :                         break;
     514                 :            :                 }
     515                 :            :         }
     516                 :            : 
     517                 :          0 :         req->task = current;
     518                 :            : 
     519                 :          0 :         spin_lock_irqsave(&ctx->task_lock, flags);
     520                 :          0 :         list_add(&req->task_list, &ctx->task_list);
     521                 :          0 :         req->work_task = NULL;
     522                 :            :         spin_unlock_irqrestore(&ctx->task_lock, flags);
     523                 :            : 
     524                 :          0 :         queue_work(ctx->sqo_wq[rw], &req->work);
     525                 :          0 : }
     526                 :            : 
     527                 :          0 : static void io_kill_timeout(struct io_kiocb *req)
     528                 :            : {
     529                 :            :         int ret;
     530                 :            : 
     531                 :          0 :         ret = hrtimer_try_to_cancel(&req->timeout.timer);
     532                 :          0 :         if (ret != -1) {
     533                 :          0 :                 atomic_inc(&req->ctx->cq_timeouts);
     534                 :            :                 list_del(&req->list);
     535                 :          0 :                 io_cqring_fill_event(req->ctx, req->user_data, 0);
     536                 :          0 :                 __io_free_req(req);
     537                 :            :         }
     538                 :          0 : }
     539                 :            : 
     540                 :          0 : static void io_kill_timeouts(struct io_ring_ctx *ctx)
     541                 :            : {
     542                 :            :         struct io_kiocb *req, *tmp;
     543                 :            : 
     544                 :            :         spin_lock_irq(&ctx->completion_lock);
     545                 :          0 :         list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
     546                 :          0 :                 io_kill_timeout(req);
     547                 :            :         spin_unlock_irq(&ctx->completion_lock);
     548                 :          0 : }
     549                 :            : 
     550                 :          0 : static void io_commit_cqring(struct io_ring_ctx *ctx)
     551                 :            : {
     552                 :            :         struct io_kiocb *req;
     553                 :            : 
     554                 :          0 :         while ((req = io_get_timeout_req(ctx)) != NULL)
     555                 :          0 :                 io_kill_timeout(req);
     556                 :            : 
     557                 :          0 :         __io_commit_cqring(ctx);
     558                 :            : 
     559                 :          0 :         while ((req = io_get_deferred_req(ctx)) != NULL) {
     560                 :          0 :                 if (req->flags & REQ_F_SHADOW_DRAIN) {
     561                 :            :                         /* Just for drain, free it. */
     562                 :          0 :                         __io_free_req(req);
     563                 :          0 :                         continue;
     564                 :            :                 }
     565                 :          0 :                 req->flags |= REQ_F_IO_DRAINED;
     566                 :          0 :                 io_queue_async_work(ctx, req);
     567                 :            :         }
     568                 :          0 : }
     569                 :            : 
     570                 :            : static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
     571                 :            : {
     572                 :          0 :         struct io_rings *rings = ctx->rings;
     573                 :            :         unsigned tail;
     574                 :            : 
     575                 :          0 :         tail = ctx->cached_cq_tail;
     576                 :            :         /*
     577                 :            :          * writes to the cq entry need to come after reading head; the
     578                 :            :          * control dependency is enough as we're using WRITE_ONCE to
     579                 :            :          * fill the cq entry
     580                 :            :          */
     581                 :          0 :         if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
     582                 :            :                 return NULL;
     583                 :            : 
     584                 :          0 :         ctx->cached_cq_tail++;
     585                 :          0 :         return &rings->cqes[tail & ctx->cq_mask];
     586                 :            : }
     587                 :            : 
     588                 :          0 : static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
     589                 :            :                                  long res)
     590                 :            : {
     591                 :            :         struct io_uring_cqe *cqe;
     592                 :            : 
     593                 :            :         /*
     594                 :            :          * If we can't get a cq entry, userspace overflowed the
     595                 :            :          * submission (by quite a lot). Increment the overflow count in
     596                 :            :          * the ring.
     597                 :            :          */
     598                 :            :         cqe = io_get_cqring(ctx);
     599                 :          0 :         if (cqe) {
     600                 :            :                 WRITE_ONCE(cqe->user_data, ki_user_data);
     601                 :            :                 WRITE_ONCE(cqe->res, res);
     602                 :            :                 WRITE_ONCE(cqe->flags, 0);
     603                 :            :         } else {
     604                 :          0 :                 WRITE_ONCE(ctx->rings->cq_overflow,
     605                 :            :                                 atomic_inc_return(&ctx->cached_cq_overflow));
     606                 :            :         }
     607                 :          0 : }
     608                 :            : 
     609                 :          0 : static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
     610                 :            : {
     611                 :          0 :         if (waitqueue_active(&ctx->wait))
     612                 :          0 :                 wake_up(&ctx->wait);
     613                 :          0 :         if (waitqueue_active(&ctx->sqo_wait))
     614                 :          0 :                 wake_up(&ctx->sqo_wait);
     615                 :          0 :         if (ctx->cq_ev_fd)
     616                 :          0 :                 eventfd_signal(ctx->cq_ev_fd, 1);
     617                 :          0 : }
     618                 :            : 
     619                 :          0 : static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
     620                 :            :                                 long res)
     621                 :            : {
     622                 :            :         unsigned long flags;
     623                 :            : 
     624                 :          0 :         spin_lock_irqsave(&ctx->completion_lock, flags);
     625                 :          0 :         io_cqring_fill_event(ctx, user_data, res);
     626                 :          0 :         io_commit_cqring(ctx);
     627                 :            :         spin_unlock_irqrestore(&ctx->completion_lock, flags);
     628                 :            : 
     629                 :          0 :         io_cqring_ev_posted(ctx);
     630                 :          0 : }
     631                 :            : 
     632                 :          0 : static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
     633                 :            :                                    struct io_submit_state *state)
     634                 :            : {
     635                 :            :         gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
     636                 :            :         struct io_kiocb *req;
     637                 :            : 
     638                 :          0 :         if (!percpu_ref_tryget(&ctx->refs))
     639                 :            :                 return NULL;
     640                 :            : 
     641                 :          0 :         if (!state) {
     642                 :          0 :                 req = kmem_cache_alloc(req_cachep, gfp);
     643                 :          0 :                 if (unlikely(!req))
     644                 :            :                         goto out;
     645                 :          0 :         } else if (!state->free_reqs) {
     646                 :            :                 size_t sz;
     647                 :            :                 int ret;
     648                 :            : 
     649                 :          0 :                 sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
     650                 :          0 :                 ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
     651                 :            : 
     652                 :            :                 /*
     653                 :            :                  * Bulk alloc is all-or-nothing. If we fail to get a batch,
     654                 :            :                  * retry single alloc to be on the safe side.
     655                 :            :                  */
     656                 :          0 :                 if (unlikely(ret <= 0)) {
     657                 :          0 :                         state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
     658                 :          0 :                         if (!state->reqs[0])
     659                 :            :                                 goto out;
     660                 :            :                         ret = 1;
     661                 :            :                 }
     662                 :          0 :                 state->free_reqs = ret - 1;
     663                 :          0 :                 state->cur_req = 1;
     664                 :          0 :                 req = state->reqs[0];
     665                 :            :         } else {
     666                 :          0 :                 req = state->reqs[state->cur_req];
     667                 :          0 :                 state->free_reqs--;
     668                 :          0 :                 state->cur_req++;
     669                 :            :         }
     670                 :            : 
     671                 :          0 :         req->file = NULL;
     672                 :          0 :         req->ctx = ctx;
     673                 :          0 :         req->flags = 0;
     674                 :            :         /* one is dropped after submission, the other at completion */
     675                 :            :         refcount_set(&req->refs, 2);
     676                 :          0 :         req->result = 0;
     677                 :          0 :         req->fs = NULL;
     678                 :          0 :         return req;
     679                 :            : out:
     680                 :            :         percpu_ref_put(&ctx->refs);
     681                 :          0 :         return NULL;
     682                 :            : }
     683                 :            : 
     684                 :          0 : static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
     685                 :            : {
     686                 :          0 :         if (*nr) {
     687                 :          0 :                 kmem_cache_free_bulk(req_cachep, *nr, reqs);
     688                 :          0 :                 percpu_ref_put_many(&ctx->refs, *nr);
     689                 :          0 :                 *nr = 0;
     690                 :            :         }
     691                 :          0 : }
     692                 :            : 
     693                 :          0 : static void __io_free_req(struct io_kiocb *req)
     694                 :            : {
     695                 :          0 :         if (req->file && !(req->flags & REQ_F_FIXED_FILE))
     696                 :          0 :                 fput(req->file);
     697                 :          0 :         percpu_ref_put(&req->ctx->refs);
     698                 :          0 :         kmem_cache_free(req_cachep, req);
     699                 :          0 : }
     700                 :            : 
     701                 :          0 : static void io_req_link_next(struct io_kiocb *req)
     702                 :            : {
     703                 :            :         struct io_kiocb *nxt;
     704                 :            : 
     705                 :            :         /*
     706                 :            :          * The list should never be empty when we are called here. But could
     707                 :            :          * potentially happen if the chain is messed up, check to be on the
     708                 :            :          * safe side.
     709                 :            :          */
     710                 :          0 :         nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
     711                 :          0 :         if (nxt) {
     712                 :            :                 list_del(&nxt->list);
     713                 :          0 :                 if (!list_empty(&req->link_list)) {
     714                 :          0 :                         INIT_LIST_HEAD(&nxt->link_list);
     715                 :            :                         list_splice(&req->link_list, &nxt->link_list);
     716                 :          0 :                         nxt->flags |= REQ_F_LINK;
     717                 :            :                 }
     718                 :            : 
     719                 :          0 :                 nxt->flags |= REQ_F_LINK_DONE;
     720                 :          0 :                 INIT_WORK(&nxt->work, io_sq_wq_submit_work);
     721                 :          0 :                 io_queue_async_work(req->ctx, nxt);
     722                 :            :         }
     723                 :          0 : }
     724                 :            : 
     725                 :            : /*
     726                 :            :  * Called if REQ_F_LINK is set, and we fail the head request
     727                 :            :  */
     728                 :          0 : static void io_fail_links(struct io_kiocb *req)
     729                 :            : {
     730                 :            :         struct io_kiocb *link;
     731                 :            : 
     732                 :          0 :         while (!list_empty(&req->link_list)) {
     733                 :          0 :                 link = list_first_entry(&req->link_list, struct io_kiocb, list);
     734                 :            :                 list_del(&link->list);
     735                 :            : 
     736                 :          0 :                 io_cqring_add_event(req->ctx, link->user_data, -ECANCELED);
     737                 :          0 :                 __io_free_req(link);
     738                 :            :         }
     739                 :          0 : }
     740                 :            : 
     741                 :          0 : static void io_free_req(struct io_kiocb *req)
     742                 :            : {
     743                 :            :         /*
     744                 :            :          * If LINK is set, we have dependent requests in this chain. If we
     745                 :            :          * didn't fail this request, queue the first one up, moving any other
     746                 :            :          * dependencies to the next request. In case of failure, fail the rest
     747                 :            :          * of the chain.
     748                 :            :          */
     749                 :          0 :         if (req->flags & REQ_F_LINK) {
     750                 :          0 :                 if (req->flags & REQ_F_FAIL_LINK)
     751                 :          0 :                         io_fail_links(req);
     752                 :            :                 else
     753                 :          0 :                         io_req_link_next(req);
     754                 :            :         }
     755                 :            : 
     756                 :          0 :         __io_free_req(req);
     757                 :          0 : }
     758                 :            : 
     759                 :          0 : static void io_put_req(struct io_kiocb *req)
     760                 :            : {
     761                 :          0 :         if (refcount_dec_and_test(&req->refs))
     762                 :          0 :                 io_free_req(req);
     763                 :          0 : }
     764                 :            : 
     765                 :            : static unsigned io_cqring_events(struct io_rings *rings)
     766                 :            : {
     767                 :            :         /* See comment at the top of this file */
     768                 :          0 :         smp_rmb();
     769                 :          0 :         return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
     770                 :            : }
     771                 :            : 
     772                 :            : static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
     773                 :            : {
     774                 :          0 :         struct io_rings *rings = ctx->rings;
     775                 :            : 
     776                 :            :         /* make sure SQ entry isn't read before tail */
     777                 :          0 :         return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
     778                 :            : }
     779                 :            : 
     780                 :            : /*
     781                 :            :  * Find and free completed poll iocbs
     782                 :            :  */
     783                 :          0 : static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
     784                 :            :                                struct list_head *done)
     785                 :            : {
     786                 :            :         void *reqs[IO_IOPOLL_BATCH];
     787                 :            :         struct io_kiocb *req;
     788                 :            :         int to_free;
     789                 :            : 
     790                 :          0 :         to_free = 0;
     791                 :          0 :         while (!list_empty(done)) {
     792                 :          0 :                 req = list_first_entry(done, struct io_kiocb, list);
     793                 :            :                 list_del(&req->list);
     794                 :            : 
     795                 :          0 :                 io_cqring_fill_event(ctx, req->user_data, req->result);
     796                 :          0 :                 (*nr_events)++;
     797                 :            : 
     798                 :          0 :                 if (refcount_dec_and_test(&req->refs)) {
     799                 :            :                         /* If we're not using fixed files, we have to pair the
     800                 :            :                          * completion part with the file put. Use regular
     801                 :            :                          * completions for those, only batch free for fixed
     802                 :            :                          * file and non-linked commands.
     803                 :            :                          */
     804                 :          0 :                         if ((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
     805                 :            :                             REQ_F_FIXED_FILE) {
     806                 :          0 :                                 reqs[to_free++] = req;
     807                 :          0 :                                 if (to_free == ARRAY_SIZE(reqs))
     808                 :          0 :                                         io_free_req_many(ctx, reqs, &to_free);
     809                 :            :                         } else {
     810                 :          0 :                                 io_free_req(req);
     811                 :            :                         }
     812                 :            :                 }
     813                 :            :         }
     814                 :            : 
     815                 :          0 :         io_commit_cqring(ctx);
     816                 :          0 :         io_free_req_many(ctx, reqs, &to_free);
     817                 :          0 : }
     818                 :            : 
     819                 :          0 : static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
     820                 :            :                         long min)
     821                 :            : {
     822                 :            :         struct io_kiocb *req, *tmp;
     823                 :          0 :         LIST_HEAD(done);
     824                 :            :         bool spin;
     825                 :            :         int ret;
     826                 :            : 
     827                 :            :         /*
     828                 :            :          * Only spin for completions if we don't have multiple devices hanging
     829                 :            :          * off our complete list, and we're under the requested amount.
     830                 :            :          */
     831                 :          0 :         spin = !ctx->poll_multi_file && *nr_events < min;
     832                 :            : 
     833                 :            :         ret = 0;
     834                 :          0 :         list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
     835                 :          0 :                 struct kiocb *kiocb = &req->rw;
     836                 :            : 
     837                 :            :                 /*
     838                 :            :                  * Move completed entries to our local list. If we find a
     839                 :            :                  * request that requires polling, break out and complete
     840                 :            :                  * the done list first, if we have entries there.
     841                 :            :                  */
     842                 :          0 :                 if (req->flags & REQ_F_IOPOLL_COMPLETED) {
     843                 :            :                         list_move_tail(&req->list, &done);
     844                 :          0 :                         continue;
     845                 :            :                 }
     846                 :          0 :                 if (!list_empty(&done))
     847                 :            :                         break;
     848                 :            : 
     849                 :          0 :                 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
     850                 :          0 :                 if (ret < 0)
     851                 :            :                         break;
     852                 :            : 
     853                 :          0 :                 if (ret && spin)
     854                 :            :                         spin = false;
     855                 :            :                 ret = 0;
     856                 :            :         }
     857                 :            : 
     858                 :          0 :         if (!list_empty(&done))
     859                 :          0 :                 io_iopoll_complete(ctx, nr_events, &done);
     860                 :            : 
     861                 :          0 :         return ret;
     862                 :            : }
     863                 :            : 
     864                 :            : /*
     865                 :            :  * Poll for a mininum of 'min' events. Note that if min == 0 we consider that a
     866                 :            :  * non-spinning poll check - we'll still enter the driver poll loop, but only
     867                 :            :  * as a non-spinning completion check.
     868                 :            :  */
     869                 :          0 : static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
     870                 :            :                                 long min)
     871                 :            : {
     872                 :          0 :         while (!list_empty(&ctx->poll_list) && !need_resched()) {
     873                 :            :                 int ret;
     874                 :            : 
     875                 :          0 :                 ret = io_do_iopoll(ctx, nr_events, min);
     876                 :          0 :                 if (ret < 0)
     877                 :          0 :                         return ret;
     878                 :          0 :                 if (!min || *nr_events >= min)
     879                 :            :                         return 0;
     880                 :            :         }
     881                 :            : 
     882                 :            :         return 1;
     883                 :            : }
     884                 :            : 
     885                 :            : /*
     886                 :            :  * We can't just wait for polled events to come to us, we have to actively
     887                 :            :  * find and complete them.
     888                 :            :  */
     889                 :          0 : static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
     890                 :            : {
     891                 :          0 :         if (!(ctx->flags & IORING_SETUP_IOPOLL))
     892                 :          0 :                 return;
     893                 :            : 
     894                 :          0 :         mutex_lock(&ctx->uring_lock);
     895                 :          0 :         while (!list_empty(&ctx->poll_list)) {
     896                 :          0 :                 unsigned int nr_events = 0;
     897                 :            : 
     898                 :          0 :                 io_iopoll_getevents(ctx, &nr_events, 1);
     899                 :            : 
     900                 :            :                 /*
     901                 :            :                  * Ensure we allow local-to-the-cpu processing to take place,
     902                 :            :                  * in this case we need to ensure that we reap all events.
     903                 :            :                  */
     904                 :          0 :                 cond_resched();
     905                 :            :         }
     906                 :          0 :         mutex_unlock(&ctx->uring_lock);
     907                 :            : }
     908                 :            : 
     909                 :          0 : static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
     910                 :            :                            long min)
     911                 :            : {
     912                 :            :         int iters = 0, ret = 0;
     913                 :            : 
     914                 :            :         /*
     915                 :            :          * We disallow the app entering submit/complete with polling, but we
     916                 :            :          * still need to lock the ring to prevent racing with polled issue
     917                 :            :          * that got punted to a workqueue.
     918                 :            :          */
     919                 :          0 :         mutex_lock(&ctx->uring_lock);
     920                 :            :         do {
     921                 :            :                 int tmin = 0;
     922                 :            : 
     923                 :            :                 /*
     924                 :            :                  * Don't enter poll loop if we already have events pending.
     925                 :            :                  * If we do, we can potentially be spinning for commands that
     926                 :            :                  * already triggered a CQE (eg in error).
     927                 :            :                  */
     928                 :          0 :                 if (io_cqring_events(ctx->rings))
     929                 :            :                         break;
     930                 :            : 
     931                 :            :                 /*
     932                 :            :                  * If a submit got punted to a workqueue, we can have the
     933                 :            :                  * application entering polling for a command before it gets
     934                 :            :                  * issued. That app will hold the uring_lock for the duration
     935                 :            :                  * of the poll right here, so we need to take a breather every
     936                 :            :                  * now and then to ensure that the issue has a chance to add
     937                 :            :                  * the poll to the issued list. Otherwise we can spin here
     938                 :            :                  * forever, while the workqueue is stuck trying to acquire the
     939                 :            :                  * very same mutex.
     940                 :            :                  */
     941                 :          0 :                 if (!(++iters & 7)) {
     942                 :          0 :                         mutex_unlock(&ctx->uring_lock);
     943                 :          0 :                         mutex_lock(&ctx->uring_lock);
     944                 :            :                 }
     945                 :            : 
     946                 :          0 :                 if (*nr_events < min)
     947                 :          0 :                         tmin = min - *nr_events;
     948                 :            : 
     949                 :          0 :                 ret = io_iopoll_getevents(ctx, nr_events, tmin);
     950                 :          0 :                 if (ret <= 0)
     951                 :            :                         break;
     952                 :            :                 ret = 0;
     953                 :          0 :         } while (min && !*nr_events && !need_resched());
     954                 :            : 
     955                 :          0 :         mutex_unlock(&ctx->uring_lock);
     956                 :          0 :         return ret;
     957                 :            : }
     958                 :            : 
     959                 :            : static void kiocb_end_write(struct io_kiocb *req)
     960                 :            : {
     961                 :            :         /*
     962                 :            :          * Tell lockdep we inherited freeze protection from submission
     963                 :            :          * thread.
     964                 :            :          */
     965                 :            :         if (req->flags & REQ_F_ISREG) {
     966                 :            :                 struct inode *inode = file_inode(req->file);
     967                 :            : 
     968                 :            :                 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
     969                 :            :         }
     970                 :          0 :         file_end_write(req->file);
     971                 :            : }
     972                 :            : 
     973                 :          0 : static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
     974                 :            : {
     975                 :            :         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
     976                 :            : 
     977                 :          0 :         if (kiocb->ki_flags & IOCB_WRITE)
     978                 :            :                 kiocb_end_write(req);
     979                 :            : 
     980                 :          0 :         if ((req->flags & REQ_F_LINK) && res != req->result)
     981                 :          0 :                 req->flags |= REQ_F_FAIL_LINK;
     982                 :          0 :         io_cqring_add_event(req->ctx, req->user_data, res);
     983                 :          0 :         io_put_req(req);
     984                 :          0 : }
     985                 :            : 
     986                 :          0 : static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
     987                 :            : {
     988                 :            :         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
     989                 :            : 
     990                 :          0 :         if (kiocb->ki_flags & IOCB_WRITE)
     991                 :            :                 kiocb_end_write(req);
     992                 :            : 
     993                 :          0 :         if ((req->flags & REQ_F_LINK) && res != req->result)
     994                 :          0 :                 req->flags |= REQ_F_FAIL_LINK;
     995                 :          0 :         req->result = res;
     996                 :          0 :         if (res != -EAGAIN)
     997                 :          0 :                 req->flags |= REQ_F_IOPOLL_COMPLETED;
     998                 :          0 : }
     999                 :            : 
    1000                 :            : /*
    1001                 :            :  * After the iocb has been issued, it's safe to be found on the poll list.
    1002                 :            :  * Adding the kiocb to the list AFTER submission ensures that we don't
    1003                 :            :  * find it from a io_iopoll_getevents() thread before the issuer is done
    1004                 :            :  * accessing the kiocb cookie.
    1005                 :            :  */
    1006                 :          0 : static void io_iopoll_req_issued(struct io_kiocb *req)
    1007                 :            : {
    1008                 :          0 :         struct io_ring_ctx *ctx = req->ctx;
    1009                 :            : 
    1010                 :            :         /*
    1011                 :            :          * Track whether we have multiple files in our lists. This will impact
    1012                 :            :          * how we do polling eventually, not spinning if we're on potentially
    1013                 :            :          * different devices.
    1014                 :            :          */
    1015                 :          0 :         if (list_empty(&ctx->poll_list)) {
    1016                 :          0 :                 ctx->poll_multi_file = false;
    1017                 :          0 :         } else if (!ctx->poll_multi_file) {
    1018                 :            :                 struct io_kiocb *list_req;
    1019                 :            : 
    1020                 :          0 :                 list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
    1021                 :            :                                                 list);
    1022                 :          0 :                 if (list_req->rw.ki_filp != req->rw.ki_filp)
    1023                 :          0 :                         ctx->poll_multi_file = true;
    1024                 :            :         }
    1025                 :            : 
    1026                 :            :         /*
    1027                 :            :          * For fast devices, IO may have already completed. If it has, add
    1028                 :            :          * it to the front so we find it first.
    1029                 :            :          */
    1030                 :          0 :         if (req->flags & REQ_F_IOPOLL_COMPLETED)
    1031                 :          0 :                 list_add(&req->list, &ctx->poll_list);
    1032                 :            :         else
    1033                 :          0 :                 list_add_tail(&req->list, &ctx->poll_list);
    1034                 :          0 : }
    1035                 :            : 
    1036                 :          0 : static void io_file_put(struct io_submit_state *state)
    1037                 :            : {
    1038                 :          0 :         if (state->file) {
    1039                 :          0 :                 int diff = state->has_refs - state->used_refs;
    1040                 :            : 
    1041                 :          0 :                 if (diff)
    1042                 :          0 :                         fput_many(state->file, diff);
    1043                 :          0 :                 state->file = NULL;
    1044                 :            :         }
    1045                 :          0 : }
    1046                 :            : 
    1047                 :            : /*
    1048                 :            :  * Get as many references to a file as we have IOs left in this submission,
    1049                 :            :  * assuming most submissions are for one file, or at least that each file
    1050                 :            :  * has more than one submission.
    1051                 :            :  */
    1052                 :          0 : static struct file *io_file_get(struct io_submit_state *state, int fd)
    1053                 :            : {
    1054                 :          0 :         if (!state)
    1055                 :          0 :                 return fget(fd);
    1056                 :            : 
    1057                 :          0 :         if (state->file) {
    1058                 :          0 :                 if (state->fd == fd) {
    1059                 :          0 :                         state->used_refs++;
    1060                 :          0 :                         state->ios_left--;
    1061                 :          0 :                         return state->file;
    1062                 :            :                 }
    1063                 :          0 :                 io_file_put(state);
    1064                 :            :         }
    1065                 :          0 :         state->file = fget_many(fd, state->ios_left);
    1066                 :          0 :         if (!state->file)
    1067                 :            :                 return NULL;
    1068                 :            : 
    1069                 :          0 :         state->fd = fd;
    1070                 :          0 :         state->has_refs = state->ios_left;
    1071                 :          0 :         state->used_refs = 1;
    1072                 :          0 :         state->ios_left--;
    1073                 :          0 :         return state->file;
    1074                 :            : }
    1075                 :            : 
    1076                 :            : /*
    1077                 :            :  * If we tracked the file through the SCM inflight mechanism, we could support
    1078                 :            :  * any file. For now, just ensure that anything potentially problematic is done
    1079                 :            :  * inline.
    1080                 :            :  */
    1081                 :            : static bool io_file_supports_async(struct file *file)
    1082                 :            : {
    1083                 :          0 :         umode_t mode = file_inode(file)->i_mode;
    1084                 :            : 
    1085                 :          0 :         if (S_ISBLK(mode) || S_ISCHR(mode))
    1086                 :            :                 return true;
    1087                 :          0 :         if (S_ISREG(mode) && file->f_op != &io_uring_fops)
    1088                 :            :                 return true;
    1089                 :            : 
    1090                 :            :         return false;
    1091                 :            : }
    1092                 :            : 
    1093                 :          0 : static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
    1094                 :            :                       bool force_nonblock)
    1095                 :            : {
    1096                 :          0 :         const struct io_uring_sqe *sqe = s->sqe;
    1097                 :          0 :         struct io_ring_ctx *ctx = req->ctx;
    1098                 :          0 :         struct kiocb *kiocb = &req->rw;
    1099                 :            :         unsigned ioprio;
    1100                 :            :         int ret;
    1101                 :            : 
    1102                 :          0 :         if (!req->file)
    1103                 :            :                 return -EBADF;
    1104                 :            : 
    1105                 :          0 :         if (S_ISREG(file_inode(req->file)->i_mode))
    1106                 :          0 :                 req->flags |= REQ_F_ISREG;
    1107                 :            : 
    1108                 :          0 :         if (force_nonblock)
    1109                 :          0 :                 req->fsize = rlimit(RLIMIT_FSIZE);
    1110                 :            : 
    1111                 :            :         /*
    1112                 :            :          * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
    1113                 :            :          * we know to async punt it even if it was opened O_NONBLOCK
    1114                 :            :          */
    1115                 :          0 :         if (force_nonblock && !io_file_supports_async(req->file)) {
    1116                 :          0 :                 req->flags |= REQ_F_MUST_PUNT;
    1117                 :          0 :                 return -EAGAIN;
    1118                 :            :         }
    1119                 :            : 
    1120                 :          0 :         kiocb->ki_pos = READ_ONCE(sqe->off);
    1121                 :          0 :         kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
    1122                 :          0 :         kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
    1123                 :            : 
    1124                 :          0 :         ioprio = READ_ONCE(sqe->ioprio);
    1125                 :          0 :         if (ioprio) {
    1126                 :          0 :                 ret = ioprio_check_cap(ioprio);
    1127                 :          0 :                 if (ret)
    1128                 :            :                         return ret;
    1129                 :            : 
    1130                 :          0 :                 kiocb->ki_ioprio = ioprio;
    1131                 :            :         } else
    1132                 :          0 :                 kiocb->ki_ioprio = get_current_ioprio();
    1133                 :            : 
    1134                 :          0 :         ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
    1135                 :          0 :         if (unlikely(ret))
    1136                 :            :                 return ret;
    1137                 :            : 
    1138                 :            :         /* don't allow async punt if RWF_NOWAIT was requested */
    1139                 :          0 :         if ((kiocb->ki_flags & IOCB_NOWAIT) ||
    1140                 :          0 :             (req->file->f_flags & O_NONBLOCK))
    1141                 :          0 :                 req->flags |= REQ_F_NOWAIT;
    1142                 :            : 
    1143                 :          0 :         if (force_nonblock)
    1144                 :          0 :                 kiocb->ki_flags |= IOCB_NOWAIT;
    1145                 :            : 
    1146                 :          0 :         if (ctx->flags & IORING_SETUP_IOPOLL) {
    1147                 :          0 :                 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
    1148                 :          0 :                     !kiocb->ki_filp->f_op->iopoll)
    1149                 :            :                         return -EOPNOTSUPP;
    1150                 :            : 
    1151                 :          0 :                 kiocb->ki_flags |= IOCB_HIPRI;
    1152                 :          0 :                 kiocb->ki_complete = io_complete_rw_iopoll;
    1153                 :          0 :                 req->result = 0;
    1154                 :            :         } else {
    1155                 :          0 :                 if (kiocb->ki_flags & IOCB_HIPRI)
    1156                 :            :                         return -EINVAL;
    1157                 :          0 :                 kiocb->ki_complete = io_complete_rw;
    1158                 :            :         }
    1159                 :            :         return 0;
    1160                 :            : }
    1161                 :            : 
    1162                 :          0 : static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
    1163                 :            : {
    1164                 :          0 :         switch (ret) {
    1165                 :            :         case -EIOCBQUEUED:
    1166                 :            :                 break;
    1167                 :            :         case -ERESTARTSYS:
    1168                 :            :         case -ERESTARTNOINTR:
    1169                 :            :         case -ERESTARTNOHAND:
    1170                 :            :         case -ERESTART_RESTARTBLOCK:
    1171                 :            :                 /*
    1172                 :            :                  * We can't just restart the syscall, since previously
    1173                 :            :                  * submitted sqes may already be in progress. Just fail this
    1174                 :            :                  * IO with EINTR.
    1175                 :            :                  */
    1176                 :            :                 ret = -EINTR;
    1177                 :            :                 /* fall through */
    1178                 :            :         default:
    1179                 :          0 :                 kiocb->ki_complete(kiocb, ret, 0);
    1180                 :            :         }
    1181                 :          0 : }
    1182                 :            : 
    1183                 :          0 : static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
    1184                 :            :                            const struct io_uring_sqe *sqe,
    1185                 :            :                            struct iov_iter *iter)
    1186                 :            : {
    1187                 :            :         size_t len = READ_ONCE(sqe->len);
    1188                 :            :         struct io_mapped_ubuf *imu;
    1189                 :            :         unsigned index, buf_index;
    1190                 :            :         size_t offset;
    1191                 :            :         u64 buf_addr;
    1192                 :            : 
    1193                 :            :         /* attempt to use fixed buffers without having provided iovecs */
    1194                 :          0 :         if (unlikely(!ctx->user_bufs))
    1195                 :            :                 return -EFAULT;
    1196                 :            : 
    1197                 :          0 :         buf_index = READ_ONCE(sqe->buf_index);
    1198                 :          0 :         if (unlikely(buf_index >= ctx->nr_user_bufs))
    1199                 :            :                 return -EFAULT;
    1200                 :            : 
    1201                 :          0 :         index = array_index_nospec(buf_index, ctx->nr_user_bufs);
    1202                 :          0 :         imu = &ctx->user_bufs[index];
    1203                 :            :         buf_addr = READ_ONCE(sqe->addr);
    1204                 :            : 
    1205                 :            :         /* overflow */
    1206                 :          0 :         if (buf_addr + len < buf_addr)
    1207                 :            :                 return -EFAULT;
    1208                 :            :         /* not inside the mapped region */
    1209                 :          0 :         if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
    1210                 :            :                 return -EFAULT;
    1211                 :            : 
    1212                 :            :         /*
    1213                 :            :          * May not be a start of buffer, set size appropriately
    1214                 :            :          * and advance us to the beginning.
    1215                 :            :          */
    1216                 :          0 :         offset = buf_addr - imu->ubuf;
    1217                 :          0 :         iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
    1218                 :            : 
    1219                 :          0 :         if (offset) {
    1220                 :            :                 /*
    1221                 :            :                  * Don't use iov_iter_advance() here, as it's really slow for
    1222                 :            :                  * using the latter parts of a big fixed buffer - it iterates
    1223                 :            :                  * over each segment manually. We can cheat a bit here, because
    1224                 :            :                  * we know that:
    1225                 :            :                  *
    1226                 :            :                  * 1) it's a BVEC iter, we set it up
    1227                 :            :                  * 2) all bvecs are PAGE_SIZE in size, except potentially the
    1228                 :            :                  *    first and last bvec
    1229                 :            :                  *
    1230                 :            :                  * So just find our index, and adjust the iterator afterwards.
    1231                 :            :                  * If the offset is within the first bvec (or the whole first
    1232                 :            :                  * bvec, just use iov_iter_advance(). This makes it easier
    1233                 :            :                  * since we can just skip the first segment, which may not
    1234                 :            :                  * be PAGE_SIZE aligned.
    1235                 :            :                  */
    1236                 :          0 :                 const struct bio_vec *bvec = imu->bvec;
    1237                 :            : 
    1238                 :          0 :                 if (offset <= bvec->bv_len) {
    1239                 :          0 :                         iov_iter_advance(iter, offset);
    1240                 :            :                 } else {
    1241                 :            :                         unsigned long seg_skip;
    1242                 :            : 
    1243                 :            :                         /* skip first vec */
    1244                 :          0 :                         offset -= bvec->bv_len;
    1245                 :          0 :                         seg_skip = 1 + (offset >> PAGE_SHIFT);
    1246                 :            : 
    1247                 :          0 :                         iter->bvec = bvec + seg_skip;
    1248                 :          0 :                         iter->nr_segs -= seg_skip;
    1249                 :          0 :                         iter->count -= bvec->bv_len + offset;
    1250                 :          0 :                         iter->iov_offset = offset & ~PAGE_MASK;
    1251                 :            :                 }
    1252                 :            :         }
    1253                 :            : 
    1254                 :          0 :         return len;
    1255                 :            : }
    1256                 :            : 
    1257                 :          0 : static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
    1258                 :            :                                struct io_kiocb *req, struct iovec **iovec,
    1259                 :            :                                struct iov_iter *iter)
    1260                 :            : {
    1261                 :          0 :         const struct io_uring_sqe *sqe = req->submit.sqe;
    1262                 :          0 :         void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
    1263                 :            :         size_t sqe_len = READ_ONCE(sqe->len);
    1264                 :            :         u8 opcode;
    1265                 :            : 
    1266                 :          0 :         opcode = req->submit.opcode;
    1267                 :          0 :         if (opcode == IORING_OP_READ_FIXED ||
    1268                 :            :             opcode == IORING_OP_WRITE_FIXED) {
    1269                 :          0 :                 ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
    1270                 :          0 :                 *iovec = NULL;
    1271                 :          0 :                 return ret;
    1272                 :            :         }
    1273                 :            : 
    1274                 :          0 :         if (!req->submit.has_user)
    1275                 :            :                 return -EFAULT;
    1276                 :            : 
    1277                 :            : #ifdef CONFIG_COMPAT
    1278                 :            :         if (ctx->compat)
    1279                 :            :                 return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
    1280                 :            :                                                 iovec, iter);
    1281                 :            : #endif
    1282                 :            : 
    1283                 :          0 :         return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
    1284                 :            : }
    1285                 :            : 
    1286                 :            : static inline bool io_should_merge(struct async_list *al, struct kiocb *kiocb)
    1287                 :            : {
    1288                 :          0 :         if (al->file == kiocb->ki_filp) {
    1289                 :            :                 off_t start, end;
    1290                 :            : 
    1291                 :            :                 /*
    1292                 :            :                  * Allow merging if we're anywhere in the range of the same
    1293                 :            :                  * page. Generally this happens for sub-page reads or writes,
    1294                 :            :                  * and it's beneficial to allow the first worker to bring the
    1295                 :            :                  * page in and the piggy backed work can then work on the
    1296                 :            :                  * cached page.
    1297                 :            :                  */
    1298                 :          0 :                 start = al->io_start & PAGE_MASK;
    1299                 :          0 :                 end = (al->io_start + al->io_len + PAGE_SIZE - 1) & PAGE_MASK;
    1300                 :          0 :                 if (kiocb->ki_pos >= start && kiocb->ki_pos <= end)
    1301                 :            :                         return true;
    1302                 :            :         }
    1303                 :            : 
    1304                 :          0 :         al->file = NULL;
    1305                 :            :         return false;
    1306                 :            : }
    1307                 :            : 
    1308                 :            : /*
    1309                 :            :  * Make a note of the last file/offset/direction we punted to async
    1310                 :            :  * context. We'll use this information to see if we can piggy back a
    1311                 :            :  * sequential request onto the previous one, if it still hasn't been
    1312                 :            :  * completed by the async worker.
    1313                 :            :  */
    1314                 :          0 : static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
    1315                 :            : {
    1316                 :          0 :         struct async_list *async_list = &req->ctx->pending_async[rw];
    1317                 :            :         struct kiocb *kiocb = &req->rw;
    1318                 :          0 :         struct file *filp = kiocb->ki_filp;
    1319                 :            : 
    1320                 :          0 :         if (io_should_merge(async_list, kiocb)) {
    1321                 :            :                 unsigned long max_bytes;
    1322                 :            : 
    1323                 :            :                 /* Use 8x RA size as a decent limiter for both reads/writes */
    1324                 :          0 :                 max_bytes = filp->f_ra.ra_pages << (PAGE_SHIFT + 3);
    1325                 :          0 :                 if (!max_bytes)
    1326                 :            :                         max_bytes = VM_READAHEAD_PAGES << (PAGE_SHIFT + 3);
    1327                 :            : 
    1328                 :            :                 /* If max len is exceeded, reset the state */
    1329                 :          0 :                 if (async_list->io_len + len <= max_bytes) {
    1330                 :          0 :                         req->flags |= REQ_F_SEQ_PREV;
    1331                 :          0 :                         async_list->io_len += len;
    1332                 :            :                 } else {
    1333                 :          0 :                         async_list->file = NULL;
    1334                 :            :                 }
    1335                 :            :         }
    1336                 :            : 
    1337                 :            :         /* New file? Reset state. */
    1338                 :          0 :         if (async_list->file != filp) {
    1339                 :          0 :                 async_list->io_start = kiocb->ki_pos;
    1340                 :          0 :                 async_list->io_len = len;
    1341                 :          0 :                 async_list->file = filp;
    1342                 :            :         }
    1343                 :          0 : }
    1344                 :            : 
    1345                 :            : /*
    1346                 :            :  * For files that don't have ->read_iter() and ->write_iter(), handle them
    1347                 :            :  * by looping over ->read() or ->write() manually.
    1348                 :            :  */
    1349                 :          0 : static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
    1350                 :            :                            struct iov_iter *iter)
    1351                 :            : {
    1352                 :            :         ssize_t ret = 0;
    1353                 :            : 
    1354                 :            :         /*
    1355                 :            :          * Don't support polled IO through this interface, and we can't
    1356                 :            :          * support non-blocking either. For the latter, this just causes
    1357                 :            :          * the kiocb to be handled from an async context.
    1358                 :            :          */
    1359                 :          0 :         if (kiocb->ki_flags & IOCB_HIPRI)
    1360                 :            :                 return -EOPNOTSUPP;
    1361                 :          0 :         if (kiocb->ki_flags & IOCB_NOWAIT)
    1362                 :            :                 return -EAGAIN;
    1363                 :            : 
    1364                 :          0 :         while (iov_iter_count(iter)) {
    1365                 :            :                 struct iovec iovec;
    1366                 :            :                 ssize_t nr;
    1367                 :            : 
    1368                 :          0 :                 if (!iov_iter_is_bvec(iter)) {
    1369                 :            :                         iovec = iov_iter_iovec(iter);
    1370                 :            :                 } else {
    1371                 :            :                         /* fixed buffers import bvec */
    1372                 :          0 :                         iovec.iov_base = kmap(iter->bvec->bv_page)
    1373                 :          0 :                                                 + iter->iov_offset;
    1374                 :          0 :                         iovec.iov_len = min(iter->count,
    1375                 :            :                                         iter->bvec->bv_len - iter->iov_offset);
    1376                 :            :                 }
    1377                 :            : 
    1378                 :          0 :                 if (rw == READ) {
    1379                 :          0 :                         nr = file->f_op->read(file, iovec.iov_base,
    1380                 :            :                                               iovec.iov_len, &kiocb->ki_pos);
    1381                 :            :                 } else {
    1382                 :          0 :                         nr = file->f_op->write(file, iovec.iov_base,
    1383                 :            :                                                iovec.iov_len, &kiocb->ki_pos);
    1384                 :            :                 }
    1385                 :            : 
    1386                 :            :                 if (iov_iter_is_bvec(iter))
    1387                 :            :                         kunmap(iter->bvec->bv_page);
    1388                 :            : 
    1389                 :          0 :                 if (nr < 0) {
    1390                 :          0 :                         if (!ret)
    1391                 :          0 :                                 ret = nr;
    1392                 :            :                         break;
    1393                 :            :                 }
    1394                 :          0 :                 ret += nr;
    1395                 :          0 :                 if (nr != iovec.iov_len)
    1396                 :            :                         break;
    1397                 :          0 :                 iov_iter_advance(iter, nr);
    1398                 :            :         }
    1399                 :            : 
    1400                 :          0 :         return ret;
    1401                 :            : }
    1402                 :            : 
    1403                 :          0 : static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
    1404                 :            :                    bool force_nonblock)
    1405                 :            : {
    1406                 :          0 :         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
    1407                 :          0 :         struct kiocb *kiocb = &req->rw;
    1408                 :            :         struct iov_iter iter;
    1409                 :            :         struct file *file;
    1410                 :            :         size_t iov_count;
    1411                 :            :         ssize_t read_size, ret;
    1412                 :            : 
    1413                 :          0 :         ret = io_prep_rw(req, s, force_nonblock);
    1414                 :          0 :         if (ret)
    1415                 :            :                 return ret;
    1416                 :          0 :         file = kiocb->ki_filp;
    1417                 :            : 
    1418                 :          0 :         if (unlikely(!(file->f_mode & FMODE_READ)))
    1419                 :            :                 return -EBADF;
    1420                 :            : 
    1421                 :          0 :         ret = io_import_iovec(req->ctx, READ, req, &iovec, &iter);
    1422                 :          0 :         if (ret < 0)
    1423                 :            :                 return ret;
    1424                 :            : 
    1425                 :            :         read_size = ret;
    1426                 :          0 :         if (req->flags & REQ_F_LINK)
    1427                 :          0 :                 req->result = read_size;
    1428                 :            : 
    1429                 :            :         iov_count = iov_iter_count(&iter);
    1430                 :          0 :         ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count);
    1431                 :          0 :         if (!ret) {
    1432                 :            :                 ssize_t ret2;
    1433                 :            : 
    1434                 :          0 :                 if (file->f_op->read_iter)
    1435                 :            :                         ret2 = call_read_iter(file, kiocb, &iter);
    1436                 :          0 :                 else if (req->file->f_op->read)
    1437                 :          0 :                         ret2 = loop_rw_iter(READ, file, kiocb, &iter);
    1438                 :            :                 else
    1439                 :            :                         ret2 = -EINVAL;
    1440                 :            : 
    1441                 :            :                 /*
    1442                 :            :                  * In case of a short read, punt to async. This can happen
    1443                 :            :                  * if we have data partially cached. Alternatively we can
    1444                 :            :                  * return the short read, in which case the application will
    1445                 :            :                  * need to issue another SQE and wait for it. That SQE will
    1446                 :            :                  * need async punt anyway, so it's more efficient to do it
    1447                 :            :                  * here.
    1448                 :            :                  */
    1449                 :          0 :                 if (force_nonblock && !(req->flags & REQ_F_NOWAIT) &&
    1450                 :            :                     (req->flags & REQ_F_ISREG) &&
    1451                 :          0 :                     ret2 > 0 && ret2 < read_size)
    1452                 :            :                         ret2 = -EAGAIN;
    1453                 :            :                 /* Catch -EAGAIN return for forced non-blocking submission */
    1454                 :          0 :                 if (!force_nonblock || ret2 != -EAGAIN) {
    1455                 :          0 :                         io_rw_done(kiocb, ret2);
    1456                 :            :                 } else {
    1457                 :            :                         /*
    1458                 :            :                          * If ->needs_lock is true, we're already in async
    1459                 :            :                          * context.
    1460                 :            :                          */
    1461                 :          0 :                         if (!s->needs_lock)
    1462                 :          0 :                                 io_async_list_note(READ, req, iov_count);
    1463                 :            :                         ret = -EAGAIN;
    1464                 :            :                 }
    1465                 :            :         }
    1466                 :          0 :         kfree(iovec);
    1467                 :          0 :         return ret;
    1468                 :            : }
    1469                 :            : 
    1470                 :          0 : static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
    1471                 :            :                     bool force_nonblock)
    1472                 :            : {
    1473                 :          0 :         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
    1474                 :          0 :         struct kiocb *kiocb = &req->rw;
    1475                 :            :         struct iov_iter iter;
    1476                 :            :         struct file *file;
    1477                 :            :         size_t iov_count;
    1478                 :            :         ssize_t ret;
    1479                 :            : 
    1480                 :          0 :         ret = io_prep_rw(req, s, force_nonblock);
    1481                 :          0 :         if (ret)
    1482                 :            :                 return ret;
    1483                 :            : 
    1484                 :          0 :         file = kiocb->ki_filp;
    1485                 :          0 :         if (unlikely(!(file->f_mode & FMODE_WRITE)))
    1486                 :            :                 return -EBADF;
    1487                 :            : 
    1488                 :          0 :         ret = io_import_iovec(req->ctx, WRITE, req, &iovec, &iter);
    1489                 :          0 :         if (ret < 0)
    1490                 :            :                 return ret;
    1491                 :            : 
    1492                 :          0 :         if (req->flags & REQ_F_LINK)
    1493                 :          0 :                 req->result = ret;
    1494                 :            : 
    1495                 :            :         iov_count = iov_iter_count(&iter);
    1496                 :            : 
    1497                 :            :         ret = -EAGAIN;
    1498                 :          0 :         if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) {
    1499                 :            :                 /* If ->needs_lock is true, we're already in async context. */
    1500                 :          0 :                 if (!s->needs_lock)
    1501                 :          0 :                         io_async_list_note(WRITE, req, iov_count);
    1502                 :            :                 goto out_free;
    1503                 :            :         }
    1504                 :            : 
    1505                 :          0 :         ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count);
    1506                 :          0 :         if (!ret) {
    1507                 :            :                 ssize_t ret2;
    1508                 :            : 
    1509                 :            :                 /*
    1510                 :            :                  * Open-code file_start_write here to grab freeze protection,
    1511                 :            :                  * which will be released by another thread in
    1512                 :            :                  * io_complete_rw().  Fool lockdep by telling it the lock got
    1513                 :            :                  * released so that it doesn't complain about the held lock when
    1514                 :            :                  * we return to userspace.
    1515                 :            :                  */
    1516                 :          0 :                 if (req->flags & REQ_F_ISREG) {
    1517                 :          0 :                         __sb_start_write(file_inode(file)->i_sb,
    1518                 :            :                                                 SB_FREEZE_WRITE, true);
    1519                 :            :                         __sb_writers_release(file_inode(file)->i_sb,
    1520                 :            :                                                 SB_FREEZE_WRITE);
    1521                 :            :                 }
    1522                 :          0 :                 kiocb->ki_flags |= IOCB_WRITE;
    1523                 :            : 
    1524                 :          0 :                 if (!force_nonblock)
    1525                 :          0 :                         current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
    1526                 :            : 
    1527                 :          0 :                 if (file->f_op->write_iter)
    1528                 :            :                         ret2 = call_write_iter(file, kiocb, &iter);
    1529                 :          0 :                 else if (req->file->f_op->write)
    1530                 :          0 :                         ret2 = loop_rw_iter(WRITE, file, kiocb, &iter);
    1531                 :            :                 else
    1532                 :            :                         ret2 = -EINVAL;
    1533                 :            : 
    1534                 :          0 :                 if (!force_nonblock)
    1535                 :          0 :                         current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
    1536                 :            : 
    1537                 :          0 :                 if (!force_nonblock || ret2 != -EAGAIN) {
    1538                 :          0 :                         io_rw_done(kiocb, ret2);
    1539                 :            :                 } else {
    1540                 :            :                         /*
    1541                 :            :                          * If ->needs_lock is true, we're already in async
    1542                 :            :                          * context.
    1543                 :            :                          */
    1544                 :          0 :                         if (!s->needs_lock)
    1545                 :          0 :                                 io_async_list_note(WRITE, req, iov_count);
    1546                 :            :                         ret = -EAGAIN;
    1547                 :            :                 }
    1548                 :            :         }
    1549                 :            : out_free:
    1550                 :          0 :         kfree(iovec);
    1551                 :          0 :         return ret;
    1552                 :            : }
    1553                 :            : 
    1554                 :            : /*
    1555                 :            :  * IORING_OP_NOP just posts a completion event, nothing else.
    1556                 :            :  */
    1557                 :          0 : static int io_nop(struct io_kiocb *req, u64 user_data)
    1558                 :            : {
    1559                 :          0 :         struct io_ring_ctx *ctx = req->ctx;
    1560                 :            :         long err = 0;
    1561                 :            : 
    1562                 :          0 :         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
    1563                 :            :                 return -EINVAL;
    1564                 :            : 
    1565                 :          0 :         io_cqring_add_event(ctx, user_data, err);
    1566                 :          0 :         io_put_req(req);
    1567                 :          0 :         return 0;
    1568                 :            : }
    1569                 :            : 
    1570                 :          0 : static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
    1571                 :            : {
    1572                 :          0 :         struct io_ring_ctx *ctx = req->ctx;
    1573                 :            : 
    1574                 :          0 :         if (!req->file)
    1575                 :            :                 return -EBADF;
    1576                 :            : 
    1577                 :          0 :         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
    1578                 :            :                 return -EINVAL;
    1579                 :          0 :         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
    1580                 :            :                 return -EINVAL;
    1581                 :            : 
    1582                 :          0 :         return 0;
    1583                 :            : }
    1584                 :            : 
    1585                 :          0 : static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
    1586                 :            :                     bool force_nonblock)
    1587                 :            : {
    1588                 :          0 :         loff_t sqe_off = READ_ONCE(sqe->off);
    1589                 :          0 :         loff_t sqe_len = READ_ONCE(sqe->len);
    1590                 :          0 :         loff_t end = sqe_off + sqe_len;
    1591                 :            :         unsigned fsync_flags;
    1592                 :            :         int ret;
    1593                 :            : 
    1594                 :            :         fsync_flags = READ_ONCE(sqe->fsync_flags);
    1595                 :          0 :         if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC))
    1596                 :            :                 return -EINVAL;
    1597                 :            : 
    1598                 :          0 :         ret = io_prep_fsync(req, sqe);
    1599                 :          0 :         if (ret)
    1600                 :            :                 return ret;
    1601                 :            : 
    1602                 :            :         /* fsync always requires a blocking context */
    1603                 :          0 :         if (force_nonblock)
    1604                 :            :                 return -EAGAIN;
    1605                 :            : 
    1606                 :          0 :         ret = vfs_fsync_range(req->rw.ki_filp, sqe_off,
    1607                 :            :                                 end > 0 ? end : LLONG_MAX,
    1608                 :            :                                 fsync_flags & IORING_FSYNC_DATASYNC);
    1609                 :            : 
    1610                 :          0 :         if (ret < 0 && (req->flags & REQ_F_LINK))
    1611                 :          0 :                 req->flags |= REQ_F_FAIL_LINK;
    1612                 :          0 :         io_cqring_add_event(req->ctx, sqe->user_data, ret);
    1613                 :          0 :         io_put_req(req);
    1614                 :          0 :         return 0;
    1615                 :            : }
    1616                 :            : 
    1617                 :          0 : static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
    1618                 :            : {
    1619                 :          0 :         struct io_ring_ctx *ctx = req->ctx;
    1620                 :            :         int ret = 0;
    1621                 :            : 
    1622                 :          0 :         if (!req->file)
    1623                 :            :                 return -EBADF;
    1624                 :            : 
    1625                 :          0 :         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
    1626                 :            :                 return -EINVAL;
    1627                 :          0 :         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
    1628                 :            :                 return -EINVAL;
    1629                 :            : 
    1630                 :          0 :         return ret;
    1631                 :            : }
    1632                 :            : 
    1633                 :          0 : static int io_sync_file_range(struct io_kiocb *req,
    1634                 :            :                               const struct io_uring_sqe *sqe,
    1635                 :            :                               bool force_nonblock)
    1636                 :            : {
    1637                 :            :         loff_t sqe_off;
    1638                 :            :         loff_t sqe_len;
    1639                 :            :         unsigned flags;
    1640                 :            :         int ret;
    1641                 :            : 
    1642                 :          0 :         ret = io_prep_sfr(req, sqe);
    1643                 :          0 :         if (ret)
    1644                 :            :                 return ret;
    1645                 :            : 
    1646                 :            :         /* sync_file_range always requires a blocking context */
    1647                 :          0 :         if (force_nonblock)
    1648                 :            :                 return -EAGAIN;
    1649                 :            : 
    1650                 :          0 :         sqe_off = READ_ONCE(sqe->off);
    1651                 :          0 :         sqe_len = READ_ONCE(sqe->len);
    1652                 :            :         flags = READ_ONCE(sqe->sync_range_flags);
    1653                 :            : 
    1654                 :          0 :         ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags);
    1655                 :            : 
    1656                 :          0 :         if (ret < 0 && (req->flags & REQ_F_LINK))
    1657                 :          0 :                 req->flags |= REQ_F_FAIL_LINK;
    1658                 :          0 :         io_cqring_add_event(req->ctx, sqe->user_data, ret);
    1659                 :          0 :         io_put_req(req);
    1660                 :          0 :         return 0;
    1661                 :            : }
    1662                 :            : 
    1663                 :            : #if defined(CONFIG_NET)
    1664                 :          0 : static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
    1665                 :            :                            bool force_nonblock,
    1666                 :            :                    long (*fn)(struct socket *, struct user_msghdr __user *,
    1667                 :            :                                 unsigned int))
    1668                 :            : {
    1669                 :            :         struct socket *sock;
    1670                 :            :         int ret;
    1671                 :            : 
    1672                 :          0 :         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
    1673                 :            :                 return -EINVAL;
    1674                 :            : 
    1675                 :          0 :         sock = sock_from_file(req->file, &ret);
    1676                 :          0 :         if (sock) {
    1677                 :            :                 struct user_msghdr __user *msg;
    1678                 :            :                 unsigned flags;
    1679                 :            : 
    1680                 :            :                 flags = READ_ONCE(sqe->msg_flags);
    1681                 :          0 :                 if (flags & MSG_DONTWAIT)
    1682                 :          0 :                         req->flags |= REQ_F_NOWAIT;
    1683                 :          0 :                 else if (force_nonblock)
    1684                 :          0 :                         flags |= MSG_DONTWAIT;
    1685                 :            : 
    1686                 :            : #ifdef CONFIG_COMPAT
    1687                 :            :                 if (req->ctx->compat)
    1688                 :            :                         flags |= MSG_CMSG_COMPAT;
    1689                 :            : #endif
    1690                 :            : 
    1691                 :          0 :                 msg = (struct user_msghdr __user *) (unsigned long)
    1692                 :            :                         READ_ONCE(sqe->addr);
    1693                 :            : 
    1694                 :          0 :                 ret = fn(sock, msg, flags);
    1695                 :          0 :                 if (force_nonblock && ret == -EAGAIN)
    1696                 :            :                         return ret;
    1697                 :          0 :                 if (ret == -ERESTARTSYS)
    1698                 :          0 :                         ret = -EINTR;
    1699                 :            :         }
    1700                 :            : 
    1701                 :          0 :         if (req->fs) {
    1702                 :            :                 struct fs_struct *fs = req->fs;
    1703                 :            : 
    1704                 :            :                 spin_lock(&req->fs->lock);
    1705                 :          0 :                 if (--fs->users)
    1706                 :            :                         fs = NULL;
    1707                 :          0 :                 spin_unlock(&req->fs->lock);
    1708                 :          0 :                 if (fs)
    1709                 :          0 :                         free_fs_struct(fs);
    1710                 :            :         }
    1711                 :          0 :         io_cqring_add_event(req->ctx, sqe->user_data, ret);
    1712                 :          0 :         io_put_req(req);
    1713                 :          0 :         return 0;
    1714                 :            : }
    1715                 :            : #endif
    1716                 :            : 
    1717                 :            : static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
    1718                 :            :                       bool force_nonblock)
    1719                 :            : {
    1720                 :            : #if defined(CONFIG_NET)
    1721                 :          0 :         return io_send_recvmsg(req, sqe, force_nonblock, __sys_sendmsg_sock);
    1722                 :            : #else
    1723                 :            :         return -EOPNOTSUPP;
    1724                 :            : #endif
    1725                 :            : }
    1726                 :            : 
    1727                 :            : static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
    1728                 :            :                       bool force_nonblock)
    1729                 :            : {
    1730                 :            : #if defined(CONFIG_NET)
    1731                 :          0 :         return io_send_recvmsg(req, sqe, force_nonblock, __sys_recvmsg_sock);
    1732                 :            : #else
    1733                 :            :         return -EOPNOTSUPP;
    1734                 :            : #endif
    1735                 :            : }
    1736                 :            : 
    1737                 :          0 : static void io_poll_remove_one(struct io_kiocb *req)
    1738                 :            : {
    1739                 :            :         struct io_poll_iocb *poll = &req->poll;
    1740                 :            : 
    1741                 :          0 :         spin_lock(&poll->head->lock);
    1742                 :            :         WRITE_ONCE(poll->canceled, true);
    1743                 :          0 :         if (!list_empty(&poll->wait.entry)) {
    1744                 :            :                 list_del_init(&poll->wait.entry);
    1745                 :          0 :                 io_queue_async_work(req->ctx, req);
    1746                 :            :         }
    1747                 :          0 :         spin_unlock(&poll->head->lock);
    1748                 :            : 
    1749                 :          0 :         list_del_init(&req->list);
    1750                 :          0 : }
    1751                 :            : 
    1752                 :          0 : static void io_poll_remove_all(struct io_ring_ctx *ctx)
    1753                 :            : {
    1754                 :            :         struct io_kiocb *req;
    1755                 :            : 
    1756                 :            :         spin_lock_irq(&ctx->completion_lock);
    1757                 :          0 :         while (!list_empty(&ctx->cancel_list)) {
    1758                 :          0 :                 req = list_first_entry(&ctx->cancel_list, struct io_kiocb,list);
    1759                 :          0 :                 io_poll_remove_one(req);
    1760                 :            :         }
    1761                 :            :         spin_unlock_irq(&ctx->completion_lock);
    1762                 :          0 : }
    1763                 :            : 
    1764                 :            : /*
    1765                 :            :  * Find a running poll command that matches one specified in sqe->addr,
    1766                 :            :  * and remove it if found.
    1767                 :            :  */
    1768                 :          0 : static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
    1769                 :            : {
    1770                 :          0 :         struct io_ring_ctx *ctx = req->ctx;
    1771                 :            :         struct io_kiocb *poll_req, *next;
    1772                 :            :         int ret = -ENOENT;
    1773                 :            : 
    1774                 :          0 :         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
    1775                 :            :                 return -EINVAL;
    1776                 :          0 :         if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
    1777                 :          0 :             sqe->poll_events)
    1778                 :            :                 return -EINVAL;
    1779                 :            : 
    1780                 :            :         spin_lock_irq(&ctx->completion_lock);
    1781                 :          0 :         list_for_each_entry_safe(poll_req, next, &ctx->cancel_list, list) {
    1782                 :          0 :                 if (READ_ONCE(sqe->addr) == poll_req->user_data) {
    1783                 :          0 :                         io_poll_remove_one(poll_req);
    1784                 :            :                         ret = 0;
    1785                 :          0 :                         break;
    1786                 :            :                 }
    1787                 :            :         }
    1788                 :            :         spin_unlock_irq(&ctx->completion_lock);
    1789                 :            : 
    1790                 :          0 :         io_cqring_add_event(req->ctx, sqe->user_data, ret);
    1791                 :          0 :         io_put_req(req);
    1792                 :          0 :         return 0;
    1793                 :            : }
    1794                 :            : 
    1795                 :          0 : static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req,
    1796                 :            :                              __poll_t mask)
    1797                 :            : {
    1798                 :          0 :         req->poll.done = true;
    1799                 :          0 :         io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask));
    1800                 :          0 :         io_commit_cqring(ctx);
    1801                 :          0 : }
    1802                 :            : 
    1803                 :          0 : static void io_poll_complete_work(struct work_struct *work)
    1804                 :            : {
    1805                 :          0 :         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
    1806                 :            :         struct io_poll_iocb *poll = &req->poll;
    1807                 :          0 :         struct poll_table_struct pt = { ._key = poll->events };
    1808                 :          0 :         struct io_ring_ctx *ctx = req->ctx;
    1809                 :            :         const struct cred *old_cred;
    1810                 :            :         __poll_t mask = 0;
    1811                 :            : 
    1812                 :          0 :         old_cred = override_creds(ctx->creds);
    1813                 :            : 
    1814                 :          0 :         if (!READ_ONCE(poll->canceled))
    1815                 :          0 :                 mask = vfs_poll(poll->file, &pt) & poll->events;
    1816                 :            : 
    1817                 :            :         /*
    1818                 :            :          * Note that ->ki_cancel callers also delete iocb from active_reqs after
    1819                 :            :          * calling ->ki_cancel.  We need the ctx_lock roundtrip here to
    1820                 :            :          * synchronize with them.  In the cancellation case the list_del_init
    1821                 :            :          * itself is not actually needed, but harmless so we keep it in to
    1822                 :            :          * avoid further branches in the fast path.
    1823                 :            :          */
    1824                 :            :         spin_lock_irq(&ctx->completion_lock);
    1825                 :          0 :         if (!mask && !READ_ONCE(poll->canceled)) {
    1826                 :          0 :                 add_wait_queue(poll->head, &poll->wait);
    1827                 :            :                 spin_unlock_irq(&ctx->completion_lock);
    1828                 :            :                 goto out;
    1829                 :            :         }
    1830                 :          0 :         list_del_init(&req->list);
    1831                 :          0 :         io_poll_complete(ctx, req, mask);
    1832                 :            :         spin_unlock_irq(&ctx->completion_lock);
    1833                 :            : 
    1834                 :          0 :         io_cqring_ev_posted(ctx);
    1835                 :          0 :         io_put_req(req);
    1836                 :            : out:
    1837                 :          0 :         revert_creds(old_cred);
    1838                 :          0 : }
    1839                 :            : 
    1840                 :          0 : static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
    1841                 :            :                         void *key)
    1842                 :            : {
    1843                 :          0 :         struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
    1844                 :            :                                                         wait);
    1845                 :            :         struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
    1846                 :          0 :         struct io_ring_ctx *ctx = req->ctx;
    1847                 :          0 :         __poll_t mask = key_to_poll(key);
    1848                 :            :         unsigned long flags;
    1849                 :            : 
    1850                 :            :         /* for instances that support it check for an event match first: */
    1851                 :          0 :         if (mask && !(mask & poll->events))
    1852                 :            :                 return 0;
    1853                 :            : 
    1854                 :          0 :         list_del_init(&poll->wait.entry);
    1855                 :            : 
    1856                 :          0 :         if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
    1857                 :            :                 list_del(&req->list);
    1858                 :          0 :                 io_poll_complete(ctx, req, mask);
    1859                 :            :                 spin_unlock_irqrestore(&ctx->completion_lock, flags);
    1860                 :            : 
    1861                 :          0 :                 io_cqring_ev_posted(ctx);
    1862                 :          0 :                 io_put_req(req);
    1863                 :            :         } else {
    1864                 :          0 :                 io_queue_async_work(ctx, req);
    1865                 :            :         }
    1866                 :            : 
    1867                 :            :         return 1;
    1868                 :            : }
    1869                 :            : 
    1870                 :            : struct io_poll_table {
    1871                 :            :         struct poll_table_struct pt;
    1872                 :            :         struct io_kiocb *req;
    1873                 :            :         int error;
    1874                 :            : };
    1875                 :            : 
    1876                 :          0 : static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
    1877                 :            :                                struct poll_table_struct *p)
    1878                 :            : {
    1879                 :            :         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
    1880                 :            : 
    1881                 :          0 :         if (unlikely(pt->req->poll.head)) {
    1882                 :          0 :                 pt->error = -EINVAL;
    1883                 :          0 :                 return;
    1884                 :            :         }
    1885                 :            : 
    1886                 :          0 :         pt->error = 0;
    1887                 :          0 :         pt->req->poll.head = head;
    1888                 :          0 :         add_wait_queue(head, &pt->req->poll.wait);
    1889                 :            : }
    1890                 :            : 
    1891                 :          0 : static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
    1892                 :            : {
    1893                 :            :         struct io_poll_iocb *poll = &req->poll;
    1894                 :          0 :         struct io_ring_ctx *ctx = req->ctx;
    1895                 :            :         struct io_poll_table ipt;
    1896                 :            :         bool cancel = false;
    1897                 :            :         __poll_t mask;
    1898                 :            :         u16 events;
    1899                 :            : 
    1900                 :          0 :         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
    1901                 :            :                 return -EINVAL;
    1902                 :          0 :         if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
    1903                 :            :                 return -EINVAL;
    1904                 :          0 :         if (!poll->file)
    1905                 :            :                 return -EBADF;
    1906                 :            : 
    1907                 :          0 :         req->submit.sqe = NULL;
    1908                 :          0 :         INIT_WORK(&req->work, io_poll_complete_work);
    1909                 :            :         events = READ_ONCE(sqe->poll_events);
    1910                 :          0 :         poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
    1911                 :            : 
    1912                 :          0 :         poll->head = NULL;
    1913                 :          0 :         poll->done = false;
    1914                 :          0 :         poll->canceled = false;
    1915                 :            : 
    1916                 :          0 :         ipt.pt._qproc = io_poll_queue_proc;
    1917                 :          0 :         ipt.pt._key = poll->events;
    1918                 :          0 :         ipt.req = req;
    1919                 :          0 :         ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
    1920                 :            : 
    1921                 :            :         /* initialized the list so that we can do list_empty checks */
    1922                 :          0 :         INIT_LIST_HEAD(&poll->wait.entry);
    1923                 :            :         init_waitqueue_func_entry(&poll->wait, io_poll_wake);
    1924                 :            : 
    1925                 :          0 :         INIT_LIST_HEAD(&req->list);
    1926                 :            : 
    1927                 :          0 :         mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
    1928                 :            : 
    1929                 :            :         spin_lock_irq(&ctx->completion_lock);
    1930                 :          0 :         if (likely(poll->head)) {
    1931                 :            :                 spin_lock(&poll->head->lock);
    1932                 :          0 :                 if (unlikely(list_empty(&poll->wait.entry))) {
    1933                 :          0 :                         if (ipt.error)
    1934                 :            :                                 cancel = true;
    1935                 :          0 :                         ipt.error = 0;
    1936                 :            :                         mask = 0;
    1937                 :            :                 }
    1938                 :          0 :                 if (mask || ipt.error)
    1939                 :            :                         list_del_init(&poll->wait.entry);
    1940                 :          0 :                 else if (cancel)
    1941                 :            :                         WRITE_ONCE(poll->canceled, true);
    1942                 :          0 :                 else if (!poll->done) /* actually waiting for an event */
    1943                 :          0 :                         list_add_tail(&req->list, &ctx->cancel_list);
    1944                 :          0 :                 spin_unlock(&poll->head->lock);
    1945                 :            :         }
    1946                 :          0 :         if (mask) { /* no async, we'd stolen it */
    1947                 :          0 :                 ipt.error = 0;
    1948                 :          0 :                 io_poll_complete(ctx, req, mask);
    1949                 :            :         }
    1950                 :            :         spin_unlock_irq(&ctx->completion_lock);
    1951                 :            : 
    1952                 :          0 :         if (mask) {
    1953                 :          0 :                 io_cqring_ev_posted(ctx);
    1954                 :          0 :                 io_put_req(req);
    1955                 :            :         }
    1956                 :          0 :         return ipt.error;
    1957                 :            : }
    1958                 :            : 
    1959                 :          0 : static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
    1960                 :            : {
    1961                 :            :         struct io_ring_ctx *ctx;
    1962                 :            :         struct io_kiocb *req, *prev;
    1963                 :            :         unsigned long flags;
    1964                 :            : 
    1965                 :          0 :         req = container_of(timer, struct io_kiocb, timeout.timer);
    1966                 :          0 :         ctx = req->ctx;
    1967                 :          0 :         atomic_inc(&ctx->cq_timeouts);
    1968                 :            : 
    1969                 :          0 :         spin_lock_irqsave(&ctx->completion_lock, flags);
    1970                 :            :         /*
    1971                 :            :          * Adjust the reqs sequence before the current one because it
    1972                 :            :          * will consume a slot in the cq_ring and the the cq_tail pointer
    1973                 :            :          * will be increased, otherwise other timeout reqs may return in
    1974                 :            :          * advance without waiting for enough wait_nr.
    1975                 :            :          */
    1976                 :            :         prev = req;
    1977                 :          0 :         list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list)
    1978                 :          0 :                 prev->sequence++;
    1979                 :            :         list_del(&req->list);
    1980                 :            : 
    1981                 :          0 :         io_cqring_fill_event(ctx, req->user_data, -ETIME);
    1982                 :          0 :         io_commit_cqring(ctx);
    1983                 :            :         spin_unlock_irqrestore(&ctx->completion_lock, flags);
    1984                 :            : 
    1985                 :          0 :         io_cqring_ev_posted(ctx);
    1986                 :            : 
    1987                 :          0 :         io_put_req(req);
    1988                 :          0 :         return HRTIMER_NORESTART;
    1989                 :            : }
    1990                 :            : 
    1991                 :          0 : static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
    1992                 :            : {
    1993                 :            :         unsigned count;
    1994                 :          0 :         struct io_ring_ctx *ctx = req->ctx;
    1995                 :            :         struct list_head *entry;
    1996                 :            :         struct timespec64 ts;
    1997                 :            :         unsigned span = 0;
    1998                 :            : 
    1999                 :          0 :         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
    2000                 :            :                 return -EINVAL;
    2001                 :          0 :         if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->timeout_flags ||
    2002                 :          0 :             sqe->len != 1)
    2003                 :            :                 return -EINVAL;
    2004                 :            : 
    2005                 :          0 :         if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr)))
    2006                 :            :                 return -EFAULT;
    2007                 :            : 
    2008                 :          0 :         req->flags |= REQ_F_TIMEOUT;
    2009                 :            : 
    2010                 :            :         /*
    2011                 :            :          * sqe->off holds how many events that need to occur for this
    2012                 :            :          * timeout event to be satisfied. If it isn't set, then this is
    2013                 :            :          * a pure timeout request, sequence isn't used.
    2014                 :            :          */
    2015                 :          0 :         count = READ_ONCE(sqe->off);
    2016                 :          0 :         if (!count) {
    2017                 :          0 :                 req->flags |= REQ_F_TIMEOUT_NOSEQ;
    2018                 :            :                 spin_lock_irq(&ctx->completion_lock);
    2019                 :          0 :                 entry = ctx->timeout_list.prev;
    2020                 :          0 :                 goto add;
    2021                 :            :         }
    2022                 :            : 
    2023                 :          0 :         req->sequence = ctx->cached_sq_head + count - 1;
    2024                 :            :         /* reuse it to store the count */
    2025                 :          0 :         req->submit.sequence = count;
    2026                 :            : 
    2027                 :            :         /*
    2028                 :            :          * Insertion sort, ensuring the first entry in the list is always
    2029                 :            :          * the one we need first.
    2030                 :            :          */
    2031                 :            :         spin_lock_irq(&ctx->completion_lock);
    2032                 :          0 :         list_for_each_prev(entry, &ctx->timeout_list) {
    2033                 :            :                 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
    2034                 :            :                 unsigned nxt_sq_head;
    2035                 :            :                 long long tmp, tmp_nxt;
    2036                 :            : 
    2037                 :          0 :                 if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
    2038                 :          0 :                         continue;
    2039                 :            : 
    2040                 :            :                 /*
    2041                 :            :                  * Since cached_sq_head + count - 1 can overflow, use type long
    2042                 :            :                  * long to store it.
    2043                 :            :                  */
    2044                 :          0 :                 tmp = (long long)ctx->cached_sq_head + count - 1;
    2045                 :          0 :                 nxt_sq_head = nxt->sequence - nxt->submit.sequence + 1;
    2046                 :          0 :                 tmp_nxt = (long long)nxt_sq_head + nxt->submit.sequence - 1;
    2047                 :            : 
    2048                 :            :                 /*
    2049                 :            :                  * cached_sq_head may overflow, and it will never overflow twice
    2050                 :            :                  * once there is some timeout req still be valid.
    2051                 :            :                  */
    2052                 :          0 :                 if (ctx->cached_sq_head < nxt_sq_head)
    2053                 :          0 :                         tmp += UINT_MAX;
    2054                 :            : 
    2055                 :          0 :                 if (tmp > tmp_nxt)
    2056                 :            :                         break;
    2057                 :            : 
    2058                 :            :                 /*
    2059                 :            :                  * Sequence of reqs after the insert one and itself should
    2060                 :            :                  * be adjusted because each timeout req consumes a slot.
    2061                 :            :                  */
    2062                 :          0 :                 span++;
    2063                 :          0 :                 nxt->sequence++;
    2064                 :            :         }
    2065                 :          0 :         req->sequence -= span;
    2066                 :            : add:
    2067                 :          0 :         list_add(&req->list, entry);
    2068                 :            :         spin_unlock_irq(&ctx->completion_lock);
    2069                 :            : 
    2070                 :          0 :         hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
    2071                 :          0 :         req->timeout.timer.function = io_timeout_fn;
    2072                 :            :         hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts),
    2073                 :            :                         HRTIMER_MODE_REL);
    2074                 :          0 :         return 0;
    2075                 :            : }
    2076                 :            : 
    2077                 :          0 : static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req,
    2078                 :            :                         struct sqe_submit *s)
    2079                 :            : {
    2080                 :            :         struct io_uring_sqe *sqe_copy;
    2081                 :            : 
    2082                 :          0 :         if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list))
    2083                 :            :                 return 0;
    2084                 :            : 
    2085                 :            :         sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
    2086                 :          0 :         if (!sqe_copy)
    2087                 :            :                 return -EAGAIN;
    2088                 :            : 
    2089                 :            :         spin_lock_irq(&ctx->completion_lock);
    2090                 :          0 :         if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list)) {
    2091                 :            :                 spin_unlock_irq(&ctx->completion_lock);
    2092                 :          0 :                 kfree(sqe_copy);
    2093                 :          0 :                 return 0;
    2094                 :            :         }
    2095                 :            : 
    2096                 :          0 :         memcpy(&req->submit, s, sizeof(*s));
    2097                 :          0 :         memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy));
    2098                 :          0 :         req->submit.sqe = sqe_copy;
    2099                 :            : 
    2100                 :          0 :         INIT_WORK(&req->work, io_sq_wq_submit_work);
    2101                 :          0 :         list_add_tail(&req->list, &ctx->defer_list);
    2102                 :            :         spin_unlock_irq(&ctx->completion_lock);
    2103                 :          0 :         return -EIOCBQUEUED;
    2104                 :            : }
    2105                 :            : 
    2106                 :          0 : static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
    2107                 :            :                            const struct sqe_submit *s, bool force_nonblock)
    2108                 :            : {
    2109                 :            :         int ret;
    2110                 :            : 
    2111                 :          0 :         req->user_data = READ_ONCE(s->sqe->user_data);
    2112                 :            : 
    2113                 :          0 :         if (unlikely(s->index >= ctx->sq_entries))
    2114                 :            :                 return -EINVAL;
    2115                 :            : 
    2116                 :          0 :         switch (req->submit.opcode) {
    2117                 :            :         case IORING_OP_NOP:
    2118                 :          0 :                 ret = io_nop(req, req->user_data);
    2119                 :          0 :                 break;
    2120                 :            :         case IORING_OP_READV:
    2121                 :          0 :                 if (unlikely(s->sqe->buf_index))
    2122                 :            :                         return -EINVAL;
    2123                 :          0 :                 ret = io_read(req, s, force_nonblock);
    2124                 :          0 :                 break;
    2125                 :            :         case IORING_OP_WRITEV:
    2126                 :          0 :                 if (unlikely(s->sqe->buf_index))
    2127                 :            :                         return -EINVAL;
    2128                 :          0 :                 ret = io_write(req, s, force_nonblock);
    2129                 :          0 :                 break;
    2130                 :            :         case IORING_OP_READ_FIXED:
    2131                 :          0 :                 ret = io_read(req, s, force_nonblock);
    2132                 :          0 :                 break;
    2133                 :            :         case IORING_OP_WRITE_FIXED:
    2134                 :          0 :                 ret = io_write(req, s, force_nonblock);
    2135                 :          0 :                 break;
    2136                 :            :         case IORING_OP_FSYNC:
    2137                 :          0 :                 ret = io_fsync(req, s->sqe, force_nonblock);
    2138                 :          0 :                 break;
    2139                 :            :         case IORING_OP_POLL_ADD:
    2140                 :          0 :                 ret = io_poll_add(req, s->sqe);
    2141                 :          0 :                 break;
    2142                 :            :         case IORING_OP_POLL_REMOVE:
    2143                 :          0 :                 ret = io_poll_remove(req, s->sqe);
    2144                 :          0 :                 break;
    2145                 :            :         case IORING_OP_SYNC_FILE_RANGE:
    2146                 :          0 :                 ret = io_sync_file_range(req, s->sqe, force_nonblock);
    2147                 :          0 :                 break;
    2148                 :            :         case IORING_OP_SENDMSG:
    2149                 :          0 :                 ret = io_sendmsg(req, s->sqe, force_nonblock);
    2150                 :          0 :                 break;
    2151                 :            :         case IORING_OP_RECVMSG:
    2152                 :          0 :                 ret = io_recvmsg(req, s->sqe, force_nonblock);
    2153                 :          0 :                 break;
    2154                 :            :         case IORING_OP_TIMEOUT:
    2155                 :          0 :                 ret = io_timeout(req, s->sqe);
    2156                 :          0 :                 break;
    2157                 :            :         default:
    2158                 :            :                 ret = -EINVAL;
    2159                 :            :                 break;
    2160                 :            :         }
    2161                 :            : 
    2162                 :          0 :         if (ret)
    2163                 :            :                 return ret;
    2164                 :            : 
    2165                 :          0 :         if (ctx->flags & IORING_SETUP_IOPOLL) {
    2166                 :          0 :                 if (req->result == -EAGAIN)
    2167                 :            :                         return -EAGAIN;
    2168                 :            : 
    2169                 :            :                 /* workqueue context doesn't hold uring_lock, grab it now */
    2170                 :          0 :                 if (s->needs_lock)
    2171                 :          0 :                         mutex_lock(&ctx->uring_lock);
    2172                 :          0 :                 io_iopoll_req_issued(req);
    2173                 :          0 :                 if (s->needs_lock)
    2174                 :          0 :                         mutex_unlock(&ctx->uring_lock);
    2175                 :            :         }
    2176                 :            : 
    2177                 :            :         return 0;
    2178                 :            : }
    2179                 :            : 
    2180                 :            : static struct async_list *io_async_list_from_req(struct io_ring_ctx *ctx,
    2181                 :            :                                                  struct io_kiocb *req)
    2182                 :            : {
    2183                 :          0 :         switch (req->submit.opcode) {
    2184                 :            :         case IORING_OP_READV:
    2185                 :            :         case IORING_OP_READ_FIXED:
    2186                 :          0 :                 return &ctx->pending_async[READ];
    2187                 :            :         case IORING_OP_WRITEV:
    2188                 :            :         case IORING_OP_WRITE_FIXED:
    2189                 :          0 :                 return &ctx->pending_async[WRITE];
    2190                 :            :         default:
    2191                 :            :                 return NULL;
    2192                 :            :         }
    2193                 :            : }
    2194                 :            : 
    2195                 :            : static inline bool io_req_needs_user(struct io_kiocb *req)
    2196                 :            : {
    2197                 :          0 :         return !(req->submit.opcode == IORING_OP_READ_FIXED ||
    2198                 :            :                 req->submit.opcode == IORING_OP_WRITE_FIXED);
    2199                 :            : }
    2200                 :            : 
    2201                 :          0 : static void io_sq_wq_submit_work(struct work_struct *work)
    2202                 :            : {
    2203                 :          0 :         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
    2204                 :          0 :         struct fs_struct *old_fs_struct = current->fs;
    2205                 :          0 :         struct io_ring_ctx *ctx = req->ctx;
    2206                 :            :         struct mm_struct *cur_mm = NULL;
    2207                 :            :         struct async_list *async_list;
    2208                 :            :         const struct cred *old_cred;
    2209                 :          0 :         LIST_HEAD(req_list);
    2210                 :            :         mm_segment_t old_fs;
    2211                 :            :         int ret;
    2212                 :            : 
    2213                 :          0 :         old_cred = override_creds(ctx->creds);
    2214                 :            :         async_list = io_async_list_from_req(ctx, req);
    2215                 :            : 
    2216                 :            :         allow_kernel_signal(SIGINT);
    2217                 :            : restart:
    2218                 :            :         do {
    2219                 :          0 :                 struct sqe_submit *s = &req->submit;
    2220                 :          0 :                 const struct io_uring_sqe *sqe = s->sqe;
    2221                 :          0 :                 unsigned int flags = req->flags;
    2222                 :            : 
    2223                 :            :                 /* Ensure we clear previously set non-block flag */
    2224                 :          0 :                 req->rw.ki_flags &= ~IOCB_NOWAIT;
    2225                 :            : 
    2226                 :          0 :                 if (req->fs != current->fs && current->fs != old_fs_struct) {
    2227                 :            :                         task_lock(current);
    2228                 :          0 :                         if (req->fs)
    2229                 :          0 :                                 current->fs = req->fs;
    2230                 :            :                         else
    2231                 :          0 :                                 current->fs = old_fs_struct;
    2232                 :          0 :                         task_unlock(current);
    2233                 :            :                 }
    2234                 :            : 
    2235                 :            :                 ret = 0;
    2236                 :          0 :                 if (io_req_needs_user(req) && !cur_mm) {
    2237                 :          0 :                         if (!mmget_not_zero(ctx->sqo_mm)) {
    2238                 :            :                                 ret = -EFAULT;
    2239                 :            :                                 goto end_req;
    2240                 :            :                         } else {
    2241                 :          0 :                                 cur_mm = ctx->sqo_mm;
    2242                 :          0 :                                 use_mm(cur_mm);
    2243                 :          0 :                                 old_fs = get_fs();
    2244                 :            :                                 set_fs(USER_DS);
    2245                 :            :                         }
    2246                 :            :                 }
    2247                 :            : 
    2248                 :            :                 if (!ret) {
    2249                 :          0 :                         req->work_task = current;
    2250                 :          0 :                         if (req->flags & REQ_F_CANCEL) {
    2251                 :            :                                 ret = -ECANCELED;
    2252                 :            :                                 goto end_req;
    2253                 :            :                         }
    2254                 :            : 
    2255                 :          0 :                         s->has_user = cur_mm != NULL;
    2256                 :          0 :                         s->needs_lock = true;
    2257                 :            :                         do {
    2258                 :          0 :                                 ret = __io_submit_sqe(ctx, req, s, false);
    2259                 :            :                                 /*
    2260                 :            :                                  * We can get EAGAIN for polled IO even though
    2261                 :            :                                  * we're forcing a sync submission from here,
    2262                 :            :                                  * since we can't wait for request slots on the
    2263                 :            :                                  * block side.
    2264                 :            :                                  */
    2265                 :          0 :                                 if (ret != -EAGAIN)
    2266                 :            :                                         break;
    2267                 :          0 :                                 cond_resched();
    2268                 :          0 :                         } while (1);
    2269                 :            : end_req:
    2270                 :          0 :                         if (!list_empty(&req->task_list)) {
    2271                 :            :                                 spin_lock_irq(&ctx->task_lock);
    2272                 :            :                                 list_del_init(&req->task_list);
    2273                 :            :                                 spin_unlock_irq(&ctx->task_lock);
    2274                 :            :                         }
    2275                 :            :                 }
    2276                 :            : 
    2277                 :            :                 /* drop submission reference */
    2278                 :          0 :                 io_put_req(req);
    2279                 :            : 
    2280                 :          0 :                 if (ret) {
    2281                 :          0 :                         io_cqring_add_event(ctx, sqe->user_data, ret);
    2282                 :          0 :                         io_put_req(req);
    2283                 :            :                 }
    2284                 :            : 
    2285                 :            :                 /* async context always use a copy of the sqe */
    2286                 :          0 :                 kfree(sqe);
    2287                 :            : 
    2288                 :            :                 /* req from defer and link list needn't decrease async cnt */
    2289                 :          0 :                 if (flags & (REQ_F_IO_DRAINED | REQ_F_LINK_DONE))
    2290                 :            :                         goto out;
    2291                 :            : 
    2292                 :          0 :                 if (!async_list)
    2293                 :            :                         break;
    2294                 :          0 :                 if (!list_empty(&req_list)) {
    2295                 :          0 :                         req = list_first_entry(&req_list, struct io_kiocb,
    2296                 :            :                                                 list);
    2297                 :            :                         list_del(&req->list);
    2298                 :          0 :                         continue;
    2299                 :            :                 }
    2300                 :          0 :                 if (list_empty(&async_list->list))
    2301                 :            :                         break;
    2302                 :            : 
    2303                 :            :                 req = NULL;
    2304                 :            :                 spin_lock(&async_list->lock);
    2305                 :          0 :                 if (list_empty(&async_list->list)) {
    2306                 :            :                         spin_unlock(&async_list->lock);
    2307                 :            :                         break;
    2308                 :            :                 }
    2309                 :            :                 list_splice_init(&async_list->list, &req_list);
    2310                 :            :                 spin_unlock(&async_list->lock);
    2311                 :            : 
    2312                 :          0 :                 req = list_first_entry(&req_list, struct io_kiocb, list);
    2313                 :            :                 list_del(&req->list);
    2314                 :          0 :         } while (req);
    2315                 :            : 
    2316                 :            :         /*
    2317                 :            :          * Rare case of racing with a submitter. If we find the count has
    2318                 :            :          * dropped to zero AND we have pending work items, then restart
    2319                 :            :          * the processing. This is a tiny race window.
    2320                 :            :          */
    2321                 :          0 :         if (async_list) {
    2322                 :          0 :                 ret = atomic_dec_return(&async_list->cnt);
    2323                 :          0 :                 while (!ret && !list_empty(&async_list->list)) {
    2324                 :            :                         spin_lock(&async_list->lock);
    2325                 :            :                         atomic_inc(&async_list->cnt);
    2326                 :            :                         list_splice_init(&async_list->list, &req_list);
    2327                 :            :                         spin_unlock(&async_list->lock);
    2328                 :            : 
    2329                 :          0 :                         if (!list_empty(&req_list)) {
    2330                 :          0 :                                 req = list_first_entry(&req_list,
    2331                 :            :                                                         struct io_kiocb, list);
    2332                 :            :                                 list_del(&req->list);
    2333                 :            :                                 goto restart;
    2334                 :            :                         }
    2335                 :          0 :                         ret = atomic_dec_return(&async_list->cnt);
    2336                 :            :                 }
    2337                 :            :         }
    2338                 :            : 
    2339                 :            : out:
    2340                 :            :         disallow_signal(SIGINT);
    2341                 :          0 :         if (cur_mm) {
    2342                 :            :                 set_fs(old_fs);
    2343                 :          0 :                 unuse_mm(cur_mm);
    2344                 :          0 :                 mmput(cur_mm);
    2345                 :            :         }
    2346                 :          0 :         revert_creds(old_cred);
    2347                 :          0 :         if (old_fs_struct) {
    2348                 :          0 :                 task_lock(current);
    2349                 :          0 :                 current->fs = old_fs_struct;
    2350                 :          0 :                 task_unlock(current);
    2351                 :            :         }
    2352                 :          0 : }
    2353                 :            : 
    2354                 :            : /*
    2355                 :            :  * See if we can piggy back onto previously submitted work, that is still
    2356                 :            :  * running. We currently only allow this if the new request is sequential
    2357                 :            :  * to the previous one we punted.
    2358                 :            :  */
    2359                 :          0 : static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
    2360                 :            : {
    2361                 :            :         bool ret;
    2362                 :            : 
    2363                 :          0 :         if (!list)
    2364                 :            :                 return false;
    2365                 :          0 :         if (!(req->flags & REQ_F_SEQ_PREV))
    2366                 :            :                 return false;
    2367                 :          0 :         if (!atomic_read(&list->cnt))
    2368                 :            :                 return false;
    2369                 :            : 
    2370                 :            :         ret = true;
    2371                 :            :         spin_lock(&list->lock);
    2372                 :          0 :         list_add_tail(&req->list, &list->list);
    2373                 :            :         /*
    2374                 :            :          * Ensure we see a simultaneous modification from io_sq_wq_submit_work()
    2375                 :            :          */
    2376                 :          0 :         smp_mb();
    2377                 :          0 :         if (!atomic_read(&list->cnt)) {
    2378                 :            :                 list_del_init(&req->list);
    2379                 :            :                 ret = false;
    2380                 :            :         }
    2381                 :            :         spin_unlock(&list->lock);
    2382                 :          0 :         return ret;
    2383                 :            : }
    2384                 :            : 
    2385                 :            : static bool io_op_needs_file(struct io_kiocb *req)
    2386                 :            : {
    2387                 :          0 :         switch (req->submit.opcode) {
    2388                 :            :         case IORING_OP_NOP:
    2389                 :            :         case IORING_OP_POLL_REMOVE:
    2390                 :            :         case IORING_OP_TIMEOUT:
    2391                 :            :                 return false;
    2392                 :            :         default:
    2393                 :            :                 return true;
    2394                 :            :         }
    2395                 :            : }
    2396                 :            : 
    2397                 :          0 : static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
    2398                 :            :                            struct io_submit_state *state, struct io_kiocb *req)
    2399                 :            : {
    2400                 :            :         unsigned flags;
    2401                 :            :         int fd;
    2402                 :            : 
    2403                 :          0 :         flags = READ_ONCE(s->sqe->flags);
    2404                 :          0 :         fd = READ_ONCE(s->sqe->fd);
    2405                 :            : 
    2406                 :          0 :         if (flags & IOSQE_IO_DRAIN)
    2407                 :          0 :                 req->flags |= REQ_F_IO_DRAIN;
    2408                 :            :         /*
    2409                 :            :          * All IO needs to record the previous position; in the LINK vs
    2410                 :            :          * DRAIN case, it can be used to mark the position of the first IO
    2411                 :            :          * in the link list.
    2412                 :            :          */
    2413                 :          0 :         req->sequence = s->sequence;
    2414                 :            : 
    2415                 :          0 :         if (!io_op_needs_file(req))
    2416                 :            :                 return 0;
    2417                 :            : 
    2418                 :          0 :         if (flags & IOSQE_FIXED_FILE) {
    2419                 :          0 :                 if (unlikely(!ctx->user_files ||
    2420                 :            :                     (unsigned) fd >= ctx->nr_user_files))
    2421                 :            :                         return -EBADF;
    2422                 :          0 :                 req->file = ctx->user_files[fd];
    2423                 :          0 :                 req->flags |= REQ_F_FIXED_FILE;
    2424                 :            :         } else {
    2425                 :          0 :                 if (s->needs_fixed_file)
    2426                 :            :                         return -EBADF;
    2427                 :          0 :                 req->file = io_file_get(state, fd);
    2428                 :          0 :                 if (unlikely(!req->file))
    2429                 :            :                         return -EBADF;
    2430                 :            :         }
    2431                 :            : 
    2432                 :            :         return 0;
    2433                 :            : }
    2434                 :            : 
    2435                 :          0 : static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
    2436                 :            :                         struct sqe_submit *s)
    2437                 :            : {
    2438                 :            :         int ret;
    2439                 :            : 
    2440                 :          0 :         ret = __io_submit_sqe(ctx, req, s, true);
    2441                 :            : 
    2442                 :            :         /*
    2443                 :            :          * We async punt it if the file wasn't marked NOWAIT, or if the file
    2444                 :            :          * doesn't support non-blocking read/write attempts
    2445                 :            :          */
    2446                 :          0 :         if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
    2447                 :            :             (req->flags & REQ_F_MUST_PUNT))) {
    2448                 :            :                 struct io_uring_sqe *sqe_copy;
    2449                 :            : 
    2450                 :          0 :                 sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
    2451                 :          0 :                 if (sqe_copy) {
    2452                 :            :                         struct async_list *list;
    2453                 :            : 
    2454                 :          0 :                         s->sqe = sqe_copy;
    2455                 :          0 :                         memcpy(&req->submit, s, sizeof(*s));
    2456                 :            :                         list = io_async_list_from_req(ctx, req);
    2457                 :          0 :                         if (!io_add_to_prev_work(list, req)) {
    2458                 :          0 :                                 if (list)
    2459                 :          0 :                                         atomic_inc(&list->cnt);
    2460                 :          0 :                                 INIT_WORK(&req->work, io_sq_wq_submit_work);
    2461                 :          0 :                                 io_queue_async_work(ctx, req);
    2462                 :            :                         }
    2463                 :            : 
    2464                 :            :                         /*
    2465                 :            :                          * Queued up for async execution, worker will release
    2466                 :            :                          * submit reference when the iocb is actually submitted.
    2467                 :            :                          */
    2468                 :            :                         return 0;
    2469                 :            :                 }
    2470                 :            :         }
    2471                 :            : 
    2472                 :            :         /* drop submission reference */
    2473                 :          0 :         io_put_req(req);
    2474                 :            : 
    2475                 :            :         /* and drop final reference, if we failed */
    2476                 :          0 :         if (ret) {
    2477                 :          0 :                 io_cqring_add_event(ctx, req->user_data, ret);
    2478                 :          0 :                 if (req->flags & REQ_F_LINK)
    2479                 :          0 :                         req->flags |= REQ_F_FAIL_LINK;
    2480                 :          0 :                 io_put_req(req);
    2481                 :            :         }
    2482                 :            : 
    2483                 :          0 :         return ret;
    2484                 :            : }
    2485                 :            : 
    2486                 :          0 : static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
    2487                 :            :                         struct sqe_submit *s)
    2488                 :            : {
    2489                 :            :         int ret;
    2490                 :            : 
    2491                 :          0 :         ret = io_req_defer(ctx, req, s);
    2492                 :          0 :         if (ret) {
    2493                 :          0 :                 if (ret != -EIOCBQUEUED) {
    2494                 :          0 :                         io_free_req(req);
    2495                 :          0 :                         io_cqring_add_event(ctx, s->sqe->user_data, ret);
    2496                 :            :                 }
    2497                 :            :                 return 0;
    2498                 :            :         }
    2499                 :            : 
    2500                 :          0 :         return __io_queue_sqe(ctx, req, s);
    2501                 :            : }
    2502                 :            : 
    2503                 :          0 : static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req,
    2504                 :            :                               struct sqe_submit *s, struct io_kiocb *shadow)
    2505                 :            : {
    2506                 :            :         int ret;
    2507                 :            :         int need_submit = false;
    2508                 :            : 
    2509                 :          0 :         if (!shadow)
    2510                 :          0 :                 return io_queue_sqe(ctx, req, s);
    2511                 :            : 
    2512                 :            :         /*
    2513                 :            :          * Mark the first IO in link list as DRAIN, let all the following
    2514                 :            :          * IOs enter the defer list. all IO needs to be completed before link
    2515                 :            :          * list.
    2516                 :            :          */
    2517                 :          0 :         req->flags |= REQ_F_IO_DRAIN;
    2518                 :          0 :         ret = io_req_defer(ctx, req, s);
    2519                 :          0 :         if (ret) {
    2520                 :          0 :                 if (ret != -EIOCBQUEUED) {
    2521                 :          0 :                         io_free_req(req);
    2522                 :          0 :                         __io_free_req(shadow);
    2523                 :          0 :                         io_cqring_add_event(ctx, s->sqe->user_data, ret);
    2524                 :          0 :                         return 0;
    2525                 :            :                 }
    2526                 :            :         } else {
    2527                 :            :                 /*
    2528                 :            :                  * If ret == 0 means that all IOs in front of link io are
    2529                 :            :                  * running done. let's queue link head.
    2530                 :            :                  */
    2531                 :            :                 need_submit = true;
    2532                 :            :         }
    2533                 :            : 
    2534                 :            :         /* Insert shadow req to defer_list, blocking next IOs */
    2535                 :            :         spin_lock_irq(&ctx->completion_lock);
    2536                 :          0 :         list_add_tail(&shadow->list, &ctx->defer_list);
    2537                 :            :         spin_unlock_irq(&ctx->completion_lock);
    2538                 :            : 
    2539                 :          0 :         if (need_submit)
    2540                 :          0 :                 return __io_queue_sqe(ctx, req, s);
    2541                 :            : 
    2542                 :            :         return 0;
    2543                 :            : }
    2544                 :            : 
    2545                 :            : #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
    2546                 :            : 
    2547                 :          0 : static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
    2548                 :            :                           struct io_submit_state *state, struct io_kiocb **link)
    2549                 :            : {
    2550                 :            :         struct io_uring_sqe *sqe_copy;
    2551                 :            :         struct io_kiocb *req;
    2552                 :            :         int ret;
    2553                 :            : 
    2554                 :            :         /* enforce forwards compatibility on users */
    2555                 :          0 :         if (unlikely(s->sqe->flags & ~SQE_VALID_FLAGS)) {
    2556                 :            :                 ret = -EINVAL;
    2557                 :            :                 goto err;
    2558                 :            :         }
    2559                 :            : 
    2560                 :          0 :         req = io_get_req(ctx, state);
    2561                 :          0 :         if (unlikely(!req)) {
    2562                 :            :                 ret = -EAGAIN;
    2563                 :            :                 goto err;
    2564                 :            :         }
    2565                 :            : 
    2566                 :          0 :         memcpy(&req->submit, s, sizeof(*s));
    2567                 :          0 :         ret = io_req_set_file(ctx, s, state, req);
    2568                 :          0 :         if (unlikely(ret)) {
    2569                 :            : err_req:
    2570                 :          0 :                 io_free_req(req);
    2571                 :            : err:
    2572                 :          0 :                 io_cqring_add_event(ctx, s->sqe->user_data, ret);
    2573                 :          0 :                 return;
    2574                 :            :         }
    2575                 :            : 
    2576                 :          0 :         req->user_data = s->sqe->user_data;
    2577                 :            : 
    2578                 :            : #if defined(CONFIG_NET)
    2579                 :          0 :         switch (req->submit.opcode) {
    2580                 :            :         case IORING_OP_SENDMSG:
    2581                 :            :         case IORING_OP_RECVMSG:
    2582                 :          0 :                 spin_lock(&current->fs->lock);
    2583                 :          0 :                 if (!current->fs->in_exec) {
    2584                 :          0 :                         req->fs = current->fs;
    2585                 :          0 :                         req->fs->users++;
    2586                 :            :                 }
    2587                 :          0 :                 spin_unlock(&current->fs->lock);
    2588                 :          0 :                 if (!req->fs) {
    2589                 :            :                         ret = -EAGAIN;
    2590                 :            :                         goto err_req;
    2591                 :            :                 }
    2592                 :            :         }
    2593                 :            : #endif
    2594                 :            : 
    2595                 :            :         /*
    2596                 :            :          * If we already have a head request, queue this one for async
    2597                 :            :          * submittal once the head completes. If we don't have a head but
    2598                 :            :          * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
    2599                 :            :          * submitted sync once the chain is complete. If none of those
    2600                 :            :          * conditions are true (normal request), then just queue it.
    2601                 :            :          */
    2602                 :          0 :         if (*link) {
    2603                 :            :                 struct io_kiocb *prev = *link;
    2604                 :            : 
    2605                 :          0 :                 sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
    2606                 :          0 :                 if (!sqe_copy) {
    2607                 :            :                         ret = -EAGAIN;
    2608                 :            :                         goto err_req;
    2609                 :            :                 }
    2610                 :            : 
    2611                 :          0 :                 s->sqe = sqe_copy;
    2612                 :          0 :                 memcpy(&req->submit, s, sizeof(*s));
    2613                 :          0 :                 list_add_tail(&req->list, &prev->link_list);
    2614                 :          0 :         } else if (s->sqe->flags & IOSQE_IO_LINK) {
    2615                 :          0 :                 req->flags |= REQ_F_LINK;
    2616                 :            : 
    2617                 :          0 :                 memcpy(&req->submit, s, sizeof(*s));
    2618                 :          0 :                 INIT_LIST_HEAD(&req->link_list);
    2619                 :          0 :                 *link = req;
    2620                 :            :         } else {
    2621                 :          0 :                 io_queue_sqe(ctx, req, s);
    2622                 :            :         }
    2623                 :            : }
    2624                 :            : 
    2625                 :            : /*
    2626                 :            :  * Batched submission is done, ensure local IO is flushed out.
                                         *
                                         * Finishes the block plug that io_submit_state_start() started on
                                         * @state, calls io_file_put() on @state, and, if state->free_reqs is
                                         * non-zero, returns that many entries starting at
                                         * state->reqs[state->cur_req] to req_cachep with a single bulk free.
    2627                 :            :  */
    2628                 :          0 : static void io_submit_state_end(struct io_submit_state *state)
    2629                 :            : {
    2630                 :          0 :         blk_finish_plug(&state->plug);
    2631                 :          0 :         io_file_put(state);
    2632                 :          0 :         if (state->free_reqs)
    2633                 :          0 :                 kmem_cache_free_bulk(req_cachep, state->free_reqs,
    2634                 :          0 :                                         &state->reqs[state->cur_req]);
    2635                 :          0 : }
    2636                 :            : 
    2637                 :            : /*
    2638                 :            :  * Start submission side cache.
                                         *
                                         * Starts a block plug on @state and resets the per-batch fields:
                                         * free_reqs to 0, file to NULL and ios_left to @max_ios.
                                         * @ctx is accepted but not referenced in this function.
    2639                 :            :  */
    2640                 :            : static void io_submit_state_start(struct io_submit_state *state,
    2641                 :            :                                   struct io_ring_ctx *ctx, unsigned max_ios)
    2642                 :            : {
    2643                 :          0 :         blk_start_plug(&state->plug);
    2644                 :          0 :         state->free_reqs = 0;
    2645                 :          0 :         state->file = NULL;
    2646                 :          0 :         state->ios_left = max_ios;
    2647                 :            : }
    2648                 :            : 
                                        /*
                                         * Publish the kernel-side cached SQ head (ctx->cached_sq_head) to the
                                         * shared ring. The store is skipped when the shared head already holds
                                         * the same value.
                                         */
    2649                 :            : static void io_commit_sqring(struct io_ring_ctx *ctx)
    2650                 :            : {
    2651                 :          0 :         struct io_rings *rings = ctx->rings;
    2652                 :            : 
    2653                 :          0 :         if (ctx->cached_sq_head != READ_ONCE(rings->sq.head)) {
    2654                 :            :                 /*
    2655                 :            :                  * Ensure any loads from the SQEs are done at this point,
    2656                 :            :                  * since once we write the new head, the application could
    2657                 :            :                  * write new data to them.
    2658                 :            :                  */
    2659                 :          0 :                 smp_store_release(&rings->sq.head, ctx->cached_sq_head);
    2660                 :            :         }
    2661                 :            : }
    2662                 :            : 
    2663                 :            : /*
    2664                 :            :  * Fetch an sqe, if one is available. Note that s->sqe will point to memory
    2665                 :            :  * that is mapped by userspace. This means that care needs to be taken to
    2666                 :            :  * ensure that reads are stable, as we cannot rely on userspace always
    2667                 :            :  * being a good citizen. If members of the sqe are validated and then later
    2668                 :            :  * used, it's important that those reads are done through READ_ONCE() to
    2669                 :            :  * prevent a re-load down the line.
                                         *
                                         * Returns true after filling in s->index, s->sqe, s->opcode and
                                         * s->sequence and advancing ctx->cached_sq_head.
                                         *
                                         * Returns false when the cached head equals the SQ tail (nothing to
                                         * fetch), or when the index read from sq_array is not below
                                         * ctx->sq_entries. In the latter case the entry is still consumed: the
                                         * cached head advances and the drop is counted in rings->sq_dropped.
    2670                 :            :  */
    2671                 :          0 : static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
    2672                 :            : {
    2673                 :          0 :         struct io_rings *rings = ctx->rings;
    2674                 :          0 :         u32 *sq_array = ctx->sq_array;
    2675                 :            :         unsigned head;
    2676                 :            : 
    2677                 :            :         /*
    2678                 :            :          * The cached sq head (or cq tail) serves two purposes:
    2679                 :            :          *
    2680                 :            :          * 1) allows us to batch the cost of updating the user visible
    2681                 :            :          *    head updates.
    2682                 :            :          * 2) allows the kernel side to track the head on its own, even
    2683                 :            :          *    though the application is the one updating it.
    2684                 :            :          */
    2685                 :          0 :         head = ctx->cached_sq_head;
    2686                 :            :         /* make sure SQ entry isn't read before tail */
    2687                 :          0 :         if (head == smp_load_acquire(&rings->sq.tail))
    2688                 :            :                 return false;
    2689                 :            : 
                                                /*
                                                 * From here on 'head' is reused to hold the SQE index that
                                                 * userspace stored in the sq_array slot, not a ring position.
                                                 */
    2690                 :          0 :         head = READ_ONCE(sq_array[head & ctx->sq_mask]);
    2691                 :          0 :         if (head < ctx->sq_entries) {
    2692                 :          0 :                 s->index = head;
    2693                 :          0 :                 s->sqe = &ctx->sq_sqes[head];
    2694                 :          0 :                 s->opcode = READ_ONCE(s->sqe->opcode);
    2695                 :          0 :                 s->sequence = ctx->cached_sq_head;
    2696                 :          0 :                 ctx->cached_sq_head++;
    2697                 :          0 :                 return true;
    2698                 :            :         }
    2699                 :            : 
    2700                 :            :         /* drop invalid entries */
    2701                 :          0 :         ctx->cached_sq_head++;
    2702                 :          0 :         ctx->cached_sq_dropped++;
    2703                 :            :         WRITE_ONCE(rings->sq_dropped, ctx->cached_sq_dropped);
    2704                 :          0 :         return false;
    2705                 :            : }
    2706                 :            : 
                                        /*
                                         * Fetch up to @nr entries from the SQ ring and hand each one to
                                         * io_submit_sqe() with needs_lock and needs_fixed_file set to true and
                                         * has_user set to @has_user.
                                         *
                                         * When @mm_fault is true, fetched entries are not submitted; instead
                                         * io_cqring_add_event() is called with -EFAULT for each entry's
                                         * user_data.
                                         *
                                         * Returns the number of entries passed to io_submit_sqe(). The loop
                                         * stops early as soon as io_get_sqring() returns false. This function
                                         * does not call io_commit_sqring(); io_sq_thread() does that after it
                                         * returns.
                                         */
    2707                 :          0 : static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
    2708                 :            :                           bool has_user, bool mm_fault)
    2709                 :            : {
    2710                 :            :         struct io_submit_state state, *statep = NULL;
    2711                 :          0 :         struct io_kiocb *link = NULL;
    2712                 :            :         struct io_kiocb *shadow_req = NULL;
    2713                 :            :         bool prev_was_link = false;
    2714                 :            :         int i, submitted = 0;
    2715                 :            : 
                                                /* Batched submission state is only set up above the threshold. */
    2716                 :          0 :         if (nr > IO_PLUG_THRESHOLD) {
    2717                 :            :                 io_submit_state_start(&state, ctx, nr);
    2718                 :            :                 statep = &state;
    2719                 :            :         }
    2720                 :            : 
    2721                 :          0 :         for (i = 0; i < nr; i++) {
    2722                 :            :                 struct sqe_submit s;
    2723                 :            : 
    2724                 :          0 :                 if (!io_get_sqring(ctx, &s))
    2725                 :            :                         break;
    2726                 :            : 
    2727                 :            :                 /*
    2728                 :            :                  * If previous wasn't linked and we have a linked command,
    2729                 :            :                  * that's the end of the chain. Submit the previous link.
    2730                 :            :                  */
    2731                 :          0 :                 if (!prev_was_link && link) {
    2732                 :          0 :                         io_queue_link_head(ctx, link, &link->submit, shadow_req);
    2733                 :          0 :                         link = NULL;
    2734                 :            :                         shadow_req = NULL;
    2735                 :            :                 }
    2736                 :          0 :                 prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0;
    2737                 :            : 
                                                        /*
                                                         * A drain flag on a request inside an open chain: make sure
                                                         * a shadow request exists and record this entry's sequence
                                                         * in it. NOTE(review): the shadow request presumably stands
                                                         * in for the chain's drain ordering; confirm against
                                                         * io_queue_link_head().
                                                         */
    2738                 :          0 :                 if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) {
    2739                 :          0 :                         if (!shadow_req) {
    2740                 :          0 :                                 shadow_req = io_get_req(ctx, NULL);
                                                                        /* No shadow request available: submit without one. */
    2741                 :          0 :                                 if (unlikely(!shadow_req))
    2742                 :            :                                         goto out;
    2743                 :          0 :                                 shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
    2744                 :          0 :                                 refcount_dec(&shadow_req->refs);
    2745                 :            :                         }
    2746                 :          0 :                         shadow_req->sequence = s.sequence;
    2747                 :            :                 }
    2748                 :            : 
    2749                 :            : out:
    2750                 :          0 :                 if (unlikely(mm_fault)) {
    2751                 :          0 :                         io_cqring_add_event(ctx, s.sqe->user_data,
    2752                 :            :                                                 -EFAULT);
    2753                 :            :                 } else {
    2754                 :          0 :                         s.has_user = has_user;
    2755                 :          0 :                         s.needs_lock = true;
    2756                 :          0 :                         s.needs_fixed_file = true;
    2757                 :          0 :                         io_submit_sqe(ctx, &s, statep, &link);
    2758                 :          0 :                         submitted++;
    2759                 :            :                 }
    2760                 :            :         }
    2761                 :            : 
                                                /* Queue a chain that was still open when the loop ended. */
    2762                 :          0 :         if (link)
    2763                 :          0 :                 io_queue_link_head(ctx, link, &link->submit, shadow_req);
    2764                 :          0 :         if (statep)
    2765                 :          0 :                 io_submit_state_end(&state);
    2766                 :            : 
    2767                 :          0 :         return submitted;
    2768                 :            : }
    2769                 :            : 
                                        /*
                                         * Thread function for the submission-queue polling thread. @data is the
                                         * io_ring_ctx to service.
                                         *
                                         * After signalling ctx->sqo_thread_started, the thread switches to
                                         * USER_DS and to ctx->creds, then loops until kthread_should_park():
                                         *
                                         *  - account for previously submitted IO ('inflight'), polling for
                                         *    completions when the ring uses IORING_SETUP_IOPOLL;
                                         *  - look for new SQ entries; with none pending it drops the borrowed
                                         *    mm, keeps spinning while IO is inflight or the idle timeout
                                         *    (ctx->sq_thread_idle jiffies after inflight last hit zero) has not
                                         *    passed, and otherwise sleeps on ctx->sqo_wait with
                                         *    IORING_SQ_NEED_WAKEUP set in the shared sq_flags;
                                         *  - submit what it found via io_submit_sqes() and commit the SQ head.
                                         *
                                         * On exit it restores the saved address limit, releases the mm if one is
                                         * held, reverts credentials and parks. Always returns 0.
                                         */
    2770                 :          0 : static int io_sq_thread(void *data)
    2771                 :            : {
    2772                 :            :         struct io_ring_ctx *ctx = data;
    2773                 :            :         struct mm_struct *cur_mm = NULL;
    2774                 :            :         const struct cred *old_cred;
    2775                 :            :         mm_segment_t old_fs;
    2776                 :          0 :         DEFINE_WAIT(wait);
    2777                 :            :         unsigned inflight;
    2778                 :            :         unsigned long timeout;
    2779                 :            : 
    2780                 :          0 :         complete(&ctx->sqo_thread_started);
    2781                 :            : 
    2782                 :          0 :         old_fs = get_fs();
    2783                 :            :         set_fs(USER_DS);
    2784                 :          0 :         old_cred = override_creds(ctx->creds);
    2785                 :            : 
    2786                 :            :         timeout = inflight = 0;
    2787                 :          0 :         while (!kthread_should_park()) {
    2788                 :            :                 bool mm_fault = false;
    2789                 :            :                 unsigned int to_submit;
    2790                 :            : 
    2791                 :          0 :                 if (inflight) {
    2792                 :          0 :                         unsigned nr_events = 0;
    2793                 :            : 
    2794                 :          0 :                         if (ctx->flags & IORING_SETUP_IOPOLL) {
    2795                 :            :                                 /*
    2796                 :            :                                  * inflight is the count of the maximum possible
    2797                 :            :                                  * entries we submitted, but it can be smaller
    2798                 :            :                                  * if we dropped some of them. If we don't have
    2799                 :            :                                  * poll entries available, then we know that we
    2800                 :            :                                  * have nothing left to poll for. Reset the
    2801                 :            :                                  * inflight count to zero in that case.
    2802                 :            :                                  */
    2803                 :          0 :                                 mutex_lock(&ctx->uring_lock);
    2804                 :          0 :                                 if (!list_empty(&ctx->poll_list))
    2805                 :          0 :                                         io_iopoll_getevents(ctx, &nr_events, 0);
    2806                 :            :                                 else
    2807                 :            :                                         inflight = 0;
    2808                 :          0 :                                 mutex_unlock(&ctx->uring_lock);
    2809                 :            :                         } else {
    2810                 :            :                                 /*
    2811                 :            :                                  * Normal IO, just pretend everything completed.
    2812                 :            :                                  * We don't have to poll completions for that.
    2813                 :            :                                  */
    2814                 :          0 :                                 nr_events = inflight;
    2815                 :            :                         }
    2816                 :            : 
    2817                 :          0 :                         inflight -= nr_events;
                                                                /* Nothing left inflight: (re)start the idle-spin window. */
    2818                 :          0 :                         if (!inflight)
    2819                 :          0 :                                 timeout = jiffies + ctx->sq_thread_idle;
    2820                 :            :                 }
    2821                 :            : 
    2822                 :            :                 to_submit = io_sqring_entries(ctx);
    2823                 :          0 :                 if (!to_submit) {
    2824                 :            :                         /*
    2825                 :            :                          * Drop cur_mm before scheduling, we can't hold it for
    2826                 :            :                          * long periods (or over schedule()). Do this before
    2827                 :            :                          * adding ourselves to the waitqueue, as the unuse/drop
    2828                 :            :                          * may sleep.
    2829                 :            :                          */
    2830                 :          0 :                         if (cur_mm) {
    2831                 :          0 :                                 unuse_mm(cur_mm);
    2832                 :          0 :                                 mmput(cur_mm);
    2833                 :            :                                 cur_mm = NULL;
    2834                 :            :                         }
    2835                 :            : 
    2836                 :            :                         /*
    2837                 :            :                          * We're polling. If we're within the defined idle
    2838                 :            :                          * period, then let us spin without work before going
    2839                 :            :                          * to sleep.
    2840                 :            :                          */
    2841                 :          0 :                         if (inflight || !time_after(jiffies, timeout)) {
    2842                 :          0 :                                 cond_resched();
    2843                 :          0 :                                 continue;
    2844                 :            :                         }
    2845                 :            : 
    2846                 :          0 :                         prepare_to_wait(&ctx->sqo_wait, &wait,
    2847                 :            :                                                 TASK_INTERRUPTIBLE);
    2848                 :            : 
    2849                 :            :                         /* Tell userspace we may need a wakeup call */
    2850                 :          0 :                         ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
    2851                 :            :                         /* make sure to read SQ tail after writing flags */
    2852                 :          0 :                         smp_mb();
    2853                 :            : 
                                                                /*
                                                                 * Re-check for work after publishing the flag, so entries
                                                                 * queued in the meantime are picked up before sleeping.
                                                                 */
    2854                 :            :                         to_submit = io_sqring_entries(ctx);
    2855                 :          0 :                         if (!to_submit) {
    2856                 :          0 :                                 if (kthread_should_park()) {
    2857                 :          0 :                                         finish_wait(&ctx->sqo_wait, &wait);
    2858                 :          0 :                                         break;
    2859                 :            :                                 }
    2860                 :          0 :                                 if (signal_pending(current))
    2861                 :          0 :                                         flush_signals(current);
    2862                 :          0 :                                 schedule();
    2863                 :          0 :                                 finish_wait(&ctx->sqo_wait, &wait);
    2864                 :            : 
    2865                 :          0 :                                 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
    2866                 :          0 :                                 continue;
    2867                 :            :                         }
    2868                 :          0 :                         finish_wait(&ctx->sqo_wait, &wait);
    2869                 :            : 
    2870                 :          0 :                         ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
    2871                 :            :                 }
    2872                 :            : 
    2873                 :            :                 /* Unless all new commands are FIXED regions, grab mm */
                                                        /*
                                                         * NOTE(review): no FIXED-region check is visible here; the mm
                                                         * is grabbed whenever cur_mm is NULL. If mmget_not_zero()
                                                         * fails, mm_fault is passed on to io_submit_sqes(), which then
                                                         * fails the fetched entries with -EFAULT.
                                                         */
    2874                 :          0 :                 if (!cur_mm) {
    2875                 :          0 :                         mm_fault = !mmget_not_zero(ctx->sqo_mm);
    2876                 :          0 :                         if (!mm_fault) {
    2877                 :          0 :                                 use_mm(ctx->sqo_mm);
    2878                 :          0 :                                 cur_mm = ctx->sqo_mm;
    2879                 :            :                         }
    2880                 :            :                 }
    2881                 :            : 
                                                        /* Never ask for more than the ring size in a single pass. */
    2882                 :          0 :                 to_submit = min(to_submit, ctx->sq_entries);
    2883                 :          0 :                 inflight += io_submit_sqes(ctx, to_submit, cur_mm != NULL,
    2884                 :            :                                            mm_fault);
    2885                 :            : 
    2886                 :            :                 /* Commit SQ ring head once we've consumed all SQEs */
    2887                 :            :                 io_commit_sqring(ctx);
    2888                 :            :         }
    2889                 :            : 
    2890                 :            :         set_fs(old_fs);
    2891                 :          0 :         if (cur_mm) {
    2892                 :          0 :                 unuse_mm(cur_mm);
    2893                 :          0 :                 mmput(cur_mm);
    2894                 :            :         }
    2895                 :          0 :         revert_creds(old_cred);
    2896                 :            : 
    2897                 :          0 :         kthread_parkme();
    2898                 :            : 
    2899                 :          0 :         return 0;
    2900                 :            : }
    2901                 :            : 
                                        /*
                                         * Fetch up to @to_submit entries from the SQ ring and hand each one to
                                         * io_submit_sqe() with has_user set to true and needs_lock and
                                         * needs_fixed_file set to false.
                                         *
                                         * The loop has the same link/shadow-request handling as
                                         * io_submit_sqes(), but has no mm_fault path and commits the SQ head
                                         * itself before returning.
                                         *
                                         * Returns the number of entries fetched and passed to io_submit_sqe().
                                         */
    2902                 :          0 : static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
    2903                 :            : {
    2904                 :            :         struct io_submit_state state, *statep = NULL;
    2905                 :          0 :         struct io_kiocb *link = NULL;
    2906                 :            :         struct io_kiocb *shadow_req = NULL;
    2907                 :            :         bool prev_was_link = false;
    2908                 :            :         int i, submit = 0;
    2909                 :            : 
                                                /* Batched submission state is only set up above the threshold. */
    2910                 :          0 :         if (to_submit > IO_PLUG_THRESHOLD) {
    2911                 :            :                 io_submit_state_start(&state, ctx, to_submit);
    2912                 :            :                 statep = &state;
    2913                 :            :         }
    2914                 :            : 
    2915                 :          0 :         for (i = 0; i < to_submit; i++) {
    2916                 :            :                 struct sqe_submit s;
    2917                 :            : 
    2918                 :          0 :                 if (!io_get_sqring(ctx, &s))
    2919                 :            :                         break;
    2920                 :            : 
    2921                 :            :                 /*
    2922                 :            :                  * If previous wasn't linked and we have a linked command,
    2923                 :            :                  * that's the end of the chain. Submit the previous link.
    2924                 :            :                  */
    2925                 :          0 :                 if (!prev_was_link && link) {
    2926                 :          0 :                         io_queue_link_head(ctx, link, &link->submit, shadow_req);
    2927                 :          0 :                         link = NULL;
    2928                 :            :                         shadow_req = NULL;
    2929                 :            :                 }
    2930                 :          0 :                 prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0;
    2931                 :            : 
                                                        /*
                                                         * A drain flag on a request inside an open chain: make sure
                                                         * a shadow request exists and record this entry's sequence
                                                         * in it.
                                                         */
    2932                 :          0 :                 if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) {
    2933                 :          0 :                         if (!shadow_req) {
    2934                 :          0 :                                 shadow_req = io_get_req(ctx, NULL);
                                                                        /* No shadow request available: submit without one. */
    2935                 :          0 :                                 if (unlikely(!shadow_req))
    2936                 :            :                                         goto out;
    2937                 :          0 :                                 shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
    2938                 :          0 :                                 refcount_dec(&shadow_req->refs);
    2939                 :            :                         }
    2940                 :          0 :                         shadow_req->sequence = s.sequence;
    2941                 :            :                 }
    2942                 :            : 
    2943                 :            : out:
    2944                 :          0 :                 s.has_user = true;
    2945                 :          0 :                 s.needs_lock = false;
    2946                 :          0 :                 s.needs_fixed_file = false;
    2947                 :          0 :                 submit++;
    2948                 :          0 :                 io_submit_sqe(ctx, &s, statep, &link);
    2949                 :            :         }
    2950                 :            : 
                                                /* Queue a chain that was still open when the loop ended. */
    2951                 :          0 :         if (link)
    2952                 :          0 :                 io_queue_link_head(ctx, link, &link->submit, shadow_req);
    2953                 :          0 :         if (statep)
    2954                 :          0 :                 io_submit_state_end(statep);
    2955                 :            : 
    2956                 :            :         io_commit_sqring(ctx);
    2957                 :            : 
    2958                 :          0 :         return submit;
    2959                 :            : }
    2960                 :            : 
                                        /*
                                         * Per-waiter state used by io_cqring_wait(); io_should_wake() evaluates
                                         * the wake-up condition from it.
                                         */
    2961                 :            : struct io_wait_queue {
    2962                 :            :         struct wait_queue_entry wq;     /* entry queued on ctx->wait */
    2963                 :            :         struct io_ring_ctx *ctx;        /* ring being waited on */
    2964                 :            :         unsigned to_wait;               /* CQ event count that satisfies the wait */
    2965                 :            :         unsigned nr_timeouts;           /* ctx->cq_timeouts value sampled before waiting */
    2966                 :            : };
    2967                 :            : 
                                        /*
                                         * Return true when the waiter described by @iowq should stop waiting:
                                         * either the CQ ring holds at least iowq->to_wait events, or
                                         * ctx->cq_timeouts no longer matches the value saved in
                                         * iowq->nr_timeouts.
                                         */
    2968                 :            : static inline bool io_should_wake(struct io_wait_queue *iowq)
    2969                 :            : {
    2970                 :          0 :         struct io_ring_ctx *ctx = iowq->ctx;
    2971                 :            : 
    2972                 :            :         /*
    2973                 :            :          * Wake up if we have enough events, or if a timeout occurred since we
    2974                 :            :          * started waiting. For timeouts, we always want to return to userspace,
    2975                 :            :          * regardless of event count.
    2976                 :            :          */
    2977                 :          0 :         return io_cqring_events(ctx->rings) >= iowq->to_wait ||
    2978                 :          0 :                         atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
    2979                 :            : }
    2980                 :            : 
                                        /*
                                         * Wake callback that io_cqring_wait() installs in its wait queue entry.
                                         * Returns -1 without waking the task while io_should_wake() is false for
                                         * the owning io_wait_queue; otherwise returns whatever
                                         * autoremove_wake_function() returns for the same arguments.
                                         */
    2981                 :          0 : static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
    2982                 :            :                             int wake_flags, void *key)
    2983                 :            : {
    2984                 :            :         struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
    2985                 :            :                                                         wq);
    2986                 :            : 
    2987                 :          0 :         if (!io_should_wake(iowq))
    2988                 :            :                 return -1;
    2989                 :            : 
    2990                 :          0 :         return autoremove_wake_function(curr, mode, wake_flags, key);
    2991                 :            : }
    2992                 :            : 
    2993                 :            : /*
    2994                 :            :  * Wait until events become available, if we don't already have some. The
    2995                 :            :  * application must reap them itself, as they reside on the shared cq ring.
                                         *
                                         * @min_events: number of CQ events to wait for.
                                         * @sig, @sigsz: optional user-space signal mask; when @sig is non-NULL it
                                         *               is installed with set_user_sigmask() (or the compat
                                         *               variant for compat syscalls) before waiting.
                                         *
                                         * Returns 0 right away if at least @min_events events are already
                                         * present, and returns the helper's error if installing the signal mask
                                         * fails. Otherwise the task sleeps interruptibly as an exclusive waiter
                                         * on ctx->wait until io_should_wake() is true or a signal is pending.
                                         * On exit the result is 0 whenever the CQ ring is non-empty; with an
                                         * empty ring it is -EINTR if a signal ended the wait, else 0.
    2996                 :            :  */
    2997                 :          0 : static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
    2998                 :            :                           const sigset_t __user *sig, size_t sigsz)
    2999                 :            : {
    3000                 :          0 :         struct io_wait_queue iowq = {
    3001                 :            :                 .wq = {
    3002                 :          0 :                         .private        = current,
    3003                 :            :                         .func           = io_wake_function,
    3004                 :            :                         .entry          = LIST_HEAD_INIT(iowq.wq.entry),
    3005                 :            :                 },
    3006                 :            :                 .ctx            = ctx,
    3007                 :            :                 .to_wait        = min_events,
    3008                 :            :         };
    3009                 :          0 :         struct io_rings *rings = ctx->rings;
    3010                 :            :         int ret;
    3011                 :            : 
    3012                 :          0 :         if (io_cqring_events(rings) >= min_events)
    3013                 :            :                 return 0;
    3014                 :            : 
    3015                 :          0 :         if (sig) {
    3016                 :            : #ifdef CONFIG_COMPAT
    3017                 :            :                 if (in_compat_syscall())
    3018                 :            :                         ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
    3019                 :            :                                                       sigsz);
    3020                 :            :                 else
    3021                 :            : #endif
    3022                 :          0 :                         ret = set_user_sigmask(sig, sigsz);
    3023                 :            : 
    3024                 :          0 :                 if (ret)
    3025                 :            :                         return ret;
    3026                 :            :         }
    3027                 :            : 
    3028                 :            :         ret = 0;
                                                /* Baseline for io_should_wake(): any later change ends the wait. */
    3029                 :          0 :         iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
    3030                 :            :         do {
    3031                 :          0 :                 prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
    3032                 :            :                                                 TASK_INTERRUPTIBLE);
    3033                 :          0 :                 if (io_should_wake(&iowq))
    3034                 :            :                         break;
    3035                 :          0 :                 schedule();
    3036                 :          0 :                 if (signal_pending(current)) {
    3037                 :            :                         ret = -ERESTARTSYS;
    3038                 :            :                         break;
    3039                 :            :                 }
    3040                 :            :         } while (1);
    3041                 :          0 :         finish_wait(&ctx->wait, &iowq.wq);
    3042                 :            : 
                                                /* -ERESTARTSYS is only used internally; callers see -EINTR. */
    3043                 :          0 :         restore_saved_sigmask_unless(ret == -ERESTARTSYS);
    3044                 :          0 :         if (ret == -ERESTARTSYS)
    3045                 :            :                 ret = -EINTR;
    3046                 :            : 
                                                /* Events on the ring take precedence over the signal result. */
    3047                 :          0 :         return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
    3048                 :            : }
    3049                 :            : 
    3050                 :          0 : static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
    3051                 :            : {
                                                /*
                                                 * Let go of the registered files without touching the
                                                 * ctx->user_files array itself (the caller frees that).
                                                 *
                                                 * With CONFIG_UNIX, every skb queued on the ring socket's
                                                 * receive queue is dequeued and freed; presumably freeing
                                                 * the skb is what releases the files it carries (see the
                                                 * destructor installed in __io_sqe_files_scm()).
                                                 * Without CONFIG_UNIX, fput() is called directly on each
                                                 * of the ctx->nr_user_files entries.
                                                 */
    3052                 :            : #if defined(CONFIG_UNIX)
    3053                 :          0 :         if (ctx->ring_sock) {
    3054                 :          0 :                 struct sock *sock = ctx->ring_sock->sk;
    3055                 :            :                 struct sk_buff *skb;
    3056                 :            : 
    3057                 :          0 :                 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
    3058                 :          0 :                         kfree_skb(skb);
    3059                 :            :         }
    3060                 :            : #else
    3061                 :            :         int i;
    3062                 :            : 
    3063                 :            :         for (i = 0; i < ctx->nr_user_files; i++)
    3064                 :            :                 fput(ctx->user_files[i]);
    3065                 :            : #endif
    3066                 :          0 : }
    3067                 :            : 
    3068                 :          0 : static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
    3069                 :            : {
                                                /*
                                                 * Tear down the registered file table: release the files,
                                                 * free the ctx->user_files array and reset the bookkeeping
                                                 * fields so a new table can be registered afterwards.
                                                 *
                                                 * Returns 0 on success, or -ENXIO when no file table is
                                                 * currently registered.
                                                 */
    3070                 :          0 :         if (!ctx->user_files)
    3071                 :            :                 return -ENXIO;
    3072                 :            : 
    3073                 :          0 :         __io_sqe_files_unregister(ctx);
    3074                 :          0 :         kfree(ctx->user_files);
    3075                 :          0 :         ctx->user_files = NULL;
    3076                 :          0 :         ctx->nr_user_files = 0;
    3077                 :          0 :         return 0;
    3078                 :            : }
    3079                 :            : 
    3080                 :          0 : static void io_sq_thread_stop(struct io_ring_ctx *ctx)
    3081                 :            : {
                                                /*
                                                 * Stop the submission-queue kernel thread, if one exists,
                                                 * and clear ctx->sqo_thread. A no-op when ctx->sqo_thread
                                                 * is NULL.
                                                 */
    3082                 :          0 :         if (ctx->sqo_thread) {
                                                        /*
                                                         * Block until sqo_thread_started has been completed;
                                                         * presumably the thread signals it once it is running,
                                                         * so it is never parked/stopped before it has started.
                                                         */
    3083                 :          0 :                 wait_for_completion(&ctx->sqo_thread_started);
    3084                 :            :                 /*
    3085                 :            :                  * The park is a bit of a work-around, without it we get
    3086                 :            :                  * warning spews on shutdown with SQPOLL set and affinity
    3087                 :            :                  * set to a single CPU.
    3088                 :            :                  */
    3089                 :          0 :                 kthread_park(ctx->sqo_thread);
    3090                 :          0 :                 kthread_stop(ctx->sqo_thread);
    3091                 :          0 :                 ctx->sqo_thread = NULL;
    3092                 :            :         }
    3093                 :          0 : }
    3094                 :            : 
    3095                 :          0 : static void io_finish_async(struct io_ring_ctx *ctx)
    3096                 :            : {
                                                /*
                                                 * Shut down the asynchronous submission machinery: stop
                                                 * the SQ thread first, then destroy every workqueue that
                                                 * was allocated in ctx->sqo_wq[]. Slots are reset to NULL,
                                                 * so calling this on a partially set-up ctx (or twice) only
                                                 * acts on what is still present.
                                                 */
    3097                 :            :         int i;
    3098                 :            : 
    3099                 :          0 :         io_sq_thread_stop(ctx);
    3100                 :            : 
    3101                 :          0 :         for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++) {
    3102                 :          0 :                 if (ctx->sqo_wq[i]) {
    3103                 :          0 :                         destroy_workqueue(ctx->sqo_wq[i]);
    3104                 :          0 :                         ctx->sqo_wq[i] = NULL;
    3105                 :            :                 }
    3106                 :            :         }
    3107                 :          0 : }
    3108                 :            : 
    3109                 :            : #if defined(CONFIG_UNIX)
    3110                 :          0 : static void io_destruct_skb(struct sk_buff *skb)
    3111                 :            : {
                                                /*
                                                 * skb destructor installed by __io_sqe_files_scm(). The
                                                 * owning ring context is read from the socket's
                                                 * sk_user_data. Every allocated workqueue in ctx->sqo_wq[]
                                                 * is flushed before unix_destruct_scm() is invoked;
                                                 * presumably so that no queued async work is still using
                                                 * the files when the skb's file list is torn down.
                                                 */
    3112                 :          0 :         struct io_ring_ctx *ctx = skb->sk->sk_user_data;
    3113                 :            :         int i;
    3114                 :            : 
    3115                 :          0 :         for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++)
    3116                 :          0 :                 if (ctx->sqo_wq[i])
    3117                 :          0 :                         flush_workqueue(ctx->sqo_wq[i]);
    3118                 :            : 
    3119                 :          0 :         unix_destruct_scm(skb);
    3120                 :          0 : }
    3121                 :            : 
    3122                 :            : /*
    3123                 :            :  * Ensure the UNIX gc is aware of our file set, so we are certain that
    3124                 :            :  * the io_uring can be safely unregistered on process exit, even if we have
    3125                 :            :  * loops in the file referencing.
    3126                 :            :  */
    3127                 :          0 : static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
    3128                 :            : {
                                                /*
                                                 * Attach one batch of registered files to the ring socket.
                                                 *
                                                 * @nr:     number of files in this batch; they are stored
                                                 *          in fpl->fp[], so the caller must keep nr within
                                                 *          that array's capacity (io_sqe_files_scm() caps
                                                 *          it at SCM_MAX_FD).
                                                 * @offset: index of the batch's first file in
                                                 *          ctx->user_files.
                                                 *
                                                 * Returns 0 on success or -ENOMEM if either the file list
                                                 * or the skb cannot be allocated; nothing is queued and no
                                                 * file reference is changed on failure.
                                                 */
    3129                 :          0 :         struct sock *sk = ctx->ring_sock->sk;
    3130                 :            :         struct scm_fp_list *fpl;
    3131                 :            :         struct sk_buff *skb;
    3132                 :            :         int i;
    3133                 :            : 
    3134                 :          0 :         fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
    3135                 :          0 :         if (!fpl)
    3136                 :            :                 return -ENOMEM;
    3137                 :            : 
    3138                 :            :         skb = alloc_skb(0, GFP_KERNEL);
    3139                 :          0 :         if (!skb) {
    3140                 :          0 :                 kfree(fpl);
    3141                 :          0 :                 return -ENOMEM;
    3142                 :            :         }
    3143                 :            : 
    3144                 :          0 :         skb->sk = sk;
    3145                 :          0 :         skb->destructor = io_destruct_skb;
    3146                 :            : 
                                                /*
                                                 * Record each file in the list with its own reference and
                                                 * mark it in flight for the user the list is charged to.
                                                 */
    3147                 :          0 :         fpl->user = get_uid(ctx->user);
    3148                 :          0 :         for (i = 0; i < nr; i++) {
    3149                 :          0 :                 fpl->fp[i] = get_file(ctx->user_files[i + offset]);
    3150                 :          0 :                 unix_inflight(fpl->user, fpl->fp[i]);
    3151                 :            :         }
    3152                 :            : 
    3153                 :          0 :         fpl->max = fpl->count = nr;
    3154                 :          0 :         UNIXCB(skb).fp = fpl;
                                                /* Add the skb's truesize to the socket's sk_wmem_alloc count. */
    3155                 :          0 :         refcount_add(skb->truesize, &sk->sk_wmem_alloc);
    3156                 :          0 :         skb_queue_head(&sk->sk_receive_queue, skb);
    3157                 :            : 
                                                /*
                                                 * One fput() per file now that the list holds the
                                                 * get_file() reference taken above; presumably this hands
                                                 * the registration-time reference over to the queued skb.
                                                 */
    3158                 :          0 :         for (i = 0; i < nr; i++)
    3159                 :          0 :                 fput(fpl->fp[i]);
    3160                 :            : 
    3161                 :            :         return 0;
    3162                 :            : }
    3163                 :            : 
    3164                 :            : /*
    3165                 :            :  * If UNIX sockets are enabled, fd passing can cause a reference cycle which
    3166                 :            :  * causes regular reference counting to break down. We rely on the UNIX
    3167                 :            :  * garbage collection to take care of this problem for us.
    3168                 :            :  */
    3169                 :          0 : static int io_sqe_files_scm(struct io_ring_ctx *ctx)
    3170                 :            : {
                                                /*
                                                 * Hand all ctx->nr_user_files registered files to the ring
                                                 * socket, in batches of at most SCM_MAX_FD files each.
                                                 *
                                                 * Returns 0 when every batch was queued, otherwise the
                                                 * error from the failing __io_sqe_files_scm() call.
                                                 */
    3171                 :            :         unsigned left, total;
    3172                 :            :         int ret = 0;
    3173                 :            : 
                                                /* total: files already handed over; left: files still to go. */
    3174                 :            :         total = 0;
    3175                 :          0 :         left = ctx->nr_user_files;
    3176                 :          0 :         while (left) {
    3177                 :          0 :                 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
    3178                 :            : 
    3179                 :          0 :                 ret = __io_sqe_files_scm(ctx, this_files, total);
    3180                 :          0 :                 if (ret)
    3181                 :            :                         break;
    3182                 :          0 :                 left -= this_files;
    3183                 :          0 :                 total += this_files;
    3184                 :            :         }
    3185                 :            : 
    3186                 :          0 :         if (!ret)
    3187                 :            :                 return 0;
    3188                 :            : 
                                                /*
                                                 * Failure: put the files from index 'total' onwards, i.e.
                                                 * the ones that were never attached to an skb. The batches
                                                 * already queued are left on the socket for the caller to
                                                 * clean up.
                                                 */
    3189                 :          0 :         while (total < ctx->nr_user_files) {
    3190                 :          0 :                 fput(ctx->user_files[total]);
    3191                 :          0 :                 total++;
    3192                 :            :         }
    3193                 :            : 
    3194                 :            :         return ret;
    3195                 :            : }
    3196                 :            : #else
    3197                 :            : static int io_sqe_files_scm(struct io_ring_ctx *ctx)
    3198                 :            : {
                                                /*
                                                 * Variant built when CONFIG_UNIX is not defined: there is
                                                 * no ring socket to attach files to, so report success
                                                 * without doing anything.
                                                 */
    3199                 :            :         return 0;
    3200                 :            : }
    3201                 :            : #endif
    3202                 :            : 
    3203                 :          0 : static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
    3204                 :            :                                  unsigned nr_args)
    3205                 :            : {
                                                /*
                                                 * Register a fixed set of files with the ring.
                                                 *
                                                 * @arg:     user pointer to an array of nr_args __s32 file
                                                 *           descriptors.
                                                 * @nr_args: number of descriptors in that array.
                                                 *
                                                 * Returns 0 on success, or:
                                                 *   -EBUSY   a file table is already registered,
                                                 *   -EINVAL  nr_args is zero,
                                                 *   -EMFILE  nr_args exceeds IORING_MAX_FIXED_FILES,
                                                 *   -ENOMEM  the table could not be allocated,
                                                 *   -EFAULT  a descriptor could not be copied from user space,
                                                 *   -EBADF   fget() failed or the fd refers to an io_uring
                                                 *            instance,
                                                 * or the error returned by io_sqe_files_scm().
                                                 * On any failure the table is released again and
                                                 * ctx->user_files is left NULL.
                                                 */
    3206                 :            :         __s32 __user *fds = (__s32 __user *) arg;
    3207                 :            :         int fd, ret = 0;
    3208                 :            :         unsigned i;
    3209                 :            : 
    3210                 :          0 :         if (ctx->user_files)
    3211                 :            :                 return -EBUSY;
    3212                 :          0 :         if (!nr_args)
    3213                 :            :                 return -EINVAL;
    3214                 :          0 :         if (nr_args > IORING_MAX_FIXED_FILES)
    3215                 :            :                 return -EMFILE;
    3216                 :            : 
    3217                 :          0 :         ctx->user_files = kcalloc(nr_args, sizeof(struct file *), GFP_KERNEL);
    3218                 :          0 :         if (!ctx->user_files)
    3219                 :            :                 return -ENOMEM;
    3220                 :            : 
                                                /*
                                                 * 'ret' is preloaded with the error for the next step and
                                                 * only reset to 0 once an entry has been fully accepted, so
                                                 * any 'break' leaves the matching error code in place.
                                                 */
    3221                 :          0 :         for (i = 0; i < nr_args; i++) {
    3222                 :            :                 ret = -EFAULT;
    3223                 :          0 :                 if (copy_from_user(&fd, &fds[i], sizeof(fd)))
    3224                 :            :                         break;
    3225                 :            : 
    3226                 :          0 :                 ctx->user_files[i] = fget(fd);
    3227                 :            : 
    3228                 :            :                 ret = -EBADF;
    3229                 :          0 :                 if (!ctx->user_files[i])
    3230                 :            :                         break;
    3231                 :            :                 /*
    3232                 :            :                  * Don't allow io_uring instances to be registered. If UNIX
    3233                 :            :                  * isn't enabled, then this causes a reference cycle and this
    3234                 :            :                  * instance can never get freed. If UNIX is enabled we'll
    3235                 :            :                  * handle it just fine, but there's still no point in allowing
    3236                 :            :                  * a ring fd as it doesn't support regular read/write anyway.
    3237                 :            :                  */
    3238                 :          0 :                 if (ctx->user_files[i]->f_op == &io_uring_fops) {
    3239                 :          0 :                         fput(ctx->user_files[i]);
    3240                 :          0 :                         break;
    3241                 :            :                 }
    3242                 :          0 :                 ctx->nr_user_files++;
    3243                 :            :                 ret = 0;
    3244                 :            :         }
    3245                 :            : 
                                                /*
                                                 * Undo a partial registration: nr_user_files counts only
                                                 * the entries that were fully accepted above.
                                                 */
    3246                 :          0 :         if (ret) {
    3247                 :          0 :                 for (i = 0; i < ctx->nr_user_files; i++)
    3248                 :          0 :                         fput(ctx->user_files[i]);
    3249                 :            : 
    3250                 :          0 :                 kfree(ctx->user_files);
    3251                 :          0 :                 ctx->user_files = NULL;
    3252                 :          0 :                 ctx->nr_user_files = 0;
    3253                 :          0 :                 return ret;
    3254                 :            :         }
    3255                 :            : 
    3256                 :          0 :         ret = io_sqe_files_scm(ctx);
    3257                 :          0 :         if (ret)
    3258                 :          0 :                 io_sqe_files_unregister(ctx);
    3259                 :            : 
    3260                 :          0 :         return ret;
    3261                 :            : }
    3262                 :            : 
    3263                 :          0 : static int io_sq_offload_start(struct io_ring_ctx *ctx,
    3264                 :            :                                struct io_uring_params *p)
    3265                 :            : {
                                                /*
                                                 * Set up the asynchronous submission side of a ring: pin
                                                 * the caller's mm, optionally create and wake the SQ
                                                 * polling thread, and allocate the two workqueues in
                                                 * ctx->sqo_wq[].
                                                 *
                                                 * Returns 0 on success, or:
                                                 *   -EPERM   SQPOLL requested without CAP_SYS_ADMIN,
                                                 *   -EINVAL  SQ_AFF without SQPOLL, or the requested CPU is
                                                 *            out of range / not online,
                                                 *   -ENOMEM  a workqueue could not be allocated,
                                                 * or the error from creating the kthread. On failure
                                                 * everything set up so far is torn down and the mm
                                                 * reference is dropped.
                                                 */
    3266                 :            :         int ret;
    3267                 :            : 
    3268                 :          0 :         mmgrab(current->mm);
    3269                 :          0 :         ctx->sqo_mm = current->mm;
    3270                 :            : 
    3271                 :          0 :         if (ctx->flags & IORING_SETUP_SQPOLL) {
    3272                 :            :                 ret = -EPERM;
    3273                 :          0 :                 if (!capable(CAP_SYS_ADMIN))
    3274                 :            :                         goto err;
    3275                 :            : 
                                                        /* Idle time is given in msecs; 0 falls back to HZ jiffies. */
    3276                 :          0 :                 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
    3277                 :          0 :                 if (!ctx->sq_thread_idle)
    3278                 :          0 :                         ctx->sq_thread_idle = HZ;
    3279                 :            : 
    3280                 :          0 :                 if (p->flags & IORING_SETUP_SQ_AFF) {
    3281                 :          0 :                         int cpu = p->sq_thread_cpu;
    3282                 :            : 
    3283                 :            :                         ret = -EINVAL;
    3284                 :          0 :                         if (cpu >= nr_cpu_ids)
    3285                 :            :                                 goto err;
    3286                 :          0 :                         if (!cpu_online(cpu))
    3287                 :            :                                 goto err;
    3288                 :            : 
    3289                 :          0 :                         ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
    3290                 :            :                                                         ctx, cpu,
    3291                 :            :                                                         "io_uring-sq");
    3292                 :            :                 } else {
    3293                 :          0 :                         ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
    3294                 :            :                                                         "io_uring-sq");
    3295                 :            :                 }
                                                        /*
                                                         * Clear the pointer on failure so the error path's
                                                         * io_finish_async() does not try to stop an ERR_PTR.
                                                         */
    3296                 :          0 :                 if (IS_ERR(ctx->sqo_thread)) {
    3297                 :            :                         ret = PTR_ERR(ctx->sqo_thread);
    3298                 :          0 :                         ctx->sqo_thread = NULL;
    3299                 :          0 :                         goto err;
    3300                 :            :                 }
    3301                 :          0 :                 wake_up_process(ctx->sqo_thread);
    3302                 :          0 :         } else if (p->flags & IORING_SETUP_SQ_AFF) {
    3303                 :            :                 /* Can't have SQ_AFF without SQPOLL */
    3304                 :            :                 ret = -EINVAL;
    3305                 :            :                 goto err;
    3306                 :            :         }
    3307                 :            : 
    3308                 :            :         /* Do QD, or 2 * CPUS, whatever is smallest */
    3309                 :          0 :         ctx->sqo_wq[0] = alloc_workqueue("io_ring-wq",
    3310                 :            :                         WQ_UNBOUND | WQ_FREEZABLE,
    3311                 :          0 :                         min(ctx->sq_entries - 1, 2 * num_online_cpus()));
    3312                 :          0 :         if (!ctx->sqo_wq[0]) {
    3313                 :            :                 ret = -ENOMEM;
    3314                 :            :                 goto err;
    3315                 :            :         }
    3316                 :            : 
    3317                 :            :         /*
    3318                 :            :          * This is for buffered writes, where we want to limit the parallelism
    3319                 :            :          * due to file locking in file systems. As "normal" buffered writes
    3320                 :            :          * should parallelize on writeout quite nicely, limit us to having 2
    3321                 :            :          * pending. This avoids massive contention on the inode when doing
    3322                 :            :          * buffered async writes.
    3323                 :            :          */
    3324                 :          0 :         ctx->sqo_wq[1] = alloc_workqueue("io_ring-write-wq",
    3325                 :            :                                                 WQ_UNBOUND | WQ_FREEZABLE, 2);
    3326                 :          0 :         if (!ctx->sqo_wq[1]) {
    3327                 :            :                 ret = -ENOMEM;
    3328                 :            :                 goto err;
    3329                 :            :         }
    3330                 :            : 
    3331                 :            :         return 0;
    3332                 :            : err:
                                                /* Undo whatever was set up, then drop the mmgrab() reference. */
    3333                 :          0 :         io_finish_async(ctx);
    3334                 :          0 :         mmdrop(ctx->sqo_mm);
    3335                 :          0 :         ctx->sqo_mm = NULL;
    3336                 :          0 :         return ret;
    3337                 :            : }
    3338                 :            : 
    3339                 :            : static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
    3340                 :            : {
                                                /*
                                                 * Give back nr_pages of locked-memory accounting by
                                                 * atomically subtracting them from user->locked_vm.
                                                 */
    3341                 :          0 :         atomic_long_sub(nr_pages, &user->locked_vm);
    3342                 :            : }
    3343                 :            : 
    3344                 :          0 : static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
    3345                 :            : {
                                                /*
                                                 * Charge nr_pages against user->locked_vm, refusing the
                                                 * charge if the new total would exceed the RLIMIT_MEMLOCK
                                                 * limit expressed in pages.
                                                 *
                                                 * Returns 0 on success or -ENOMEM if the limit would be
                                                 * exceeded (in which case locked_vm is left unchanged).
                                                 */
    3346                 :            :         unsigned long page_limit, cur_pages, new_pages;
    3347                 :            : 
    3348                 :            :         /* Don't allow more pages than we can safely lock */
    3349                 :          0 :         page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
    3350                 :            : 
                                                /*
                                                 * Lock-free update: recompute from a fresh read and retry
                                                 * whenever the cmpxchg sees a value other than the one the
                                                 * new total was based on.
                                                 */
    3351                 :            :         do {
    3352                 :          0 :                 cur_pages = atomic_long_read(&user->locked_vm);
    3353                 :          0 :                 new_pages = cur_pages + nr_pages;
    3354                 :          0 :                 if (new_pages > page_limit)
    3355                 :            :                         return -ENOMEM;
    3356                 :          0 :         } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
    3357                 :          0 :                                         new_pages) != cur_pages);
    3358                 :            : 
    3359                 :            :         return 0;
    3360                 :            : }
    3361                 :            : 
    3362                 :          0 : static void io_mem_free(void *ptr)
    3363                 :            : {
                                                /*
                                                 * Release memory identified by a kernel virtual address.
                                                 * A NULL ptr is accepted and ignored. Otherwise the head
                                                 * page for the address is looked up and, if
                                                 * put_page_testzero() reports true, the compound page is
                                                 * freed.
                                                 */
    3364                 :            :         struct page *page;
    3365                 :            : 
    3366                 :          0 :         if (!ptr)
    3367                 :          0 :                 return;
    3368                 :            : 
    3369                 :            :         page = virt_to_head_page(ptr);
    3370                 :          0 :         if (put_page_testzero(page))
    3371                 :          0 :                 free_compound_page(page);
    3372                 :            : }
    3373                 :            : 
    3374                 :          0 : static void *io_mem_alloc(size_t size)
    3375                 :            : {
                                                /*
                                                 * Allocate pages covering at least 'size' bytes: the
                                                 * request is rounded up to the page order given by
                                                 * get_order(size). The flags ask for a zeroed compound
                                                 * allocation (__GFP_ZERO | __GFP_COMP) with
                                                 * __GFP_NOWARN | __GFP_NORETRY set.
                                                 *
                                                 * Returns the value from __get_free_pages() as a pointer
                                                 * (presumably NULL on allocation failure).
                                                 */
    3376                 :            :         gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
    3377                 :            :                                 __GFP_NORETRY;
    3378                 :            : 
    3379                 :          0 :         return (void *) __get_free_pages(gfp_flags, get_order(size));
    3380                 :            : }
    3381                 :            : 
    3382                 :          0 : static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
    3383                 :            :                                 size_t *sq_offset)
    3384                 :            : {
                                                /*
                                                 * Compute the number of bytes needed for a struct io_rings
                                                 * holding cq_entries CQEs, followed by an array of
                                                 * sq_entries u32 values.
                                                 *
                                                 * @sq_offset: if non-NULL, receives the byte offset at
                                                 *             which that u32 array starts.
                                                 *
                                                 * Returns the total size, or SIZE_MAX if any step of the
                                                 * computation overflows.
                                                 */
    3385                 :            :         struct io_rings *rings;
    3386                 :            :         size_t off, sq_array_size;
    3387                 :            : 
    3388                 :          0 :         off = struct_size(rings, cqes, cq_entries);
    3389                 :          0 :         if (off == SIZE_MAX)
    3390                 :            :                 return SIZE_MAX;
    3391                 :            : 
    3392                 :            : #ifdef CONFIG_SMP
                                                /*
                                                 * On SMP, round the start of the u32 array up to a cache
                                                 * line boundary; a result of 0 means the rounding wrapped.
                                                 */
    3393                 :          0 :         off = ALIGN(off, SMP_CACHE_BYTES);
    3394                 :          0 :         if (off == 0)
    3395                 :            :                 return SIZE_MAX;
    3396                 :            : #endif
    3397                 :            : 
    3398                 :          0 :         if (sq_offset)
    3399                 :          0 :                 *sq_offset = off;
    3400                 :            : 
    3401                 :            :         sq_array_size = array_size(sizeof(u32), sq_entries);
    3402                 :          0 :         if (sq_array_size == SIZE_MAX)
    3403                 :            :                 return SIZE_MAX;
    3404                 :            : 
    3405                 :          0 :         if (check_add_overflow(off, sq_array_size, &off))
    3406                 :            :                 return SIZE_MAX;
    3407                 :            : 
    3408                 :          0 :         return off;
    3409                 :            : }
    3410                 :            : 
    3411                 :          0 : static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
    3412                 :            : {
                                                /*
                                                 * Number of pages taken by a ring of the given dimensions:
                                                 * the rings_size() region plus the array of sq_entries
                                                 * struct io_uring_sqe, each rounded up to a power-of-two
                                                 * page count via get_order().
                                                 */
    3413                 :            :         size_t pages;
    3414                 :            : 
    3415                 :          0 :         pages = (size_t)1 << get_order(
    3416                 :            :                 rings_size(sq_entries, cq_entries, NULL));
    3417                 :          0 :         pages += (size_t)1 << get_order(
    3418                 :            :                 array_size(sizeof(struct io_uring_sqe), sq_entries));
    3419                 :            : 
    3420                 :          0 :         return pages;
    3421                 :            : }
    3422                 :            : 
    3423                 :          0 : static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
    3424                 :            : {
                                                /*
                                                 * Drop all registered user buffers: release every page
                                                 * referenced by each buffer's bvec array, undo the memory
                                                 * accounting when ctx->account_mem is set, free the bvec
                                                 * arrays and finally the ctx->user_bufs table itself.
                                                 *
                                                 * Returns 0 on success, or -ENXIO when no buffers are
                                                 * registered.
                                                 */
    3425                 :            :         int i, j;
    3426                 :            : 
    3427                 :          0 :         if (!ctx->user_bufs)
    3428                 :            :                 return -ENXIO;
    3429                 :            : 
    3430                 :          0 :         for (i = 0; i < ctx->nr_user_bufs; i++) {
    3431                 :          0 :                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
    3432                 :            : 
    3433                 :          0 :                 for (j = 0; j < imu->nr_bvecs; j++)
    3434                 :          0 :                         put_user_page(imu->bvec[j].bv_page);
    3435                 :            : 
                                                        /* The amount un-accounted is one page per bvec entry. */
    3436                 :          0 :                 if (ctx->account_mem)
    3437                 :          0 :                         io_unaccount_mem(ctx->user, imu->nr_bvecs);
    3438                 :          0 :                 kvfree(imu->bvec);
    3439                 :          0 :                 imu->nr_bvecs = 0;
    3440                 :            :         }
    3441                 :            : 
    3442                 :          0 :         kfree(ctx->user_bufs);
    3443                 :          0 :         ctx->user_bufs = NULL;
    3444                 :          0 :         ctx->nr_user_bufs = 0;
    3445                 :          0 :         return 0;
    3446                 :            : }
    3447                 :            : 
    3448                 :          0 : static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
    3449                 :            :                        void __user *arg, unsigned index)
    3450                 :            : {
                                                /*
                                                 * Copy the index'th iovec of the user array at 'arg' into
                                                 * *dst. When CONFIG_COMPAT is enabled and ctx->compat is
                                                 * set, the array is read as struct compat_iovec and each
                                                 * field is converted to the native layout.
                                                 *
                                                 * Returns 0 on success or -EFAULT if the copy from user
                                                 * space fails.
                                                 */
    3451                 :            :         struct iovec __user *src;
    3452                 :            : 
    3453                 :            : #ifdef CONFIG_COMPAT
    3454                 :            :         if (ctx->compat) {
    3455                 :            :                 struct compat_iovec __user *ciovs;
    3456                 :            :                 struct compat_iovec ciov;
    3457                 :            : 
    3458                 :            :                 ciovs = (struct compat_iovec __user *) arg;
    3459                 :            :                 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
    3460                 :            :                         return -EFAULT;
    3461                 :            : 
    3462                 :            :                 dst->iov_base = (void __user *) (unsigned long) ciov.iov_base;
    3463                 :            :                 dst->iov_len = ciov.iov_len;
    3464                 :            :                 return 0;
    3465                 :            :         }
    3466                 :            : #endif
    3467                 :            :         src = (struct iovec __user *) arg;
    3468                 :          0 :         if (copy_from_user(dst, &src[index], sizeof(*dst)))
    3469                 :            :                 return -EFAULT;
    3470                 :          0 :         return 0;
    3471                 :            : }
    3472                 :            : 
    3473                 :          0 : static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
    3474                 :            :                                   unsigned nr_args)
    3475                 :            : {
    3476                 :            :         struct vm_area_struct **vmas = NULL;
    3477                 :            :         struct page **pages = NULL;
    3478                 :            :         int i, j, got_pages = 0;
    3479                 :            :         int ret = -EINVAL;
    3480                 :            : 
    3481                 :          0 :         if (ctx->user_bufs)
    3482                 :            :                 return -EBUSY;
    3483                 :          0 :         if (!nr_args || nr_args > UIO_MAXIOV)
    3484                 :            :                 return -EINVAL;
    3485                 :            : 
    3486                 :          0 :         ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
    3487                 :            :                                         GFP_KERNEL);
    3488                 :          0 :         if (!ctx->user_bufs)
    3489                 :            :                 return -ENOMEM;
    3490                 :            : 
    3491                 :          0 :         for (i = 0; i < nr_args; i++) {
    3492                 :          0 :                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
    3493                 :            :                 unsigned long off, start, end, ubuf;
    3494                 :            :                 int pret, nr_pages;
    3495                 :            :                 struct iovec iov;
    3496                 :            :                 size_t size;
    3497                 :            : 
    3498                 :          0 :                 ret = io_copy_iov(ctx, &iov, arg, i);
    3499                 :          0 :                 if (ret)
    3500                 :            :                         goto err;
    3501                 :            : 
    3502                 :            :                 /*
    3503                 :            :                  * Don't impose further limits on the size and buffer
    3504                 :            :                  * constraints here, we'll -EINVAL later when IO is
    3505                 :            :                  * submitted if they are wrong.
    3506                 :            :                  */
    3507                 :            :                 ret = -EFAULT;
    3508                 :          0 :                 if (!iov.iov_base || !iov.iov_len)
    3509                 :            :                         goto err;
    3510                 :            : 
    3511                 :            :                 /* arbitrary limit, but we need something */
    3512                 :          0 :                 if (iov.iov_len > SZ_1G)
    3513                 :            :                         goto err;
    3514                 :            : 
    3515                 :          0 :                 ubuf = (unsigned long) iov.iov_base;
    3516                 :          0 :                 end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
    3517                 :          0 :                 start = ubuf >> PAGE_SHIFT;
    3518                 :          0 :                 nr_pages = end - start;
    3519                 :            : 
    3520                 :          0 :                 if (ctx->account_mem) {
    3521                 :          0 :                         ret = io_account_mem(ctx->user, nr_pages);
    3522                 :          0 :                         if (ret)
    3523                 :            :                                 goto err;
    3524                 :            :                 }
    3525                 :            : 
    3526                 :            :                 ret = 0;
    3527                 :          0 :                 if (!pages || nr_pages > got_pages) {
    3528                 :          0 :                         kvfree(vmas);
    3529                 :          0 :                         kvfree(pages);
    3530                 :          0 :                         pages = kvmalloc_array(nr_pages, sizeof(struct page *),
    3531                 :            :                                                 GFP_KERNEL);
    3532                 :          0 :                         vmas = kvmalloc_array(nr_pages,
    3533                 :            :                                         sizeof(struct vm_area_struct *),
    3534                 :            :                                         GFP_KERNEL);
    3535                 :          0 :                         if (!pages || !vmas) {
    3536                 :            :                                 ret = -ENOMEM;
    3537                 :          0 :                                 if (ctx->account_mem)
    3538                 :          0 :                                         io_unaccount_mem(ctx->user, nr_pages);
    3539                 :            :                                 goto err;
    3540                 :            :                         }
    3541                 :            :                         got_pages = nr_pages;
    3542                 :            :                 }
    3543                 :            : 
    3544                 :          0 :                 imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
    3545                 :            :                                                 GFP_KERNEL);
    3546                 :            :                 ret = -ENOMEM;
    3547                 :          0 :                 if (!imu->bvec) {
    3548                 :          0 :                         if (ctx->account_mem)
    3549                 :          0 :                                 io_unaccount_mem(ctx->user, nr_pages);
    3550                 :            :                         goto err;
    3551                 :            :                 }
    3552                 :            : 
    3553                 :            :                 ret = 0;
    3554                 :          0 :                 down_read(&current->mm->mmap_sem);
    3555                 :          0 :                 pret = get_user_pages(ubuf, nr_pages,
    3556                 :            :                                       FOLL_WRITE | FOLL_LONGTERM,
    3557                 :            :                                       pages, vmas);
    3558                 :          0 :                 if (pret == nr_pages) {
    3559                 :            :                         /* don't support file backed memory */
    3560                 :          0 :                         for (j = 0; j < nr_pages; j++) {
    3561                 :          0 :                                 struct vm_area_struct *vma = vmas[j];
    3562                 :            : 
    3563                 :          0 :                                 if (vma->vm_file &&
    3564                 :            :                                     !is_file_hugepages(vma->vm_file)) {
    3565                 :            :                                         ret = -EOPNOTSUPP;
    3566                 :            :                                         break;
    3567                 :            :                                 }
    3568                 :            :                         }
    3569                 :            :                 } else {
    3570                 :          0 :                         ret = pret < 0 ? pret : -EFAULT;
    3571                 :            :                 }
    3572                 :          0 :                 up_read(&current->mm->mmap_sem);
    3573                 :          0 :                 if (ret) {
    3574                 :            :                         /*
    3575                 :            :                          * if we did partial map, or found file backed vmas,
    3576                 :            :                          * release any pages we did get
    3577                 :            :                          */
    3578                 :          0 :                         if (pret > 0)
    3579                 :          0 :                                 put_user_pages(pages, pret);
    3580                 :          0 :                         if (ctx->account_mem)
    3581                 :          0 :                                 io_unaccount_mem(ctx->user, nr_pages);
    3582                 :          0 :                         kvfree(imu->bvec);
    3583                 :          0 :                         goto err;
    3584                 :            :                 }
    3585                 :            : 
    3586                 :          0 :                 off = ubuf & ~PAGE_MASK;
    3587                 :          0 :                 size = iov.iov_len;
    3588                 :          0 :                 for (j = 0; j < nr_pages; j++) {
    3589                 :            :                         size_t vec_len;
    3590                 :            : 
    3591                 :          0 :                         vec_len = min_t(size_t, size, PAGE_SIZE - off);
    3592                 :          0 :                         imu->bvec[j].bv_page = pages[j];
    3593                 :          0 :                         imu->bvec[j].bv_len = vec_len;
    3594                 :          0 :                         imu->bvec[j].bv_offset = off;
    3595                 :            :                         off = 0;
    3596                 :          0 :                         size -= vec_len;
    3597                 :            :                 }
    3598                 :            :                 /* store original address for later verification */
    3599                 :          0 :                 imu->ubuf = ubuf;
    3600                 :          0 :                 imu->len = iov.iov_len;
    3601                 :          0 :                 imu->nr_bvecs = nr_pages;
    3602                 :            : 
    3603                 :          0 :                 ctx->nr_user_bufs++;
    3604                 :            :         }
    3605                 :          0 :         kvfree(pages);
    3606                 :          0 :         kvfree(vmas);
    3607                 :          0 :         return 0;
    3608                 :            : err:
    3609                 :          0 :         kvfree(pages);
    3610                 :          0 :         kvfree(vmas);
    3611                 :          0 :         io_sqe_buffer_unregister(ctx);
    3612                 :          0 :         return ret;
    3613                 :            : }
    3614                 :            : 
    3615                 :          0 : static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
    3616                 :            : {
    3617                 :            :         __s32 __user *fds = arg;
    3618                 :            :         int fd;
    3619                 :            : 
    3620                 :          0 :         if (ctx->cq_ev_fd)
    3621                 :            :                 return -EBUSY;
    3622                 :            : 
    3623                 :          0 :         if (copy_from_user(&fd, fds, sizeof(*fds)))
    3624                 :            :                 return -EFAULT;
    3625                 :            : 
    3626                 :          0 :         ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
    3627                 :          0 :         if (IS_ERR(ctx->cq_ev_fd)) {
    3628                 :            :                 int ret = PTR_ERR(ctx->cq_ev_fd);
    3629                 :          0 :                 ctx->cq_ev_fd = NULL;
    3630                 :          0 :                 return ret;
    3631                 :            :         }
    3632                 :            : 
    3633                 :            :         return 0;
    3634                 :            : }
    3635                 :            : 
    3636                 :            : static int io_eventfd_unregister(struct io_ring_ctx *ctx)
    3637                 :            : {
    3638                 :          0 :         if (ctx->cq_ev_fd) {
    3639                 :          0 :                 eventfd_ctx_put(ctx->cq_ev_fd);
    3640                 :          0 :                 ctx->cq_ev_fd = NULL;
    3641                 :            :                 return 0;
    3642                 :            :         }
    3643                 :            : 
    3644                 :            :         return -ENXIO;
    3645                 :            : }
    3646                 :            : 
    3647                 :          0 : static void io_ring_ctx_free(struct io_ring_ctx *ctx)
    3648                 :            : {
    3649                 :          0 :         io_finish_async(ctx);
    3650                 :          0 :         if (ctx->sqo_mm)
    3651                 :          0 :                 mmdrop(ctx->sqo_mm);
    3652                 :            : 
    3653                 :          0 :         io_iopoll_reap_events(ctx);
    3654                 :          0 :         io_sqe_buffer_unregister(ctx);
    3655                 :          0 :         io_sqe_files_unregister(ctx);
    3656                 :            :         io_eventfd_unregister(ctx);
    3657                 :            : 
    3658                 :            : #if defined(CONFIG_UNIX)
    3659                 :          0 :         if (ctx->ring_sock) {
    3660                 :          0 :                 ctx->ring_sock->file = NULL; /* so that iput() is called */
    3661                 :          0 :                 sock_release(ctx->ring_sock);
    3662                 :            :         }
    3663                 :            : #endif
    3664                 :            : 
    3665                 :          0 :         io_mem_free(ctx->rings);
    3666                 :          0 :         io_mem_free(ctx->sq_sqes);
    3667                 :            : 
    3668                 :          0 :         percpu_ref_exit(&ctx->refs);
    3669                 :          0 :         if (ctx->account_mem)
    3670                 :          0 :                 io_unaccount_mem(ctx->user,
    3671                 :            :                                 ring_pages(ctx->sq_entries, ctx->cq_entries));
    3672                 :          0 :         free_uid(ctx->user);
    3673                 :          0 :         if (ctx->creds)
    3674                 :          0 :                 put_cred(ctx->creds);
    3675                 :          0 :         kfree(ctx);
    3676                 :          0 : }
    3677                 :            : 
    3678                 :          0 : static __poll_t io_uring_poll(struct file *file, poll_table *wait)
    3679                 :            : {
    3680                 :          0 :         struct io_ring_ctx *ctx = file->private_data;
    3681                 :            :         __poll_t mask = 0;
    3682                 :            : 
    3683                 :          0 :         poll_wait(file, &ctx->cq_wait, wait);
    3684                 :            :         /*
    3685                 :            :          * synchronizes with barrier from wq_has_sleeper call in
    3686                 :            :          * io_commit_cqring
    3687                 :            :          */
    3688                 :          0 :         smp_rmb();
    3689                 :          0 :         if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
    3690                 :          0 :             ctx->rings->sq_ring_entries)
    3691                 :            :                 mask |= EPOLLOUT | EPOLLWRNORM;
    3692                 :          0 :         if (READ_ONCE(ctx->rings->cq.head) != ctx->cached_cq_tail)
    3693                 :          0 :                 mask |= EPOLLIN | EPOLLRDNORM;
    3694                 :            : 
    3695                 :          0 :         return mask;
    3696                 :            : }
    3697                 :            : 
    3698                 :          0 : static int io_uring_fasync(int fd, struct file *file, int on)
    3699                 :            : {
    3700                 :          0 :         struct io_ring_ctx *ctx = file->private_data;
    3701                 :            : 
    3702                 :          0 :         return fasync_helper(fd, file, on, &ctx->cq_fasync);
    3703                 :            : }
    3704                 :            : 
    3705                 :          0 : static void io_cancel_async_work(struct io_ring_ctx *ctx,
    3706                 :            :                                  struct task_struct *task)
    3707                 :            : {
    3708                 :          0 :         if (list_empty(&ctx->task_list))
    3709                 :          0 :                 return;
    3710                 :            : 
    3711                 :            :         spin_lock_irq(&ctx->task_lock);
    3712                 :          0 :         while (!list_empty(&ctx->task_list)) {
    3713                 :            :                 struct io_kiocb *req;
    3714                 :            : 
    3715                 :          0 :                 req = list_first_entry(&ctx->task_list, struct io_kiocb, task_list);
    3716                 :          0 :                 list_del_init(&req->task_list);
    3717                 :          0 :                 req->flags |= REQ_F_CANCEL;
    3718                 :          0 :                 if (req->work_task && (!task || req->task == task))
    3719                 :          0 :                         send_sig(SIGINT, req->work_task, 1);
    3720                 :            :         }
    3721                 :            :         spin_unlock_irq(&ctx->task_lock);
    3722                 :            : }
    3723                 :            : 
    3724                 :          0 : static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
    3725                 :            : {
    3726                 :          0 :         mutex_lock(&ctx->uring_lock);
    3727                 :          0 :         percpu_ref_kill(&ctx->refs);
    3728                 :          0 :         mutex_unlock(&ctx->uring_lock);
    3729                 :            : 
    3730                 :          0 :         io_cancel_async_work(ctx, NULL);
    3731                 :          0 :         io_kill_timeouts(ctx);
    3732                 :          0 :         io_poll_remove_all(ctx);
    3733                 :          0 :         io_iopoll_reap_events(ctx);
    3734                 :          0 :         wait_for_completion(&ctx->ctx_done);
    3735                 :          0 :         io_ring_ctx_free(ctx);
    3736                 :          0 : }
    3737                 :            : 
    3738                 :          0 : static int io_uring_flush(struct file *file, void *data)
    3739                 :            : {
    3740                 :          0 :         struct io_ring_ctx *ctx = file->private_data;
    3741                 :            : 
    3742                 :          0 :         if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
    3743                 :          0 :                 io_cancel_async_work(ctx, current);
    3744                 :            : 
    3745                 :          0 :         return 0;
    3746                 :            : }
    3747                 :            : 
    3748                 :          0 : static int io_uring_release(struct inode *inode, struct file *file)
    3749                 :            : {
    3750                 :          0 :         struct io_ring_ctx *ctx = file->private_data;
    3751                 :            : 
    3752                 :          0 :         file->private_data = NULL;
    3753                 :          0 :         io_ring_ctx_wait_and_kill(ctx);
    3754                 :          0 :         return 0;
    3755                 :            : }
    3756                 :            : 
    3757                 :          0 : static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
    3758                 :            : {
    3759                 :          0 :         loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT;
    3760                 :          0 :         unsigned long sz = vma->vm_end - vma->vm_start;
    3761                 :          0 :         struct io_ring_ctx *ctx = file->private_data;
    3762                 :            :         unsigned long pfn;
    3763                 :            :         struct page *page;
    3764                 :            :         void *ptr;
    3765                 :            : 
    3766                 :          0 :         switch (offset) {
    3767                 :            :         case IORING_OFF_SQ_RING:
    3768                 :            :         case IORING_OFF_CQ_RING:
    3769                 :          0 :                 ptr = ctx->rings;
    3770                 :          0 :                 break;
    3771                 :            :         case IORING_OFF_SQES:
    3772                 :          0 :                 ptr = ctx->sq_sqes;
    3773                 :          0 :                 break;
    3774                 :            :         default:
    3775                 :            :                 return -EINVAL;
    3776                 :            :         }
    3777                 :            : 
    3778                 :            :         page = virt_to_head_page(ptr);
    3779                 :          0 :         if (sz > page_size(page))
    3780                 :            :                 return -EINVAL;
    3781                 :            : 
    3782                 :          0 :         pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
    3783                 :          0 :         return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
    3784                 :            : }
    3785                 :            : 
    3786                 :          0 : SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
    3787                 :            :                 u32, min_complete, u32, flags, const sigset_t __user *, sig,
    3788                 :            :                 size_t, sigsz)
    3789                 :            : {
    3790                 :            :         struct io_ring_ctx *ctx;
    3791                 :            :         long ret = -EBADF;
    3792                 :            :         int submitted = 0;
    3793                 :            :         struct fd f;
    3794                 :            : 
    3795                 :          0 :         if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
    3796                 :            :                 return -EINVAL;
    3797                 :            : 
    3798                 :            :         f = fdget(fd);
    3799                 :          0 :         if (!f.file)
    3800                 :            :                 return -EBADF;
    3801                 :            : 
    3802                 :            :         ret = -EOPNOTSUPP;
    3803                 :          0 :         if (f.file->f_op != &io_uring_fops)
    3804                 :            :                 goto out_fput;
    3805                 :            : 
    3806                 :            :         ret = -ENXIO;
    3807                 :          0 :         ctx = f.file->private_data;
    3808                 :          0 :         if (!percpu_ref_tryget(&ctx->refs))
    3809                 :            :                 goto out_fput;
    3810                 :            : 
    3811                 :            :         /*
    3812                 :            :          * For SQ polling, the thread will do all submissions and completions.
    3813                 :            :          * Just return the requested submit count, and wake the thread if
    3814                 :            :          * we were asked to.
    3815                 :            :          */
    3816                 :            :         ret = 0;
    3817                 :          0 :         if (ctx->flags & IORING_SETUP_SQPOLL) {
    3818                 :          0 :                 if (flags & IORING_ENTER_SQ_WAKEUP)
    3819                 :          0 :                         wake_up(&ctx->sqo_wait);
    3820                 :          0 :                 submitted = to_submit;
    3821                 :          0 :         } else if (to_submit) {
    3822                 :          0 :                 to_submit = min(to_submit, ctx->sq_entries);
    3823                 :            : 
    3824                 :          0 :                 mutex_lock(&ctx->uring_lock);
    3825                 :          0 :                 submitted = io_ring_submit(ctx, to_submit);
    3826                 :          0 :                 mutex_unlock(&ctx->uring_lock);
    3827                 :            : 
    3828                 :          0 :                 if (submitted != to_submit)
    3829                 :            :                         goto out;
    3830                 :            :         }
    3831                 :          0 :         if (flags & IORING_ENTER_GETEVENTS) {
    3832                 :          0 :                 unsigned nr_events = 0;
    3833                 :            : 
    3834                 :          0 :                 min_complete = min(min_complete, ctx->cq_entries);
    3835                 :            : 
    3836                 :          0 :                 if (ctx->flags & IORING_SETUP_IOPOLL) {
    3837                 :          0 :                         ret = io_iopoll_check(ctx, &nr_events, min_complete);
    3838                 :            :                 } else {
    3839                 :          0 :                         ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
    3840                 :            :                 }
    3841                 :            :         }
    3842                 :            : 
    3843                 :            : out:
    3844                 :            :         percpu_ref_put(&ctx->refs);
    3845                 :            : out_fput:
    3846                 :            :         fdput(f);
    3847                 :          0 :         return submitted ? submitted : ret;
    3848                 :            : }
    3849                 :            : 
    3850                 :            : static const struct file_operations io_uring_fops = {
    3851                 :            :         .release        = io_uring_release,
    3852                 :            :         .flush          = io_uring_flush,
    3853                 :            :         .mmap           = io_uring_mmap,
    3854                 :            :         .poll           = io_uring_poll,
    3855                 :            :         .fasync         = io_uring_fasync,
    3856                 :            : };
    3857                 :            : 
    3858                 :          0 : static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
    3859                 :            :                                   struct io_uring_params *p)
    3860                 :            : {
    3861                 :            :         struct io_rings *rings;
    3862                 :            :         size_t size, sq_array_offset;
    3863                 :            : 
    3864                 :            :         /* make sure these are sane, as we already accounted them */
    3865                 :          0 :         ctx->sq_entries = p->sq_entries;
    3866                 :          0 :         ctx->cq_entries = p->cq_entries;
    3867                 :            : 
    3868                 :          0 :         size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
    3869                 :          0 :         if (size == SIZE_MAX)
    3870                 :            :                 return -EOVERFLOW;
    3871                 :            : 
    3872                 :          0 :         rings = io_mem_alloc(size);
    3873                 :          0 :         if (!rings)
    3874                 :            :                 return -ENOMEM;
    3875                 :            : 
    3876                 :          0 :         ctx->rings = rings;
    3877                 :          0 :         ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
    3878                 :          0 :         rings->sq_ring_mask = p->sq_entries - 1;
    3879                 :          0 :         rings->cq_ring_mask = p->cq_entries - 1;
    3880                 :          0 :         rings->sq_ring_entries = p->sq_entries;
    3881                 :          0 :         rings->cq_ring_entries = p->cq_entries;
    3882                 :          0 :         ctx->sq_mask = rings->sq_ring_mask;
    3883                 :          0 :         ctx->cq_mask = rings->cq_ring_mask;
    3884                 :            : 
    3885                 :          0 :         size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
    3886                 :          0 :         if (size == SIZE_MAX) {
    3887                 :          0 :                 io_mem_free(ctx->rings);
    3888                 :          0 :                 ctx->rings = NULL;
    3889                 :          0 :                 return -EOVERFLOW;
    3890                 :            :         }
    3891                 :            : 
    3892                 :          0 :         ctx->sq_sqes = io_mem_alloc(size);
    3893                 :          0 :         if (!ctx->sq_sqes) {
    3894                 :          0 :                 io_mem_free(ctx->rings);
    3895                 :          0 :                 ctx->rings = NULL;
    3896                 :          0 :                 return -ENOMEM;
    3897                 :            :         }
    3898                 :            : 
    3899                 :            :         return 0;
    3900                 :            : }
    3901                 :            : 
    3902                 :            : /*
    3903                 :            :  * Allocate an anonymous fd, this is what constitutes the application
    3904                 :            :  * visible backing of an io_uring instance. The application mmaps this
    3905                 :            :  * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
    3906                 :            :  * we have to tie this fd to a socket for file garbage collection purposes.
    3907                 :            :  */
    3908                 :          0 : static int io_uring_get_fd(struct io_ring_ctx *ctx)
    3909                 :            : {
    3910                 :            :         struct file *file;
    3911                 :            :         int ret;
    3912                 :            : 
    3913                 :            : #if defined(CONFIG_UNIX)
    3914                 :          0 :         ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
    3915                 :            :                                 &ctx->ring_sock);
    3916                 :          0 :         if (ret)
    3917                 :            :                 return ret;
    3918                 :            : #endif
    3919                 :            : 
    3920                 :          0 :         ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
    3921                 :          0 :         if (ret < 0)
    3922                 :            :                 goto err;
    3923                 :            : 
    3924                 :          0 :         file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
    3925                 :            :                                         O_RDWR | O_CLOEXEC);
    3926                 :          0 :         if (IS_ERR(file)) {
    3927                 :          0 :                 put_unused_fd(ret);
    3928                 :            :                 ret = PTR_ERR(file);
    3929                 :          0 :                 goto err;
    3930                 :            :         }
    3931                 :            : 
    3932                 :            : #if defined(CONFIG_UNIX)
    3933                 :          0 :         ctx->ring_sock->file = file;
    3934                 :          0 :         ctx->ring_sock->sk->sk_user_data = ctx;
    3935                 :            : #endif
    3936                 :          0 :         fd_install(ret, file);
    3937                 :          0 :         return ret;
    3938                 :            : err:
    3939                 :            : #if defined(CONFIG_UNIX)
    3940                 :          0 :         sock_release(ctx->ring_sock);
    3941                 :          0 :         ctx->ring_sock = NULL;
    3942                 :            : #endif
    3943                 :          0 :         return ret;
    3944                 :            : }
    3945                 :            : 
    3946                 :          0 : static int io_uring_create(unsigned entries, struct io_uring_params *p)
    3947                 :            : {
    3948                 :            :         struct user_struct *user = NULL;
    3949                 :            :         struct io_ring_ctx *ctx;
    3950                 :            :         bool account_mem;
    3951                 :            :         int ret;
    3952                 :            : 
    3953                 :          0 :         if (!entries || entries > IORING_MAX_ENTRIES)
    3954                 :            :                 return -EINVAL;
    3955                 :            : 
    3956                 :            :         /*
    3957                 :            :          * Use twice as many entries for the CQ ring. It's possible for the
    3958                 :            :          * application to drive a higher depth than the size of the SQ ring,
    3959                 :            :          * since the sqes are only used at submission time. This allows for
    3960                 :            :          * some flexibility in overcommitting a bit.
    3961                 :            :          */
    3962                 :          0 :         p->sq_entries = roundup_pow_of_two(entries);
    3963                 :          0 :         p->cq_entries = 2 * p->sq_entries;
    3964                 :            : 
    3965                 :          0 :         user = get_uid(current_user());
    3966                 :          0 :         account_mem = !capable(CAP_IPC_LOCK);
    3967                 :            : 
    3968                 :          0 :         if (account_mem) {
    3969                 :          0 :                 ret = io_account_mem(user,
    3970                 :            :                                 ring_pages(p->sq_entries, p->cq_entries));
    3971                 :          0 :                 if (ret) {
    3972                 :          0 :                         free_uid(user);
    3973                 :          0 :                         return ret;
    3974                 :            :                 }
    3975                 :            :         }
    3976                 :            : 
    3977                 :          0 :         ctx = io_ring_ctx_alloc(p);
    3978                 :          0 :         if (!ctx) {
    3979                 :          0 :                 if (account_mem)
    3980                 :          0 :                         io_unaccount_mem(user, ring_pages(p->sq_entries,
    3981                 :            :                                                                 p->cq_entries));
    3982                 :          0 :                 free_uid(user);
    3983                 :          0 :                 return -ENOMEM;
    3984                 :            :         }
    3985                 :          0 :         ctx->compat = in_compat_syscall();
    3986                 :          0 :         ctx->account_mem = account_mem;
    3987                 :          0 :         ctx->user = user;
    3988                 :            : 
    3989                 :          0 :         ctx->creds = get_current_cred();
    3990                 :          0 :         if (!ctx->creds) {
    3991                 :            :                 ret = -ENOMEM;
    3992                 :            :                 goto err;
    3993                 :            :         }
    3994                 :            : 
    3995                 :          0 :         ret = io_allocate_scq_urings(ctx, p);
    3996                 :          0 :         if (ret)
    3997                 :            :                 goto err;
    3998                 :            : 
    3999                 :          0 :         ret = io_sq_offload_start(ctx, p);
    4000                 :          0 :         if (ret)
    4001                 :            :                 goto err;
    4002                 :            : 
    4003                 :          0 :         memset(&p->sq_off, 0, sizeof(p->sq_off));
    4004                 :          0 :         p->sq_off.head = offsetof(struct io_rings, sq.head);
    4005                 :          0 :         p->sq_off.tail = offsetof(struct io_rings, sq.tail);
    4006                 :          0 :         p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
    4007                 :          0 :         p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
    4008                 :          0 :         p->sq_off.flags = offsetof(struct io_rings, sq_flags);
    4009                 :          0 :         p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
    4010                 :          0 :         p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
    4011                 :            : 
    4012                 :          0 :         memset(&p->cq_off, 0, sizeof(p->cq_off));
    4013                 :          0 :         p->cq_off.head = offsetof(struct io_rings, cq.head);
    4014                 :          0 :         p->cq_off.tail = offsetof(struct io_rings, cq.tail);
    4015                 :          0 :         p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
    4016                 :          0 :         p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
    4017                 :          0 :         p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
    4018                 :          0 :         p->cq_off.cqes = offsetof(struct io_rings, cqes);
    4019                 :            : 
    4020                 :            :         /*
    4021                 :            :          * Install ring fd as the very last thing, so we don't risk someone
    4022                 :            :          * having closed it before we finish setup
    4023                 :            :          */
    4024                 :          0 :         ret = io_uring_get_fd(ctx);
    4025                 :          0 :         if (ret < 0)
    4026                 :            :                 goto err;
    4027                 :            : 
    4028                 :          0 :         p->features = IORING_FEAT_SINGLE_MMAP;
    4029                 :          0 :         return ret;
    4030                 :            : err:
    4031                 :          0 :         io_ring_ctx_wait_and_kill(ctx);
    4032                 :          0 :         return ret;
    4033                 :            : }
    4034                 :            : 
    4035                 :            : /*
    4036                 :            :  * Sets up an aio uring context, and returns the fd. Applications asks for a
    4037                 :            :  * ring size, we return the actual sq/cq ring sizes (among other things) in the
    4038                 :            :  * params structure passed in.
    4039                 :            :  */
    4040                 :          0 : static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
    4041                 :            : {
    4042                 :            :         struct io_uring_params p;
    4043                 :            :         long ret;
    4044                 :            :         int i;
    4045                 :            : 
    4046                 :          0 :         if (copy_from_user(&p, params, sizeof(p)))
    4047                 :            :                 return -EFAULT;
    4048                 :          0 :         for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
    4049                 :          0 :                 if (p.resv[i])
    4050                 :            :                         return -EINVAL;
    4051                 :            :         }
    4052                 :            : 
    4053                 :          0 :         if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
    4054                 :            :                         IORING_SETUP_SQ_AFF))
    4055                 :            :                 return -EINVAL;
    4056                 :            : 
    4057                 :          0 :         ret = io_uring_create(entries, &p);
    4058                 :          0 :         if (ret < 0)
    4059                 :            :                 return ret;
    4060                 :            : 
    4061                 :          0 :         if (copy_to_user(params, &p, sizeof(p)))
    4062                 :            :                 return -EFAULT;
    4063                 :            : 
    4064                 :          0 :         return ret;
    4065                 :            : }
    4066                 :            : 
    4067                 :          0 : SYSCALL_DEFINE2(io_uring_setup, u32, entries,
    4068                 :            :                 struct io_uring_params __user *, params)
    4069                 :            : {
    4070                 :          0 :         return io_uring_setup(entries, params);
    4071                 :            : }
    4072                 :            : 
    4073                 :          0 : static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
    4074                 :            :                                void __user *arg, unsigned nr_args)
    4075                 :            :         __releases(ctx->uring_lock)
    4076                 :            :         __acquires(ctx->uring_lock)
    4077                 :            : {
    4078                 :            :         int ret;
    4079                 :            : 
    4080                 :            :         /*
    4081                 :            :          * We're inside the ring mutex, if the ref is already dying, then
    4082                 :            :          * someone else killed the ctx or is already going through
    4083                 :            :          * io_uring_register().
    4084                 :            :          */
    4085                 :          0 :         if (percpu_ref_is_dying(&ctx->refs))
    4086                 :            :                 return -ENXIO;
    4087                 :            : 
    4088                 :          0 :         percpu_ref_kill(&ctx->refs);
    4089                 :            : 
    4090                 :            :         /*
    4091                 :            :          * Drop uring mutex before waiting for references to exit. If another
    4092                 :            :          * thread is currently inside io_uring_enter() it might need to grab
    4093                 :            :          * the uring_lock to make progress. If we hold it here across the drain
    4094                 :            :          * wait, then we can deadlock. It's safe to drop the mutex here, since
    4095                 :            :          * no new references will come in after we've killed the percpu ref.
    4096                 :            :          */
    4097                 :          0 :         mutex_unlock(&ctx->uring_lock);
    4098                 :          0 :         wait_for_completion(&ctx->ctx_done);
    4099                 :          0 :         mutex_lock(&ctx->uring_lock);
    4100                 :            : 
    4101                 :          0 :         switch (opcode) {
    4102                 :            :         case IORING_REGISTER_BUFFERS:
    4103                 :          0 :                 ret = io_sqe_buffer_register(ctx, arg, nr_args);
    4104                 :          0 :                 break;
    4105                 :            :         case IORING_UNREGISTER_BUFFERS:
    4106                 :            :                 ret = -EINVAL;
    4107                 :          0 :                 if (arg || nr_args)
    4108                 :            :                         break;
    4109                 :          0 :                 ret = io_sqe_buffer_unregister(ctx);
    4110                 :          0 :                 break;
    4111                 :            :         case IORING_REGISTER_FILES:
    4112                 :          0 :                 ret = io_sqe_files_register(ctx, arg, nr_args);
    4113                 :          0 :                 break;
    4114                 :            :         case IORING_UNREGISTER_FILES:
    4115                 :            :                 ret = -EINVAL;
    4116                 :          0 :                 if (arg || nr_args)
    4117                 :            :                         break;
    4118                 :          0 :                 ret = io_sqe_files_unregister(ctx);
    4119                 :          0 :                 break;
    4120                 :            :         case IORING_REGISTER_EVENTFD:
    4121                 :            :                 ret = -EINVAL;
    4122                 :          0 :                 if (nr_args != 1)
    4123                 :            :                         break;
    4124                 :          0 :                 ret = io_eventfd_register(ctx, arg);
    4125                 :          0 :                 break;
    4126                 :            :         case IORING_UNREGISTER_EVENTFD:
    4127                 :            :                 ret = -EINVAL;
    4128                 :          0 :                 if (arg || nr_args)
    4129                 :            :                         break;
    4130                 :            :                 ret = io_eventfd_unregister(ctx);
    4131                 :          0 :                 break;
    4132                 :            :         default:
    4133                 :            :                 ret = -EINVAL;
    4134                 :            :                 break;
    4135                 :            :         }
    4136                 :            : 
    4137                 :            :         /* bring the ctx back to life */
    4138                 :            :         reinit_completion(&ctx->ctx_done);
    4139                 :          0 :         percpu_ref_reinit(&ctx->refs);
    4140                 :          0 :         return ret;
    4141                 :            : }
    4142                 :            : 
    4143                 :          0 : SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
    4144                 :            :                 void __user *, arg, unsigned int, nr_args)
    4145                 :            : {
    4146                 :            :         struct io_ring_ctx *ctx;
    4147                 :            :         long ret = -EBADF;
    4148                 :            :         struct fd f;
    4149                 :            : 
    4150                 :            :         f = fdget(fd);
    4151                 :          0 :         if (!f.file)
    4152                 :            :                 return -EBADF;
    4153                 :            : 
    4154                 :            :         ret = -EOPNOTSUPP;
    4155                 :          0 :         if (f.file->f_op != &io_uring_fops)
    4156                 :            :                 goto out_fput;
    4157                 :            : 
    4158                 :          0 :         ctx = f.file->private_data;
    4159                 :            : 
    4160                 :          0 :         mutex_lock(&ctx->uring_lock);
    4161                 :          0 :         ret = __io_uring_register(ctx, opcode, arg, nr_args);
    4162                 :          0 :         mutex_unlock(&ctx->uring_lock);
    4163                 :            : out_fput:
    4164                 :            :         fdput(f);
    4165                 :          0 :         return ret;
    4166                 :            : }
    4167                 :            : 
    4168                 :          3 : static int __init io_uring_init(void)
    4169                 :            : {
    4170                 :          3 :         req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
    4171                 :          3 :         return 0;
    4172                 :            : };
    4173                 :            : __initcall(io_uring_init);

Generated by: LCOV version 1.14